leeyena / HWPX_parser · Commits · cf911f9e

Commit cf911f9e authored Nov 06, 2024 by leeyena

Commit message: 241106
Parent: 930d5e2e

Showing 2 changed files with 312 additions and 0 deletions (+312 / -0)

hwpx/final_test.hwpx (new file, 0 → 100644): +0 / -0. File added.
hwpx/test05.py (new file, 0 → 100644): +312 / -0
import xml.etree.ElementTree as ET
import os
import zipfile
from lxml import etree
import numpy as np
import shutil
import json
import xmltodict
from bs4 import BeautifulSoup


class print_xml:
    def __init__(self):
        # XML namespaces used by HWPX section files
        self.np = {'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph'}
        self.hc = {'hc': 'http://www.hancom.co.kr/hwpml/2011/core'}
        self.obj_list = []

    def get_xml(self, hwpx_file):
        # An .hwpx file is a ZIP container: copy it to .zip and extract it.
        zip_file = hwpx_file.replace(".hwpx", ".zip")
        shutil.copy(hwpx_file, zip_file)
        extract_path = os.path.join(os.path.dirname(zip_file), "zip_file")
        with zipfile.ZipFile(zip_file, 'r') as zf:
            zf.extractall(extract_path)
        # Collect the indices of the Contents/section{i}.xml files present.
        i = 0
        section_num = []
        while True:
            file_name = os.path.join(extract_path, "Contents", f"section{i}.xml")
            if os.path.exists(file_name):
                section_num.append(i)
            else:
                break
            i += 1
        # Parse every section in order.
        for i in range(section_num[-1] + 1):
            xml_file_path = os.path.join(extract_path, "Contents", f"section{i}.xml")
            data_file = open(xml_file_path, 'r', encoding='utf-8')
            soup = BeautifulSoup(data_file, "xml")
            self.parsing(soup, extract_path)
            #soup_xml = soup.prettify()
            #with open(xml_file_path,'rb') as f:
            #    xml_content = f.read()
            #root = etree.XML(xml_content)
            #pretty_xml_string = etree.tostring(root,pretty_print=True).decode('utf-8')
            '''
            with open(f"fine{i}.txt", "w", encoding='utf-8') as file:
                file.write(soup.prettify())
            exit()
            '''
        return True
        '''
        xml_dict = xmltodict.parse(xml_content)
        json_data = json.dumps(xml_dict, indent=4)
        # Save the JSON data to a file
        with open('output.json', 'w') as json_file:
            json.dump(xml_dict, json_file, indent=4)
        print("Saved JSON file: output.json")
        '''
        #print(pretty_xml_string)
        #exit()
        #return self.parsing(soup,extract_path)

    def table(self, tbl, zip_file_path):
        # Find the largest column index to size the table grid.
        max = 0
        for i in tbl.find_all('hp:tr'):
            for j in i.find_all('hp:cellAddr'):
                col_ = j.get("colAddr")
                col_ = int(col_)
                if max > col_:
                    max = max
                else:
                    max = col_
        colnum = max
        rownum = len(tbl.find_all('hp:tr'))
        # Build an empty rownum x (colnum + 1) grid where every cell is a list.
        make_table = np.empty((int(rownum), colnum + 1), dtype=object)
        for i in range(int(rownum)):
            for j in range(colnum + 1):
                make_table[i, j] = []
        make_table = make_table.tolist()
        col = 0
        row = 0
        for tr in tbl.find_all('hp:tr'):
            colnum = len(tr.find_all('hp:subList'))
            for tc in tr.find_all('hp:tc'):
                text = ""
                for run in tc.find_all('hp:run'):
                    addr = tc.find_all("hp:cellAddr")[-1]
                    col = addr.get("colAddr")
                    row = addr.get("rowAddr")
                    t = run.find("hp:t")
                    # Image nested inside a cell
                    if run.find("hc:img"):
                        data = self.image(run, zip_file_path)
                        make_table[int(row)][int(col)].append(data)
                    if t is None:
                        continue
                    else:
                        if t.text is not None:
                            a = t.text.replace('\n', "")
                            a = a.replace(" ", "")
                            if a == '':
                                continue
                        elif t.text is None:
                            continue
                        #text += t.text
                    # Table nested inside a cell
                    if run.find("hp:tbl"):
                        data = self.table(run, zip_file_path)
                        make_table[int(row)][int(col)].append(data)
                    #print(text)
                    span = tc.find('hp:cellSpan')
                    colSpan = span.get('colSpan')
                    rowSpan = span.get('rowSpan')
                    #make_table = make_table.tolist()
                    if rowSpan != "1":
                        # Repeat the text in every row the cell spans.
                        for i in range(0, int(rowSpan)):
                            make_table[int(row) + i][int(col)].append(t.text)
                    else:
                        make_table[int(row)][int(col)].append(t.text)
                    ''' Use cellSpan to fill in cells that are split into sub-sections '''
                    '''if colSpan != "1":
                        for i in range(0, int(colSpan)):
                            make_table[int(row)][int(col) + i] = text
                    '''
        #make_table = make_table.tolist()
        data = {
            "type": "table",
            "content": make_table,
        }
        # Remove the table from the tree so its text is not parsed again.
        tbl.decompose()
        #self.obj_list.append(data)
        return data
        '''
        for i in make_table[0]:
            if i is None:
                print(i)
        '''
        '''
        data = {
            "type": "table",
            "content": make_table,
        }
        '''
        '''
        self.obj_list.append(data)
        return make_table
        '''

    def text(self, p):
        # Collect the visible text of a paragraph into obj_list.
        text_list = []
        text = ''
        for t in p.find_all("hp:t"):
            if t.text is not None:
                a = t.text.replace('\n', "")
                a = a.replace(" ", "")
                if a == '':
                    # Empty string: skip to the next run
                    continue
            else:
                continue
            text += ''.join(t.stripped_strings)
        if text.strip():
            text_list.append(text)
        #print(text)
        #text_list.append(text)
        #for i in p.find_all("hp:t"):
        #text_array = np.array(text)
        #text_array = text_array.tolist()
        data = {
            "type": "text",
            "content": text_list
        }
        #print(data)
        self.obj_list.append(data)

    def image(self, p, zip_file_path):
        # Resolve a hc:img reference to its file inside the extracted BinData folder.
        text = ""
        self.text(p)
        for hc in p.find_all('hc:img'):
            text = hc.get('binaryItemIDRef')
        Bindata_path = os.path.join(zip_file_path, 'BinData')
        image_path = os.path.join(Bindata_path, text)
        data = {
            "type": "image",
            "content": image_path
        }
        return data

    def parsing(self, root, zip_file_path):
        # Dispatch every paragraph to the table / image / text handlers.
        for tag_p in root.find_all('hp:p'):
            if tag_p.find_all('hp:tbl'):
                data = self.table(tag_p, zip_file_path)
                self.obj_list.append(data)
            elif tag_p.find_all('hp:pic'):
                data = self.image(tag_p, zip_file_path)
                self.obj_list.append(data)
            #elif tag_p.find_all('hp:drawText'):
            #    self.drawText(tag_p)
            elif tag_p.find_all('hp:t'):
                # Treat as plain text only if the paragraph is not inside a table.
                if not tag_p.find_parent('hp:tbl'):
                    self.text(tag_p)
        #exit()
        # for run in tag_p.findall('./hp:run', self.np):
        #     for tag in run:
        #         if tag.tag == f'{{{self.np["hp"]}}}t':
        #             self.text(tag)
        #         elif tag.tag == f'{{{self.np["hp"]}}}tbl':
        #             self.table(tag)
        #         elif tag.tag == f'{{{self.np["hp"]}}}pic':
        #             self.image(tag, zip_file_path)

    def add(self, list):
        # Merge consecutive text items into one entry and drop empty ones.
        result = []
        current_text = []
        for item in list:
            if item["type"] == "text":
                if not item["content"] or item["content"] == [[]]:
                    continue  # Skip empty content instead of adding it
                current_text.append(item["content"])
            #elif item["type"] == "table":
            #    for i in item["content"]:
            #        print(i)
            #        for j in range(len(i)):
            #            if i[j]
            #exit()
            elif item["type"] != "text":
                if current_text:
                    result.append({
                        "type": "text",
                        "content": current_text
                    })
                    current_text = []
                result.append(item)
        if current_text:
            result.append({
                "type": "text",
                "content": current_text
            })
        return result

    '''
    def drawText(self, p):
        text = ""
        text_lis = []
        for tr in p.find_all('.//hp:drawText', self.np):
            for tc in tr.find_all('.//hp:run', self.np):
                t = tc.find(".//hp:t", self.np)
                if t == None:
                    continue
                else:
                    if t.text is not None:
                        a = t.text.replace('\n', "")
                        a = a.replace(" ", "")
                        if a == '':
                            continue
                    elif t.text is None:
                        continue
                text = t.text
                text_lis.append(text)
        data = {
            "type": "drawText",
            "content": text_lis
        }
        self.obj_list.append(data)
    '''


if __name__ == "__main__":
    # Set the hwpx file and path
    #test_final
    hwpx_file = "/Users/iyena/Documents/leeyena/hwpx/final_test.hwpx"
    px = print_xml()
    i = False
    i = px.get_xml(hwpx_file)
    reul = px.add(px.obj_list)
    if i:
        with open("final_test2.json", "w", encoding="utf-8") as f:
            json.dump(reul, f, ensure_ascii=False, indent=4)