Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Sign in / Register
Toggle navigation
H
HWPX_parser
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
leeyena
HWPX_parser
Commits
c9143d32
Commit
c9143d32
authored
Nov 06, 2024
by
leeyena
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
241106
parent
cf911f9e
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
82 deletions
+13
-82
test05.py
hwpx/test05.py
+13
-82
No files found.
hwpx/test05.py
View file @
c9143d32
...
...
@@ -38,14 +38,9 @@ class print_xml :
data_file
=
open
(
xml_file_path
,
'r'
,
encoding
=
'utf-8'
)
soup
=
BeautifulSoup
(
data_file
,
"xml"
)
self
.
parsing
(
soup
,
extract_path
)
#soup_xml = soup.prettify()
#with open(xml_file_path,'rb') as f:
# xml_content = f.read()
#root = etree.XML(xml_content)
#pretty_xml_string = etree.tostring(root,pretty_print=True).decode('utf-8')
'''
xml 원본 파일 text 로 확인하기
'''
'''
with open(f"fine{i}.txt", "w",encoding='utf-8') as file:
file.write(soup.prettify())
...
...
@@ -53,20 +48,6 @@ class print_xml :
'''
return
True
'''
xml_dict = xmltodict.parse(xml_content)
json_data = json.dumps(xml_dict, indent=4)
# JSON 데이터를 파일로 저장
with open('output.json', 'w') as json_file:
json.dump(xml_dict, json_file, indent=4)
print("JSON 파일로 저장 완료: output.json")
'''
#print(pretty_xml_string)
#exit()
#return self.parsing(soup,extract_path)
def
table
(
self
,
tbl
,
zip_file_path
)
:
max
=
0
...
...
@@ -110,57 +91,32 @@ class print_xml :
continue
elif
t
.
text
is
None
:
continue
#text += t.text
if
run
.
find
(
"hp:tbl"
)
:
data
=
self
.
table
(
run
,
zip_file_path
)
make_table
[
int
(
row
)][
int
(
col
)]
.
append
(
data
)
#print(text)
span
=
tc
.
find
(
'hp:cellSpan'
)
colSpan
=
span
.
get
(
'colSpan'
)
rowSpan
=
span
.
get
(
'rowSpan'
)
#make_table = make_table.tolist()
if
rowSpan
!=
"1"
:
for
i
in
range
(
0
,
int
(
rowSpan
)):
make_table
[
int
(
row
)
+
i
][
int
(
col
)]
.
append
(
t
.
text
)
else
:
make_table
[
int
(
row
)][
int
(
col
)]
.
append
(
t
.
text
)
''' cellSpan을 구해서 같은 머라해야하노.. 세부분야?로 나눠져 있는 것을 넣어준다'''
'''if colSpan != "1":
for i in range(0,int(colSpan)):
make_table[int(row)][int(col)+i] = text
'''
cellSpan을 구해서 셀 병합 처리
'''
#make_table = make_table.tolist()
data
=
{
"type"
:
"table"
,
"content"
:
make_table
,
}
tbl
.
decompose
()
#self.obj_list.append(data)
return
data
'''
for i in make_table[0]:
if i is None :
print(i)
'''
'''
data = {
"type": "table",
"content": make_table,
}
'''
'''
self.obj_list.append(data)
return make_table
'''
def
text
(
self
,
p
)
:
text_list
=
[]
...
...
@@ -176,24 +132,15 @@ class print_xml :
else
:
continue
text
+=
''
.
join
(
t
.
stripped_strings
)
if
text
.
strip
()
:
text_list
.
append
(
text
)
#print(text)
#text_list.append(text)
#for i in p.find_all("hp:t"):
#text_array = np.array(text)
#text_array = text_array.tolist()
data
=
{
"type"
:
"text"
,
"content"
:
text_list
}
#print(data)
self
.
obj_list
.
append
(
data
)
...
...
@@ -222,22 +169,12 @@ class print_xml :
data
=
self
.
image
(
tag_p
,
zip_file_path
)
self
.
obj_list
.
append
(
data
)
#elif tag_p.find_all('hp:drawText'):
# self.drawText(tag_p)
elif
tag_p
.
find_all
(
'hp:t'
):
if
not
tag_p
.
find_parent
(
'hp:tbl'
):
self
.
text
(
tag_p
)
#exit()
# for run in tag_p.findall('./hp:run',self.np):
# for tag in run :
# if tag.tag == f'{{{self.np["hp"]}}}t':
# self.text(tag)
# elif tag.tag == f'{{{self.np["hp"]}}}tbl':
# self.table(tag)
# elif tag.tag == f'{{{self.np["hp"]}}}pic':
# self.image(tag,zip_file_path)
def
add
(
self
,
list
):
result
=
[]
...
...
@@ -247,12 +184,6 @@ class print_xml :
if
not
item
[
"content"
]
or
item
[
"content"
]
==
[[]]:
continue
# 빈 content일 경우 추가하지 않고 건너뜀
current_text
.
append
(
item
[
"content"
])
#elif item["type"] == "table":
# for i in item["content"]:
# print(i)
#for j in range(len(i)):
# if i[j]
#exit()
elif
item
[
"type"
]
!=
"text"
:
if
current_text
:
result
.
append
({
"type"
:
"text"
,
"content"
:
current_text
})
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment