Commit cf911f9e authored by leeyena's avatar leeyena

241106

parent 930d5e2e
import xml.etree.ElementTree as ET
import os
import zipfile
from lxml import etree
import numpy as np
import shutil
import json
import xmltodict
from bs4 import BeautifulSoup
class print_xml:
    """Extract text, tables and image references from a HWPX document.

    A HWPX file is a zip archive whose body lives in
    ``Contents/section<N>.xml``.  ``get_xml`` unpacks the archive, parses
    every section with BeautifulSoup and appends one dict per extracted
    element to ``self.obj_list``.  Each dict has the shape
    ``{"type": "text" | "table" | "image", "content": ...}``.
    ``add`` then merges runs of consecutive text items.

    NOTE: ``hp:drawText`` (text box) content is not extracted; an earlier
    draft of that feature was disabled in the original source.
    """

    def __init__(self):
        # XML namespace maps for the paragraph and core schemas.  The
        # BeautifulSoup-based methods below address tags by their prefixed
        # name ("hp:t", "hc:img") and do not consult these maps.
        self.np = {'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph'}
        self.hc = {'hc': 'http://www.hancom.co.kr/hwpml/2011/core'}
        # Flat, document-ordered list of extracted items.
        self.obj_list = []

    def get_xml(self, hwpx_file):
        """Unpack *hwpx_file* and parse every body section it contains.

        The archive is extracted into a ``zip_file`` directory next to the
        document, because image items reference files under its ``BinData``
        folder.  Sections are read in order (section0, section1, ...) until
        the first missing index.

        Args:
            hwpx_file: Path to the ``.hwpx`` document.

        Returns:
            True when at least one section was parsed, False when the
            archive contains no ``Contents/section0.xml``.
        """
        extract_path = os.path.join(os.path.dirname(hwpx_file), "zip_file")

        # zipfile does not care about the file extension, so the document is
        # opened directly instead of being copied to a ".zip" sibling first.
        with zipfile.ZipFile(hwpx_file, 'r') as zf:
            members = set(zf.namelist())
            zf.extractall(extract_path)

        # Discover sections from the archive itself rather than from disk, so
        # leftovers from an earlier extraction into the same directory are
        # never mistaken for part of this document.
        section_ids = []
        index = 0
        while f"Contents/section{index}.xml" in members:
            section_ids.append(index)
            index += 1
        if not section_ids:
            return False

        for index in section_ids:
            xml_file_path = os.path.join(
                extract_path, "Contents", f"section{index}.xml"
            )
            with open(xml_file_path, 'r', encoding='utf-8') as data_file:
                soup = BeautifulSoup(data_file, "xml")
            self.parsing(soup, extract_path)
        return True

    @staticmethod
    def _row_span(tc):
        """Return the rowSpan of cell *tc*, defaulting to 1 when unavailable."""
        # Use the last hp:cellSpan, mirroring how the cell address is looked
        # up: a cell that still contains a nested table also contains that
        # table's spans.  NOTE(review): this assumes the cell's own
        # cellAddr/cellSpan follow its content in document order -- confirm
        # against the HWPX schema.
        spans = tc.find_all('hp:cellSpan')
        value = spans[-1].get('rowSpan') if spans else None
        try:
            return int(value)
        except (TypeError, ValueError):
            return 1

    def table(self, tbl, zip_file_path):
        """Convert the table(s) found under *tbl* into a grid of cell contents.

        The grid has one row per ``hp:tr`` and ``max(colAddr) + 1`` columns;
        every cell is a list.  For each run inside a cell the method appends,
        in this order: an image item (if the run holds ``hc:img``), a nested
        table item (if it holds ``hp:tbl``) and the run's text.  Text is
        repeated in every row covered by the cell's ``rowSpan``.  Runs with
        no ``hp:t`` or only blank text contribute an image at most.

        Side effects: *tbl* is removed from the document tree with
        ``decompose()`` so its paragraphs are not emitted again as plain
        text, and ``image`` appends a text item to ``self.obj_list``.

        NOTE(review): all lookups are recursive, so rows and cells of nested
        tables that have not been decomposed yet are also visited here.
        That traversal order is kept exactly as in the original code.

        Args:
            tbl: Tag containing the ``hp:tr`` rows (a paragraph or a run).
            zip_file_path: Extraction directory, forwarded to ``image``.

        Returns:
            ``{"type": "table", "content": grid}``.
        """
        rows = tbl.find_all('hp:tr')
        last_col = max(
            (
                int(addr.get("colAddr"))
                for tr in rows
                for addr in tr.find_all('hp:cellAddr')
            ),
            default=0,
        )
        # Every cell gets its own list; no NumPy round-trip is needed.
        make_table = [
            [[] for _ in range(last_col + 1)] for _ in range(len(rows))
        ]

        for tr in rows:
            for tc in tr.find_all('hp:tc'):
                runs = tc.find_all('hp:run')
                addrs = tc.find_all("hp:cellAddr")
                if not runs or not addrs:
                    # Nothing to place, or no address to place it at (also
                    # the case for cells already destroyed by a nested call).
                    continue
                # The last address is taken as this cell's own address.
                row = int(addrs[-1].get("rowAddr"))
                col = int(addrs[-1].get("colAddr"))
                row_span = self._row_span(tc)

                for run in runs:
                    t = run.find("hp:t")
                    if run.find("hc:img"):
                        make_table[row][col].append(
                            self.image(run, zip_file_path)
                        )
                    if t is None:
                        continue
                    if not t.text.replace('\n', "").replace(" ", ""):
                        continue
                    if run.find("hp:tbl"):
                        make_table[row][col].append(
                            self.table(run, zip_file_path)
                        )
                    # Repeat the text down every row the cell spans, staying
                    # inside the grid.  The text is read after the nested
                    # table call on purpose, to keep the original ordering.
                    last_row = min(row + row_span, len(make_table))
                    for target in range(row, last_row):
                        make_table[target][col].append(t.text)

        data = {
            "type": "table",
            "content": make_table,
        }
        tbl.decompose()
        return data

    def text(self, p):
        """Append the concatenated text of *p* to ``self.obj_list``.

        All non-blank ``hp:t`` descendants are stripped and joined into one
        string.  The appended item is ``{"type": "text", "content": [s]}``,
        or has an empty ``content`` list when there is no visible text
        (``add`` drops such items later).
        """
        pieces = []
        for t in p.find_all("hp:t"):
            raw = t.text
            # Skip nodes that hold nothing but spaces and newlines.
            if raw is None or not raw.replace('\n', "").replace(" ", ""):
                continue
            pieces.append(''.join(t.stripped_strings))

        text = ''.join(pieces)
        text_list = [text] if text.strip() else []
        self.obj_list.append({
            "type": "text",
            "content": text_list,
        })

    def image(self, p, zip_file_path):
        """Return an image item for the last ``hc:img`` found under *p*.

        The text of *p* is first recorded through ``text`` (which appends
        to ``self.obj_list``).  The item's content is
        ``<zip_file_path>/BinData/<binaryItemIDRef>``; when no usable
        reference exists the path ends at the ``BinData`` directory.

        Args:
            p: Tag to search for ``hc:img`` elements.
            zip_file_path: Directory the archive was extracted into.

        Returns:
            ``{"type": "image", "content": path}``.
        """
        self.text(p)
        item_id = ""
        for img in p.find_all('hc:img'):
            ref = img.get('binaryItemIDRef')
            # A missing attribute would make os.path.join raise TypeError.
            if ref is not None:
                item_id = ref
        image_path = os.path.join(zip_file_path, 'BinData', item_id)
        return {
            "type": "image",
            "content": image_path,
        }

    def parsing(self, root, zip_file_path):
        """Walk every ``hp:p`` paragraph of *root* and collect its content.

        A paragraph is handled as a table if it contains ``hp:tbl``, else as
        an image if it contains ``hp:pic``, else as text if it contains
        ``hp:t`` and is not itself located inside a table.  Results are
        appended to ``self.obj_list`` in document order.
        """
        for tag_p in root.find_all('hp:p'):
            if tag_p.find_all('hp:tbl'):
                self.obj_list.append(self.table(tag_p, zip_file_path))
            elif tag_p.find_all('hp:pic'):
                self.obj_list.append(self.image(tag_p, zip_file_path))
            elif tag_p.find_all('hp:t'):
                if not tag_p.find_parent('hp:tbl'):
                    self.text(tag_p)

    def add(self, list):
        """Merge consecutive text items of *list* into single text blocks.

        Text items with empty content (``[]`` or ``[[]]``) are dropped.  The
        contents of neighbouring text items are gathered, in order, into one
        ``{"type": "text", "content": [content, ...]}`` item.  Non-text
        items are passed through unchanged and close the current block.

        Args:
            list: Items as produced in ``self.obj_list`` (the parameter name
                is kept for backward compatibility).

        Returns:
            A new list; the input is not modified.
        """
        result = []
        current_text = []
        for item in list:
            if item["type"] == "text":
                content = item["content"]
                if content and content != [[]]:
                    current_text.append(content)
                continue
            if current_text:
                result.append({"type": "text", "content": current_text})
                current_text = []
            result.append(item)
        if current_text:
            result.append({"type": "text", "content": current_text})
        return result
if __name__ == "__main__":
    # Location of the HWPX document to convert (the "test_final" sample).
    source_path = "/Users/iyena/Documents/leeyena/hwpx/final_test.hwpx"

    extractor = print_xml()
    parsed_ok = extractor.get_xml(source_path)
    merged_items = extractor.add(extractor.obj_list)

    # Only write the JSON dump when parsing reported success.
    if parsed_ok:
        serialized = json.dumps(merged_items, ensure_ascii=False, indent=4)
        with open("final_test2.json", "w", encoding="utf-8") as out_file:
            out_file.write(serialized)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment