241106

cf911f9e · leeyena · 930d5e2e · cf911f9e · cf911f9e
Commit cf911f9e authored Nov 06, 2024 by leeyena
Hide whitespace changes
Inline Side-by-side

Showing with 312 additions and 0 deletions

final_test.hwpx hwpx/final_test.hwpx +0 -0

test05.py hwpx/test05.py +312 -0

No files found.
--- a/hwpx/final_test.hwpx
+++ b/hwpx/final_test.hwpx
--- a/hwpx/test05.py
+++ b/hwpx/test05.py
+import xml.etree.ElementTree as ET
+import os
+import zipfile
+from lxml import etree
+import numpy as np
+import shutil
+import json
+import xmltodict
+from bs4 import BeautifulSoup
+
class print_xml:
    """Extract text, table and image records from an HWPX document.

    An ``.hwpx`` file is a zip archive.  ``get_xml`` unpacks it, parses every
    ``Contents/section<N>.xml`` with BeautifulSoup and collects the results in
    ``self.obj_list`` as dictionaries of the form
    ``{"type": "text" | "table" | "image", "content": ...}``.
    """

    def __init__(self):
        # Namespace maps for the paragraph and core schemas.  The
        # BeautifulSoup-based methods below address tags by their prefixed
        # names ("hp:p", "hc:img") and do not read these.
        self.np = {'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph'}
        self.hc = {'hc': 'http://www.hancom.co.kr/hwpml/2011/core'}
        # Records produced by parsing(), in document order.
        self.obj_list = []

    def get_xml(self, hwpx_file):
        """Unpack *hwpx_file* and parse all of its section XML files.

        A ``.zip`` copy of the document is written next to it and extracted
        into a sibling directory named ``zip_file``.  Sections are read in
        order (``section0.xml``, ``section1.xml``, ...) until the first
        missing index.

        Args:
            hwpx_file: Path to the ``.hwpx`` document.

        Returns:
            True when at least one section file was parsed, False when the
            archive contains no ``Contents/section0.xml``.
        """
        # Swap only the real extension; str.replace would also rewrite
        # ".hwpx" appearing in a directory name and would leave the path
        # unchanged for e.g. ".HWPX", making the copy below fail.
        zip_file = os.path.splitext(hwpx_file)[0] + ".zip"
        if os.path.abspath(zip_file) != os.path.abspath(hwpx_file):
            shutil.copy(hwpx_file, zip_file)

        extract_path = os.path.join(os.path.dirname(zip_file), "zip_file")
        with zipfile.ZipFile(zip_file, 'r') as zf:
            zf.extractall(extract_path)

        found_section = False
        index = 0
        while True:
            xml_file_path = os.path.join(
                extract_path, "Contents", f"section{index}.xml"
            )
            if not os.path.exists(xml_file_path):
                break
            # Close the handle as soon as the tree has been built.
            with open(xml_file_path, 'r', encoding='utf-8') as data_file:
                soup = BeautifulSoup(data_file, "xml")
            self.parsing(soup, extract_path)
            found_section = True
            index += 1

        return found_section

    def table(self, tbl, zip_file_path):
        """Convert the table(s) under *tbl* into a grid of cell contents.

        The grid has one row per ``hp:tr`` found under *tbl* and one column
        per column address up to the largest ``colAddr``.  Each grid cell is
        a list that receives, per ``hp:run`` of the cell: an image record
        (when the run holds an ``hc:img``), a nested table record (when the
        run holds an ``hp:tbl``) and the text of the run's first ``hp:t``.
        Text of a cell spanning several rows is repeated in every spanned
        row.  *tbl* is removed from the tree (``decompose``) afterwards.

        Args:
            tbl: Tag containing the table; ``parsing`` passes the enclosing
                ``hp:p``, recursive calls pass the enclosing ``hp:run``.
            zip_file_path: Extraction root, forwarded to ``image``.

        Returns:
            ``{"type": "table", "content": grid}``.
        """
        rows = tbl.find_all('hp:tr')

        # Grid width is driven by the highest column address in the table.
        last_col = 0
        for tr in rows:
            for cell_addr in tr.find_all('hp:cellAddr'):
                raw_col = cell_addr.get("colAddr")
                if raw_col is not None:
                    last_col = max(last_col, int(raw_col))

        # Every cell needs its own list, hence the nested comprehension.
        make_table = [
            [[] for _ in range(last_col + 1)] for _ in range(len(rows))
        ]

        for tr in rows:
            for tc in tr.find_all('hp:tc'):
                for run in tc.find_all('hp:run'):
                    # The last cellAddr is used so that addresses of cells
                    # belonging to a nested table earlier in this cell are
                    # skipped.  Looked up per run because a nested table may
                    # have been removed from the tree by a previous run.
                    addrs = tc.find_all("hp:cellAddr")
                    if not addrs:
                        continue
                    raw_row = addrs[-1].get("rowAddr")
                    raw_col = addrs[-1].get("colAddr")
                    if raw_row is None or raw_col is None:
                        continue
                    row, col = int(raw_row), int(raw_col)

                    if run.find("hc:img") is not None:
                        make_table[row][col].append(
                            self.image(run, zip_file_path)
                        )

                    t = run.find("hp:t")
                    if t is None or t.text is None:
                        continue
                    # Skip runs whose text is only spaces / line breaks.
                    if t.text.replace('\n', "").replace(" ", "") == '':
                        continue

                    # NOTE(review): a nested table is only picked up when the
                    # run also has non-blank text, and the recursive call
                    # removes `run` from the tree before t.text is read
                    # below - confirm this is the intended output.
                    if run.find("hp:tbl") is not None:
                        make_table[row][col].append(
                            self.table(run, zip_file_path)
                        )

                    # Use the last cellSpan for the same reason the last
                    # cellAddr is used; a missing span counts as one row.
                    spans = tc.find_all('hp:cellSpan')
                    raw_span = spans[-1].get('rowSpan') if spans else None
                    row_span = int(raw_span) if raw_span is not None else 1

                    # Repeat the text in every row the cell spans.
                    for offset in range(row_span):
                        make_table[row + offset][col].append(t.text)

        data = {
            "type": "table",
            "content": make_table,
        }
        tbl.decompose()
        return data

    def text(self, p):
        """Append a text record for *p* to ``self.obj_list``.

        The whitespace-stripped strings of every ``hp:t`` under *p* are
        concatenated.  The record's ``content`` is a one-element list with
        that string, or an empty list when nothing but whitespace was found
        (``add`` later drops such empty records).

        Args:
            p: Tag whose ``hp:t`` descendants are read.
        """
        # Whitespace-only hp:t elements yield no stripped strings and thus
        # contribute nothing to the joined result.
        joined = ''.join(
            ''.join(t.stripped_strings) for t in p.find_all("hp:t")
        )
        text_list = [joined] if joined.strip() else []

        self.obj_list.append({
            "type": "text",
            "content": text_list,
        })

    def image(self, p, zip_file_path):
        """Build an image record for the last ``hc:img`` under *p*.

        Also records the text found under *p* via ``text`` (appended to
        ``self.obj_list``) before the image record is returned.

        Args:
            p: Tag containing the picture.
            zip_file_path: Extraction root; images are expected in its
                ``BinData`` sub-directory.

        Returns:
            ``{"type": "image", "content": path}`` where *path* is
            ``<zip_file_path>/BinData/<binaryItemIDRef>``.  When no usable
            reference exists the path points at the ``BinData`` directory.
        """
        self.text(p)

        ref = ""
        for img in p.find_all('hc:img'):
            candidate = img.get('binaryItemIDRef')
            # An absent attribute would make os.path.join raise TypeError.
            if candidate is not None:
                ref = candidate

        image_path = os.path.join(zip_file_path, 'BinData', ref)
        return {
            "type": "image",
            "content": image_path,
        }

    def parsing(self, root, zip_file_path):
        """Walk every ``hp:p`` under *root* and record its content.

        A paragraph is treated as a table if it contains an ``hp:tbl``, else
        as an image if it contains an ``hp:pic``, else as text if it contains
        an ``hp:t`` and is not itself located inside a table.

        Args:
            root: Parsed section document.
            zip_file_path: Extraction root, forwarded to the handlers.
        """
        for tag_p in root.find_all('hp:p'):
            if tag_p.find('hp:tbl') is not None:
                self.obj_list.append(self.table(tag_p, zip_file_path))
            elif tag_p.find('hp:pic') is not None:
                self.obj_list.append(self.image(tag_p, zip_file_path))
            elif tag_p.find('hp:t') is not None:
                if not tag_p.find_parent('hp:tbl'):
                    self.text(tag_p)

    def add(self, list):
        """Merge runs of consecutive text records into single records.

        Text records with empty content are dropped.  The contents of
        neighbouring text records are gathered into one record whose
        ``content`` is a list of the original content lists; every other
        record is passed through unchanged and ends the current run.

        Args:
            list: Records as produced in ``self.obj_list``.  (The name
                shadows the builtin; kept so keyword callers keep working.)

        Returns:
            A new list of records.
        """
        result = []
        current_text = []
        for item in list:
            if item["type"] == "text":
                # Skip records without any content.
                if not item["content"] or item["content"] == [[]]:
                    continue
                current_text.append(item["content"])
            else:
                if current_text:
                    result.append({"type": "text", "content": current_text})
                    current_text = []
                result.append(item)

        if current_text:
            result.append({"type": "text", "content": current_text})

        return result
+          
+         
if __name__ == "__main__":
    import sys

    # Input document and output file.  Both may be given on the command line
    # (argv[1] = .hwpx path, argv[2] = JSON path); the previous hard-coded
    # values remain the defaults.
    hwpx_file = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/Users/iyena/Documents/leeyena/hwpx/final_test.hwpx"
    )
    output_file = sys.argv[2] if len(sys.argv) > 2 else "final_test2.json"

    px = print_xml()
    # Merge and write the records only when get_xml reports success.
    if px.get_xml(hwpx_file):
        merged = px.add(px.obj_list)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged, f, ensure_ascii=False, indent=4)
+
+            
+
+    
+    
+