-import numpy as np
-from docx import Document
-from docx.opc.constants import RELATIONSHIP_TYPE as RT
-from docx.oxml.ns import qn
-import zipfile
-from bs4 import BeautifulSoup
-
-
-def iter_hyperlink_rels(rels):
-    for rel in rels:
-        if rels[rel].reltype == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink':
-            yield rels[rel]._target
-
-
-def find_hyperlink_indoc(doc):
-    '''
-    :param doc: doc file get by doc = Document('./xxxx.docx')
-    :return: a list of all hyperlink item in doc.
-    '''
-    xml_e = doc.element
-    hyperlink_list = xml_e.findall('.//' + qn("w:hyperlink"))
-    return hyperlink_list
-
-
-def get_hyperlink_text(hyperlink_item):
-    text = hyperlink_item.findall('.//' + qn("w:t"))[0].text
-    if text[0] == '[':
-        text = text.split('[')[1].split(']')[0]
-    return text
-
-
-def get_linked_text(soup):
-    links = []
-
-    # This kind of link has a corresponding URL in the _rel file.
-    for tag in soup.find_all("hyperlink"):
-        # try/except because some hyperlinks have no id.
-        try:
-            links.append({"id": tag["r:id"], "text": tag.text})
-        except:
-            pass
-    return links
-
-
-if __name__ == '__main__':
-    file_name = "D:\\Laboratory\\CUHK\\OneDrive - The Chinese University of Hong Kong\\Documents\\申报书\\擦桌子\\220222CRF1P_Xi_20220720.docx"
-    document = Document(file_name)
-
-    hl_list = find_hyperlink_indoc(document)
-    i = 0
-    text_lst = []
-    for item in hl_list:
-        i += 1
-        print(i, get_hyperlink_text(item))
-        text_lst.append(get_hyperlink_text(item))
-
-    archive = zipfile.ZipFile(file_name, "r")
-    file_data = archive.read("word/document.xml")
-    doc_soup = BeautifulSoup(file_data, "xml")
-    linked_text = get_linked_text(doc_soup)
-
-    rels = document.part.rels
-    i = 0
-    rel_lst = []
-    for rel in rels:
-        if rels[rel].reltype == RT.HYPERLINK:
-            i += 1
-            # print(i, rels[rel]._target)
-            rel_lst.append(rels[rel]._target)
-            for item in linked_text:
-                if item['id'] == rel:
-                    item['url'] = rels[rel]._target
-
-    rel_text = np.array([[linked_text[j]['text'].split('[')[1].split(']')[0], linked_text[j]['url']] if
-                         linked_text[j]['text'][0] == '[' else [linked_text[j]['text'], linked_text[j]['url']] for j in
-                         range(len(linked_text))])
-    # rel_text = np.hstack((np.array(text_lst).reshape((-1, 1)), np.array(rel_lst).reshape((-1, 1))))
-    np.savetxt("rel_text.csv", rel_text, delimiter=",", fmt="%s")
+# import numpy as np
+# from docx import Document
+# from docx.opc.constants import RELATIONSHIP_TYPE as RT
+# from docx.oxml.ns import qn
+# import zipfile
+# from bs4 import BeautifulSoup
+#
+#
+# def iter_hyperlink_rels(rels):
+#     for rel in rels:
+#         if rels[rel].reltype == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink':
+#             yield rels[rel]._target
+#
+#
+# def find_hyperlink_indoc(doc):
+#     '''
+#     :param doc: doc file get by doc = Document('./xxxx.docx')
+#     :return: a list of all hyperlink item in doc.
+#     '''
+#     xml_e = doc.element
+#     hyperlink_list = xml_e.findall('.//' + qn("w:hyperlink"))
+#     return hyperlink_list
+#
+#
+# def get_hyperlink_text(hyperlink_item):
+#     text = hyperlink_item.findall('.//' + qn("w:t"))[0].text
+#     if text[0] == '[':
+#         text = text.split('[')[1].split(']')[0]
+#     return text
+#
+#
+# def get_linked_text(soup):
+#     links = []
+#
+#     # This kind of link has a corresponding URL in the _rel file.
+#     for tag in soup.find_all("hyperlink"):
+#         # try/except because some hyperlinks have no id.
+#         try:
+#             links.append({"id": tag["r:id"], "text": tag.text})
+#         except:
+#             pass
+#     return links
+#
+#
+# if __name__ == '__main__':
+#     file_name = "D:\\Laboratory\\CUHK\\OneDrive - The Chinese University of Hong Kong\\Documents\\申报书\\擦桌子\\220222CRF1P_Xi_20220720.docx"
+#     document = Document(file_name)
+#
+#     hl_list = find_hyperlink_indoc(document)
+#     i = 0
+#     text_lst = []
+#     for item in hl_list:
+#         i += 1
+#         print(i, get_hyperlink_text(item))
+#         text_lst.append(get_hyperlink_text(item))
+#
+#     archive = zipfile.ZipFile(file_name, "r")
+#     file_data = archive.read("word/document.xml")
+#     doc_soup = BeautifulSoup(file_data, "xml")
+#     linked_text = get_linked_text(doc_soup)
+#
+#     rels = document.part.rels
+#     i = 0
+#     rel_lst = []
+#     for rel in rels:
+#         if rels[rel].reltype == RT.HYPERLINK:
+#             i += 1
+#             # print(i, rels[rel]._target)
+#             rel_lst.append(rels[rel]._target)
+#             for item in linked_text:
+#                 if item['id'] == rel:
+#                     item['url'] = rels[rel]._target
+#
+#     rel_text = np.array([[linked_text[j]['text'].split('[')[1].split(']')[0], linked_text[j]['url']] if
+#                          linked_text[j]['text'][0] == '[' else [linked_text[j]['text'], linked_text[j]['url']] for j in
+#                          range(len(linked_text))])
+#     # rel_text = np.hstack((np.array(text_lst).reshape((-1, 1)), np.array(rel_lst).reshape((-1, 1))))
+#     np.savetxt("rel_text.csv", rel_text, delimiter=",", fmt="%s")
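For reference, below is a minimal runnable sketch of what the now commented-out script does, using only python-docx and the standard-library csv module: the BeautifulSoup/zipfile pass and the numpy array are unnecessary once hyperlink relationship ids are resolved through document.part.rels. The helper name extract_hyperlinks, the example.docx input path, and the rel_text.csv output name are illustrative placeholders, and the public target_ref property stands in for the private _target attribute used above.

import csv

from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.oxml.ns import qn


def extract_hyperlinks(docx_path):
    """Return (display_text, url) pairs for every w:hyperlink in the body."""
    document = Document(docx_path)

    # Map relationship id -> external target URL, hyperlink relationships only.
    rels = document.part.rels
    urls = {rid: rels[rid].target_ref
            for rid in rels if rels[rid].reltype == RT.HYPERLINK}

    pairs = []
    for hl in document.element.findall('.//' + qn('w:hyperlink')):
        rid = hl.get(qn('r:id'))  # internal anchor links carry no r:id
        if rid is None:
            continue
        # A hyperlink can span several runs, so join all of its w:t fragments.
        text = ''.join(t.text or '' for t in hl.findall('.//' + qn('w:t')))
        # Keep only the bracketed label when the text looks like "[label] ...".
        if text.startswith('['):
            text = text.split('[')[1].split(']')[0]
        pairs.append((text, urls.get(rid, '')))
    return pairs


if __name__ == '__main__':
    rows = extract_hyperlinks('example.docx')  # placeholder input path
    with open('rel_text.csv', 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)

Resolving the URL per element, in document order, also avoids the original script's implicit assumption that the separately collected text list and relationship list line up one-to-one, and the csv module quotes commas in link text that np.savetxt would have split across columns.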