Commit 2d772c2

Update doc

1 parent 5537226

File tree: 2 files changed (+78, -82 lines)

doc/source/conf.py (-4 lines)

@@ -68,10 +68,6 @@
 # a list of builtin themes.
 #
 html_theme = 'sphinx_rtd_theme'
-html_theme_options = {
-    "rightsidebar": "true",
-    "relbarbgcolor": "black"
-}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
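The removed html_theme_options keys (rightsidebar, relbarbgcolor) belong to Sphinx's classic built-in theme and are not recognised by sphinx_rtd_theme, so the block was dead configuration. Should customisation be reintroduced, a minimal sketch using options that sphinx_rtd_theme does document (the values shown are illustrative assumptions, not part of this commit):

    # Illustrative sketch, not from this commit: options recognised by
    # sphinx_rtd_theme, in case theme customisation is wanted again.
    html_theme = 'sphinx_rtd_theme'
    html_theme_options = {
        'collapse_navigation': False,  # keep sidebar navigation expanded
        'navigation_depth': 4,         # toctree levels shown in the sidebar
    }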

rofunc/utils/file/docxurl2ref.py (+78, -78 lines)

@@ -1,78 +1,78 @@
-import numpy as np
-from docx import Document
-from docx.opc.constants import RELATIONSHIP_TYPE as RT
-from docx.oxml.ns import qn
-import zipfile
-from bs4 import BeautifulSoup
-
-
-def iter_hyperlink_rels(rels):
-    for rel in rels:
-        if rels[rel].reltype == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink':
-            yield rels[rel]._target
-
-
-def find_hyperlink_indoc(doc):
-    '''
-    :param doc: doc file get by doc = Document('./xxxx.docx')
-    :return: a list of all hyperlink item in doc.
-    '''
-    xml_e = doc.element
-    hyperlink_list = xml_e.findall('.//' + qn("w:hyperlink"))
-    return hyperlink_list
-
-
-def get_hyperlink_text(hyperlink_item):
-    text = hyperlink_item.findall('.//' + qn("w:t"))[0].text
-    if text[0] == '[':
-        text = text.split('[')[1].split(']')[0]
-    return text
-
-
-def get_linked_text(soup):
-    links = []
-
-    # This kind of link has a corresponding URL in the _rel file.
-    for tag in soup.find_all("hyperlink"):
-        # try/except because some hyperlinks have no id.
-        try:
-            links.append({"id": tag["r:id"], "text": tag.text})
-        except:
-            pass
-    return links
-
-
-if __name__ == '__main__':
-    file_name = "D:\\Laboratory\\CUHK\\OneDrive - The Chinese University of Hong Kong\\Documents\\申报书\\擦桌子\\220222CRF1P_Xi_20220720.docx"
-    document = Document(file_name)
-
-    hl_list = find_hyperlink_indoc(document)
-    i = 0
-    text_lst = []
-    for item in hl_list:
-        i += 1
-        print(i, get_hyperlink_text(item))
-        text_lst.append(get_hyperlink_text(item))
-
-    archive = zipfile.ZipFile(file_name, "r")
-    file_data = archive.read("word/document.xml")
-    doc_soup = BeautifulSoup(file_data, "xml")
-    linked_text = get_linked_text(doc_soup)
-
-    rels = document.part.rels
-    i = 0
-    rel_lst = []
-    for rel in rels:
-        if rels[rel].reltype == RT.HYPERLINK:
-            i += 1
-            # print(i, rels[rel]._target)
-            rel_lst.append(rels[rel]._target)
-        for item in linked_text:
-            if item['id'] == rel:
-                item['url'] = rels[rel]._target
-
-    rel_text = np.array([[linked_text[j]['text'].split('[')[1].split(']')[0], linked_text[j]['url']] if
-                         linked_text[j]['text'][0] == '[' else [linked_text[j]['text'], linked_text[j]['url']] for j in
-                         range(len(linked_text))])
-    # rel_text = np.hstack((np.array(text_lst).reshape((-1, 1)), np.array(rel_lst).reshape((-1, 1))))
-    np.savetxt("rel_text.csv", rel_text, delimiter=",", fmt="%s")
+# import numpy as np
+# from docx import Document
+# from docx.opc.constants import RELATIONSHIP_TYPE as RT
+# from docx.oxml.ns import qn
+# import zipfile
+# from bs4 import BeautifulSoup
+#
+#
+# def iter_hyperlink_rels(rels):
+#     for rel in rels:
+#         if rels[rel].reltype == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink':
+#             yield rels[rel]._target
+#
+#
+# def find_hyperlink_indoc(doc):
+#     '''
+#     :param doc: doc file get by doc = Document('./xxxx.docx')
+#     :return: a list of all hyperlink item in doc.
+#     '''
+#     xml_e = doc.element
+#     hyperlink_list = xml_e.findall('.//' + qn("w:hyperlink"))
+#     return hyperlink_list
+#
+#
+# def get_hyperlink_text(hyperlink_item):
+#     text = hyperlink_item.findall('.//' + qn("w:t"))[0].text
+#     if text[0] == '[':
+#         text = text.split('[')[1].split(']')[0]
+#     return text
+#
+#
+# def get_linked_text(soup):
+#     links = []
+#
+#     # This kind of link has a corresponding URL in the _rel file.
+#     for tag in soup.find_all("hyperlink"):
+#         # try/except because some hyperlinks have no id.
+#         try:
+#             links.append({"id": tag["r:id"], "text": tag.text})
+#         except:
+#             pass
+#     return links
+#
+#
+# if __name__ == '__main__':
+#     file_name = "D:\\Laboratory\\CUHK\\OneDrive - The Chinese University of Hong Kong\\Documents\\申报书\\擦桌子\\220222CRF1P_Xi_20220720.docx"
+#     document = Document(file_name)
+#
+#     hl_list = find_hyperlink_indoc(document)
+#     i = 0
+#     text_lst = []
+#     for item in hl_list:
+#         i += 1
+#         print(i, get_hyperlink_text(item))
+#         text_lst.append(get_hyperlink_text(item))
+#
+#     archive = zipfile.ZipFile(file_name, "r")
+#     file_data = archive.read("word/document.xml")
+#     doc_soup = BeautifulSoup(file_data, "xml")
+#     linked_text = get_linked_text(doc_soup)
+#
+#     rels = document.part.rels
+#     i = 0
+#     rel_lst = []
+#     for rel in rels:
+#         if rels[rel].reltype == RT.HYPERLINK:
+#             i += 1
+#             # print(i, rels[rel]._target)
+#             rel_lst.append(rels[rel]._target)
+#         for item in linked_text:
+#             if item['id'] == rel:
+#                 item['url'] = rels[rel]._target
+#
+#     rel_text = np.array([[linked_text[j]['text'].split('[')[1].split(']')[0], linked_text[j]['url']] if
+#                          linked_text[j]['text'][0] == '[' else [linked_text[j]['text'], linked_text[j]['url']] for j in
+#                          range(len(linked_text))])
+#     # rel_text = np.hstack((np.array(text_lst).reshape((-1, 1)), np.array(rel_lst).reshape((-1, 1))))
+#     np.savetxt("rel_text.csv", rel_text, delimiter=",", fmt="%s")
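This change comments the script out wholesale rather than deleting it, which keeps the code in the tree for reference while ensuring the hard-coded local .docx path and the numpy/python-docx/bs4 imports never execute. The technique the script was built on is that a .docx stores each hyperlink's URL in the document part's relationships, keyed by the link's r:id; a minimal self-contained sketch of that lookup using only python-docx (the function name and sample path are illustrative, not from this repository):

    # Minimal sketch of the rId -> URL lookup the commented-out script used.
    # `hyperlink_targets` and 'example.docx' are illustrative placeholders.
    from docx import Document
    from docx.opc.constants import RELATIONSHIP_TYPE as RT


    def hyperlink_targets(docx_path):
        """Return {relationship_id: url} for every external hyperlink."""
        rels = Document(docx_path).part.rels  # dict-like: rId -> _Relationship
        return {rid: rel._target              # python-docx keeps the URL here
                for rid, rel in rels.items()
                if rel.reltype == RT.HYPERLINK}


    if __name__ == '__main__':
        for rid, url in hyperlink_targets('example.docx').items():
            print(rid, url)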
