
Commit a1c4561

committed
Random Forest to address PDF parsing issues. #23 #28 #11 #4 #39
1 parent 6608f58 commit a1c4561

File tree

5 files changed: +183 −153 lines changed

Binary file not shown. (−223 KB)

Binary file not shown.

src/harmony/parsing/pdf_parser.py

+46 −147
@@ -24,32 +24,59 @@
 SOFTWARE.

 '''
+
+import pathlib
 import pickle as pkl
 import re

-import numpy as np
-
 import harmony
+from harmony.parsing.util.feature_extraction import convert_text_to_features
 from harmony.parsing.util.tika_wrapper import parse_pdf_to_plain_text
-# from harmony.parsing.util.tesseract_wrapper import parse_image_pdf_to_plain_text
-# from harmony.parsing.util.camelot_wrapper import parse_pdf_to_tables
 from harmony.schemas.requests.text import RawFile, Instrument

-re_initial_num = re.compile(r'(^\d+)')
-re_initial_num_dot = re.compile(r'(^\d+\.)')
-re_word = re.compile(r'(?i)(\b[\w\']+\b)')
-re_alpha = re.compile(r'(^[a-zA-Z]+)')
-re_bracket = re.compile(r'(?:\(|\))')
-import pathlib
-
 model_containing_folder = pathlib.Path(__file__).parent.resolve()

-with open(f"{model_containing_folder}/rf_table_model.pkl", "rb") as f:
-    rf_table_model = pkl.load(f)
-
-with open(f"{model_containing_folder}/crf_text_model.pkl", "rb") as f:
+with open(f"{model_containing_folder}/20240719_pdf_question_extraction_sklearn_crf_model.pkl", "rb") as f:
     crf_text_model = pkl.load(f)

+
+# Predict method is taken from the training repo. Use the training repo as the master copy of the predict method.
+# All training code is in https://github.com/harmonydata/pdf-questionnaire-extraction
+def predict(test_text):
+    token_texts, token_start_char_indices, token_end_char_indices, token_properties = convert_text_to_features(
+        test_text)
+
+    X = []
+    X.append(token_properties)
+
+    y_pred = crf_text_model.predict(X)
+
+    questions_from_text = []
+
+    tokens_already_used = set()
+
+    last_token_category = "O"
+
+    for idx in range(len(X[0])):
+
+        if y_pred[0][idx] != "O" and idx not in tokens_already_used:
+            if last_token_category == "O" or y_pred[0][idx] == "B":
+                start_idx = token_start_char_indices[idx]
+                end_idx = len(test_text)
+                for j in range(idx + 1, len(X[0])):
+                    if y_pred[0][j] == "O" or y_pred[0][j] == "B":
+                        end_idx = token_end_char_indices[j - 1]
+                        break
+                    tokens_already_used.add(j)
+
+                question_text = test_text[start_idx:end_idx]
+                question_text = re.sub(r'\s+', ' ', question_text)
+                question_text = question_text.strip()
+                questions_from_text.append(question_text)
+
+        last_token_category = y_pred[0][idx]
+
+    return questions_from_text
+

 def convert_pdf_to_instruments(file: RawFile) -> Instrument:
     # file is an object containing these properties:
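Note on the hunk above: the decoding loop in predict() turns per-token CRF labels into question strings. A span opens at a token labelled "B", or at any labelled token that follows an "O", and closes on the last token before the next "O" or "B". A minimal self-contained sketch of that logic, with invented tokens, offsets and labels standing in for the real output of convert_text_to_features and the CRF model:

# Toy stand-ins for the CRF inputs/outputs (hypothetical values):
test_text = "1. How are you? 2. How old are you?"
starts = [0, 3, 7, 11, 16, 19, 23, 27, 31]   # token start offsets
ends = [2, 6, 10, 15, 18, 22, 26, 30, 35]    # token end offsets
labels = ["B", "I", "I", "I", "B", "I", "I", "I", "I"]

questions = []
tokens_already_used = set()
last_token_category = "O"
for idx, label in enumerate(labels):
    if label != "O" and idx not in tokens_already_used:
        if last_token_category == "O" or label == "B":
            start_idx = starts[idx]
            end_idx = len(test_text)
            for j in range(idx + 1, len(labels)):
                if labels[j] in ("O", "B"):
                    end_idx = ends[j - 1]
                    break
                tokens_already_used.add(j)
            questions.append(test_text[start_idx:end_idx].strip())
    last_token_category = label

print(questions)  # ['1. How are you?', '2. How old are you?']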
@@ -60,136 +87,8 @@ def convert_pdf_to_instruments(file: RawFile) -> Instrument:
     if not file.text_content:
         file.text_content = parse_pdf_to_plain_text(file.content)  # call Tika to convert the PDF to plain text

-    # TODO: New PDF parsing algorithm should go here, together with return statement.
-
-    table_cell_texts = []
-    page_tables = file.tables
-    questions_from_tables = []
-    if len(page_tables) > 0:
-        for page_table in page_tables:
-            tables = page_table['tables']
-            for row in tables:
-                for item in row:
-                    if len(item.strip()) > 0:
-                        table_cell_texts.append(item)
-
-    X = []
-    for idx in range(len(table_cell_texts)):
-        t = table_cell_texts[idx]
-        features = [len(t),
-                    len(re_initial_num.findall(t)),
-                    len(re_initial_num_dot.findall(t))]
-        X.append(features)
-
-    if len(X) > 0:
-        X = np.asarray(X)
-
-        y_pred = rf_table_model.predict(X)
-
-        questions_from_tables = []
-        for idx in range(len(table_cell_texts)):
-            if y_pred[idx] == 1:
-                questions_from_tables.append(table_cell_texts[idx])
-
-    if True:  # text CRF model
-        questions_from_text = []
-        X = []
-
-        token_texts = []
-        token_properties = []
-
-        text = file.text_content
-        char_indices_of_newlines = set()
-        for idx, c in enumerate(text):
-            if c == "\n":
-                char_indices_of_newlines.add(idx)
-
-        char_indices_of_question_marks = set()
-        for idx, c in enumerate(text):
-            if c == "?":
-                char_indices_of_question_marks.add(idx)
-
-        tokens = list(re_word.finditer(text))
-
-        last_token_properties = {}
-
-        for token in tokens:
-            is_number = len(re_initial_num.findall(token.group()))
-            is_number_dot = len(re_initial_num_dot.findall(token.group()))
-            is_alpha = len(re_alpha.findall(token.group()))
-
-            dist_to_newline = token.start()
-            for c in range(token.start(), 1, -1):
-                if c in char_indices_of_newlines:
-                    dist_to_newline = token.start() - c
-                    break
-
-            dist_to_question_mark = len(text) - token.start()
-            for c in range(token.start(), len(text)):
-                if c in char_indices_of_question_marks:
-                    dist_to_question_mark = c - token.start()
-                    break
-
-            is_capital = int(token.group()[0] != token.group()[0].lower())
-
-            this_token_properties = {"length": len(token.group()), "is_number": is_number,
-                                     "is_alpha": is_alpha,
-                                     "is_capital": is_capital,
-                                     "is_number_dot": is_number_dot,
-                                     "dist_to_newline": dist_to_newline, "dist_to_question_mark": dist_to_question_mark,
-                                     "char_index": token.start()}
-
-            this_token_properties["prev_length"] = last_token_properties.get("length", 0)
-            this_token_properties["prev_is_alpha"] = last_token_properties.get("is_alpha", 0)
-            this_token_properties["prev_is_number"] = last_token_properties.get("is_number", 0)
-            this_token_properties["prev_is_number_dot"] = last_token_properties.get("is_number_dot", 0)
-            this_token_properties["prev_is_capital"] = last_token_properties.get("is_capital", 0)
-
-            this_token_properties["prev_prev_length"] = last_token_properties.get("prev_length", 0)
-            this_token_properties["prev_prev_is_alpha"] = last_token_properties.get("prev_is_alpha", 0)
-            this_token_properties["prev_prev_is_number"] = last_token_properties.get("prev_is_number", 0)
-            this_token_properties["prev_prev_is_number_dot"] = last_token_properties.get("prev_is_number_dot", 0)
-            this_token_properties["prev_prev_is_capital"] = last_token_properties.get("prev_is_capital", 0)
-
-            token_texts.append(token.group())
-
-            token_properties.append(this_token_properties)
-
-            last_token_properties = this_token_properties
-
-        X.append(token_properties)
-
-        y_pred = crf_text_model.predict(X)
-
-        last_token_category = "O"
-        for idx in range(len(X[0])):
-
-            if y_pred[0][idx] != "O":
-                if last_token_category == "O" or y_pred[0][idx] == "B":
-                    start_idx = tokens[idx].start()
-                    end_idx = len(text)
-                    for j in range(idx + 1, len(X[0])):
-                        if y_pred[0][j] == "O" or y_pred[0][j] == "B":
-                            end_idx = tokens[j - 1].end()
-                            break
-
-                    question_text = text[start_idx:end_idx]
-                    question_text = re.sub(r'\s+', ' ', question_text)
-                    question_text = question_text.strip()
-                    questions_from_text.append(question_text)
-
-            last_token_category = y_pred[0][idx]
+    questions_from_text = predict(file.text_content)

-    if len(questions_from_text) > len(questions_from_tables):
-        print ("Source of parsing was text CRF")
-        instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name, file_name=file.file_name)
-        print(instrument)
-        return [instrument]
-    elif len(questions_from_tables) > 0:
-        instrument = harmony.create_instrument_from_list(questions_from_tables, instrument_name=file.file_name, file_name=file.file_name)
-        return [instrument]
-    else:
-        return []
-
-    # return convert_text_to_instruments(file)
+    instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name,
+                                                     file_name=file.file_name)
+    return [instrument]
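With the random forest table model removed, the text path of convert_pdf_to_instruments reduces to the single predict() call above. A hedged usage sketch — it assumes harmony is installed with the bundled CRF pickle; the sample text and the printed output are illustrative, since the real result depends on the trained model:

from harmony.parsing.pdf_parser import predict

# Hypothetical questionnaire text; in production this string comes from
# Tika's plain-text rendering of the uploaded PDF.
plain_text = "1. Do you feel nervous?\n2. Do you have trouble sleeping?\n"

print(predict(plain_text))
# Expected shape: a list of question strings, e.g.
# ['1. Do you feel nervous?', '2. Do you have trouble sleeping?']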
src/harmony/parsing/util/feature_extraction.py

+127 −0

@@ -0,0 +1,127 @@
+'''
+MIT License
+
+Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
+Project: Harmony (https://harmonydata.ac.uk)
+Maintainer: Thomas Wood (https://fastdatascience.com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+'''
+
+import json
+import re
+
+re_word = re.compile(r'(?i)(\S+)')
+
+re_initial_num = re.compile(r'(^\d+)')
+re_contains_num = re.compile(r'\d')
+re_initial_num_dot = re.compile(r'(^\d+\.)')
+re_alpha = re.compile(r'(^[a-zA-Z]+)')
+re_bracket = re.compile(r'(?:\(|\))')
+
+
+def convert_text_to_features(text):
+    token_texts = []
+    token_start_char_indices = []
+    token_end_char_indices = []
+    token_properties = []
+
+    char_indices_of_newlines = set()
+    for idx, c in enumerate(text):
+        if c == "\n":
+            char_indices_of_newlines.add(idx)
+
+    char_indices_of_question_marks = set()
+    for idx, c in enumerate(text):
+        if c == "?":
+            char_indices_of_question_marks.add(idx)
+
+    tokens = list(re_word.finditer(text))
+
+    this_token_properties = {}
+
+    for token in tokens:
+        is_number = len(re_initial_num.findall(token.group()))
+        is_number_dot = len(re_initial_num_dot.findall(token.group()))
+        num_nums = len(re_contains_num.findall(token.group()))
+        is_alpha = len(re_alpha.findall(token.group()))
+        is_bracket = len(re_bracket.findall(token.group()))
+
+        dist_to_prev_newline = token.start()
+        for c in range(token.start(), 1, -1):
+            if c in char_indices_of_newlines:
+                dist_to_prev_newline = token.start() - c
+                break
+
+        dist_to_next_question_mark = len(text) - token.start()
+        for c in range(token.start(), len(text)):
+            if c in char_indices_of_question_marks:
+                dist_to_next_question_mark = c - token.start()
+                break
+
+        is_capital = int(token.group()[0] != token.group()[0].lower())
+
+        is_letters_and_numbers = int(is_alpha and num_nums > 0)
+
+        this_token_properties = {"length": len(token.group()), "is_number": is_number,
+                                 "is_alpha": is_alpha,
+                                 "is_capital": is_capital,
+                                 "is_letters_and_numbers": is_letters_and_numbers,
+                                 "is_bracket": is_bracket,
+                                 "is_number_dot": is_number_dot,
+                                 "num_nums": num_nums,
+                                 "dist_to_prev_newline": dist_to_prev_newline,
+                                 "dist_to_next_question_mark": dist_to_next_question_mark,
+                                 "char_index": token.start()}
+
+        token_texts.append(token.group())
+        token_start_char_indices.append(token.start())
+        token_end_char_indices.append(token.end())
+        token_properties.append(this_token_properties)
+
+    all_property_names = list(sorted(this_token_properties))
+
+    for idx in range(len(token_properties)):
+        focus_dict = token_properties[idx]
+        # Generate features including prev and next token.
+        # There was no increase in performance associated with increasing this window. (TW 19/07/2024)
+        for offset in range(-1, 2):
+            if offset == 0:
+                continue
+            j = idx + offset
+            if j >= 0 and j < len(token_properties):
+                offset_dict = token_properties[j]
+            else:
+                offset_dict = {}
+
+            for property_name in all_property_names:
+                focus_dict[f"{property_name}_{offset}"] = offset_dict.get(property_name, 0)
+
+    return token_texts, token_start_char_indices, token_end_char_indices, token_properties
+
+
+if __name__ == "__main__":
+    test_text = "this is a test123 a)"
+    token_texts, token_start_char_indices, token_end_char_indices, token_properties = convert_text_to_features(
+        test_text)
+    print(token_texts)
+    print(token_start_char_indices)
+    print(token_end_char_indices)
+    print(json.dumps(token_properties, indent=4))
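convert_text_to_features returns three parallel index lists plus one feature dict per token; the windowing pass above then mirrors every base property of the previous and next token into "_-1" and "_1" keys (defaulting to 0 past the sequence edges). Wrapped in a single-element list, as predict() does with X = [token_properties], this matches the list-of-sequences-of-feature-dicts input that an sklearn-crfsuite CRF model expects (an assumption based on the "sklearn_crf" model filename). A short sketch using the module's own test string:

from harmony.parsing.util.feature_extraction import convert_text_to_features

texts, starts, ends, props = convert_text_to_features("this is a test123 a)")
print(texts)  # ['this', 'is', 'a', 'test123', 'a)'] - whitespace tokenisation
print(sorted(props[0])[:3])
# ['char_index', 'char_index_-1', 'char_index_1'] - each base feature is
# mirrored for the previous (_-1) and next (_1) token.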

tests/test_pdf_tables.py

+10 −6
@@ -55,12 +55,16 @@

 class TestConvertPdfTables(unittest.TestCase):

-    def test_empty_pdf(self):
-
-        self.assertEqual(0, len(convert_pdf_to_instruments(pdf_empty_table)))
-
-    def test_two_questions(self):
-        self.assertEqual(2, len(convert_pdf_to_instruments(pdf_non_empty_table)[0].questions))
+    pass
+
+    # Not using tables at the moment
+    #
+    # def test_empty_pdf(self):
+    #
+    #     self.assertEqual(0, len(convert_pdf_to_instruments(pdf_empty_table)))
+    #
+    # def test_two_questions(self):
+    #     self.assertEqual(2, len(convert_pdf_to_instruments(pdf_non_empty_table)[0].questions))


 if __name__ == '__main__':
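Commenting out the table tests leaves TestConvertPdfTables empty. A hypothetical replacement test for the new text path might look like the sketch below; the fixture text is invented and the expected count of 2 is an assumption about the trained model's behaviour, so the assertion may need adjusting:

import unittest

from harmony.parsing.pdf_parser import predict


class TestPdfTextCrf(unittest.TestCase):

    def test_two_numbered_questions(self):
        # Invented fixture; assumes the CRF model finds both questions.
        text = "1. Do you feel nervous?\n2. Do you have trouble sleeping?\n"
        self.assertEqual(2, len(predict(text)))


if __name__ == '__main__':
    unittest.main()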
