24
24
SOFTWARE.
25
25
26
26
'''
27
+
28
+ import pathlib
27
29
import pickle as pkl
28
30
import re
29
31
30
- import numpy as np
31
-
32
32
import harmony
33
+ from harmony .parsing .util .feature_extraction import convert_text_to_features
33
34
from harmony .parsing .util .tika_wrapper import parse_pdf_to_plain_text
34
- # from harmony.parsing.util.tesseract_wrapper import parse_image_pdf_to_plain_text
35
- # from harmony.parsing.util.camelot_wrapper import parse_pdf_to_tables
36
35
from harmony .schemas .requests .text import RawFile , Instrument
37
36
38
- re_initial_num = re .compile (r'(^\d+)' )
39
- re_initial_num_dot = re .compile (r'(^\d+\.)' )
40
- re_word = re .compile (r'(?i)(\b[\w\']+\b)' )
41
- re_alpha = re .compile (r'(^[a-zA-Z]+)' )
42
- re_bracket = re .compile (r'(?:\(|\))' )
43
- import pathlib
44
-
45
37
# Folder containing this module; the pickled model ships alongside the code.
model_containing_folder = pathlib.Path(__file__).parent.resolve()

# Load the pre-trained sklearn CRF model used by predict() below to tag
# question tokens in PDF text.
# NOTE(review): pickle.load executes arbitrary code on load -- this file is
# bundled with the package, but never point this at an untrusted path.
with open(f"{model_containing_folder}/20240719_pdf_question_extraction_sklearn_crf_model.pkl", "rb") as f:
    crf_text_model = pkl.load(f)
52
41
42
# Predict method is taken from the training repo. Use the training repo as the master copy of the predict method.
# All training code is in https://github.com/harmonydata/pdf-questionnaire-extraction
def predict(test_text: str) -> list:
    """Extract questionnaire questions from plain text with the pre-trained CRF model.

    The text is tokenised by convert_text_to_features, the module-level
    crf_text_model tags each token ("B" begins a question, "O" is outside
    one -- presumably an "I"-style tag continues one; TODO confirm against
    the training repo), and each tagged span is sliced back out of the
    original text by character offsets.

    :param test_text: Plain text of the whole document.
    :return: List of question strings, whitespace-collapsed and stripped.
    """
    token_texts, token_start_char_indices, token_end_char_indices, token_properties = convert_text_to_features(
        test_text)

    # The CRF API expects a batch (list of token-sequence feature lists);
    # we have exactly one document, hence the single-element X.
    X = []
    X.append(token_properties)

    y_pred = crf_text_model.predict(X)

    questions_from_text = []

    # Token indices already consumed as the interior of a question span,
    # so they are not mistaken for the start of a new question later on.
    tokens_already_used = set()

    last_token_category = "O"

    for idx in range(len(X[0])):

        if y_pred[0][idx] != "O" and idx not in tokens_already_used:
            # A question starts here if we were previously outside a
            # question, or the model explicitly marked a new beginning ("B").
            if last_token_category == "O" or y_pred[0][idx] == "B":
                start_idx = token_start_char_indices[idx]
                # Default: the question runs to the end of the document
                # unless a closing boundary is found below.
                end_idx = len(test_text)
                # Walk forward to the next "O" (outside) or "B" (new
                # question) tag; the question ends at the previous token.
                for j in range(idx + 1, len(X[0])):
                    if y_pred[0][j] == "O" or y_pred[0][j] == "B":
                        end_idx = token_end_char_indices[j - 1]
                        break
                    tokens_already_used.add(j)

                # Collapse internal whitespace (e.g. line breaks inherited
                # from the PDF layout) into single spaces.
                question_text = test_text[start_idx:end_idx]
                question_text = re.sub(r'\s+', ' ', question_text)
                question_text = question_text.strip()
                questions_from_text.append(question_text)

        last_token_category = y_pred[0][idx]

    return questions_from_text
53
80
54
81
def convert_pdf_to_instruments (file : RawFile ) -> Instrument :
55
82
# file is an object containing these properties:
@@ -60,136 +87,8 @@ def convert_pdf_to_instruments(file: RawFile) -> Instrument:
60
87
if not file .text_content :
61
88
file .text_content = parse_pdf_to_plain_text (file .content ) # call Tika to convert the PDF to plain text
62
89
63
- # TODO: New PDF parsing algorithm should go here, together with return statement.
64
-
65
- table_cell_texts = []
66
- page_tables = file .tables
67
- questions_from_tables = []
68
- if len (page_tables ) > 0 :
69
- for page_table in page_tables :
70
- tables = page_table ['tables' ]
71
- for row in tables :
72
- for item in row :
73
- if len (item .strip ()) > 0 :
74
- table_cell_texts .append (item )
75
-
76
- X = []
77
- for idx in range (len (table_cell_texts )):
78
- t = table_cell_texts [idx ]
79
- features = [len (t ),
80
- len (re_initial_num .findall (t )),
81
- len (re_initial_num_dot .findall (t ))]
82
- X .append (features )
83
-
84
- if len (X ) > 0 :
85
- X = np .asarray (X )
86
-
87
- y_pred = rf_table_model .predict (X )
88
-
89
- questions_from_tables = []
90
- for idx in range (len (table_cell_texts )):
91
- if y_pred [idx ] == 1 :
92
- questions_from_tables .append (table_cell_texts [idx ])
93
-
94
-
95
- if True : # text CRF model
96
- questions_from_text = []
97
- X = []
98
-
99
- token_texts = []
100
- token_properties = []
101
-
102
- text = file .text_content
103
- char_indices_of_newlines = set ()
104
- for idx , c in enumerate (text ):
105
- if c == "\n " :
106
- char_indices_of_newlines .add (idx )
107
-
108
- char_indices_of_question_marks = set ()
109
- for idx , c in enumerate (text ):
110
- if c == "?" :
111
- char_indices_of_question_marks .add (idx )
112
-
113
- tokens = list (re_word .finditer (text ))
114
-
115
- last_token_properties = {}
116
-
117
- for token in tokens :
118
- is_number = len (re_initial_num .findall (token .group ()))
119
- is_number_dot = len (re_initial_num_dot .findall (token .group ()))
120
- is_alpha = len (re_alpha .findall (token .group ()))
121
-
122
- dist_to_newline = token .start ()
123
- for c in range (token .start (), 1 , - 1 ):
124
- if c in char_indices_of_newlines :
125
- dist_to_newline = token .start () - c
126
- break
127
-
128
- dist_to_question_mark = len (text ) - token .start ()
129
- for c in range (token .start (), len (text )):
130
- if c in char_indices_of_question_marks :
131
- dist_to_question_mark = c - token .start ()
132
- break
133
-
134
- is_capital = int (token .group ()[0 ] != token .group ()[0 ].lower ())
135
-
136
- this_token_properties = {"length" : len (token .group ()), "is_number" : is_number ,
137
- "is_alpha" : is_alpha ,
138
- "is_capital" : is_capital ,
139
- "is_number_dot" : is_number_dot ,
140
- "dist_to_newline" : dist_to_newline , "dist_to_question_mark" : dist_to_question_mark ,
141
- "char_index" : token .start ()}
142
-
143
- this_token_properties ["prev_length" ] = last_token_properties .get ("length" , 0 )
144
- this_token_properties ["prev_is_alpha" ] = last_token_properties .get ("is_alpha" , 0 )
145
- this_token_properties ["prev_is_number" ] = last_token_properties .get ("is_number" , 0 )
146
- this_token_properties ["prev_is_number_dot" ] = last_token_properties .get ("is_number_dot" , 0 )
147
- this_token_properties ["prev_is_capital" ] = last_token_properties .get ("is_capital" , 0 )
148
-
149
- this_token_properties ["prev_prev_length" ] = last_token_properties .get ("prev_length" , 0 )
150
- this_token_properties ["prev_prev_is_alpha" ] = last_token_properties .get ("prev_is_alpha" , 0 )
151
- this_token_properties ["prev_prev_is_number" ] = last_token_properties .get ("prev_is_number" , 0 )
152
- this_token_properties ["prev_prev_is_number_dot" ] = last_token_properties .get ("prev_is_number_dot" , 0 )
153
- this_token_properties ["prev_prev_is_capital" ] = last_token_properties .get ("prev_is_capital" , 0 )
154
-
155
- token_texts .append (token .group ())
156
-
157
- token_properties .append (this_token_properties )
158
-
159
- last_token_properties = this_token_properties
160
-
161
- X .append (token_properties )
162
-
163
- y_pred = crf_text_model .predict (X )
164
-
165
- last_token_category = "O"
166
- for idx in range (len (X [0 ])):
167
-
168
- if y_pred [0 ][idx ] != "O" :
169
- if last_token_category == "O" or y_pred [0 ][idx ] == "B" :
170
- start_idx = tokens [idx ].start ()
171
- end_idx = len (text )
172
- for j in range (idx + 1 , len (X [0 ])):
173
- if y_pred [0 ][j ] == "O" or y_pred [0 ][j ] == "B" :
174
- end_idx = tokens [j - 1 ].end ()
175
- break
176
-
177
- question_text = text [start_idx :end_idx ]
178
- question_text = re .sub (r'\s+' , ' ' , question_text )
179
- question_text = question_text .strip ()
180
- questions_from_text .append (question_text )
181
-
182
- last_token_category = y_pred [0 ][idx ]
90
    # Run the CRF model over the extracted document text to pull out the
    # candidate questionnaire questions.
    questions_from_text = predict(file.text_content)

    # Wrap the extracted questions in a single Harmony Instrument named
    # after the source file.
    instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name,
                                                     file_name=file.file_name)
    return [instrument]
0 commit comments