@@ -49,16 +49,16 @@ async def check_for_ordinance_info(doc, text_splitter, **kwargs):
49
49
snippet. Note that the snippet may contain other info as well,
50
50
but should encapsulate all of the ordinance text.
51
51
"""
52
- if "contains_ord_info" in doc .metadata :
52
+ if "contains_ord_info" in doc .attrs :
53
53
return doc
54
54
55
55
llm_caller = StructuredLLMCaller (** kwargs )
56
56
chunks = text_splitter .split_text (doc .text )
57
57
validator = OrdinanceValidator (llm_caller , chunks )
58
- doc .metadata ["contains_ord_info" ] = await validator .parse ()
59
- if doc .metadata ["contains_ord_info" ]:
60
- doc .metadata ["date" ] = await DateExtractor (llm_caller ).parse (doc )
61
- doc .metadata ["ordinance_text" ] = validator .ordinance_text
58
+ doc .attrs ["contains_ord_info" ] = await validator .parse ()
59
+ if doc .attrs ["contains_ord_info" ]:
60
+ doc .attrs ["date" ] = await DateExtractor (llm_caller ).parse (doc )
61
+ doc .attrs ["ordinance_text" ] = validator .ordinance_text
62
62
63
63
return doc
64
64
@@ -90,18 +90,18 @@ async def extract_ordinance_text_with_llm(doc, text_splitter, extractor):
90
90
-------
91
91
elm.web.document.BaseDocument
92
92
Document that has been parsed for ordinance text. The results of
93
- the extraction are stored in the document's metadata. In
94
- particular, the metadata will contain a
93
+ the extraction are stored in the document's metadata (attrs). In
94
+ particular, the metadata (attrs) will contain a
95
95
``"cleaned_ordinance_text"`` key that will contain the cleaned
96
96
ordinance text.
97
97
"""
98
- text_chunks = text_splitter .split_text (doc .metadata ["ordinance_text" ])
98
+ text_chunks = text_splitter .split_text (doc .attrs ["ordinance_text" ])
99
99
ordinance_text = await extractor .check_for_restrictions (text_chunks )
100
- doc .metadata ["restrictions_ordinance_text" ] = ordinance_text
100
+ doc .attrs ["restrictions_ordinance_text" ] = ordinance_text
101
101
102
102
text_chunks = text_splitter .split_text (ordinance_text )
103
103
ordinance_text = await extractor .check_for_correct_size (text_chunks )
104
- doc .metadata ["cleaned_ordinance_text" ] = ordinance_text
104
+ doc .attrs ["cleaned_ordinance_text" ] = ordinance_text
105
105
106
106
return doc
107
107
@@ -167,7 +167,7 @@ async def extract_ordinance_text_with_ngram_validation(
167
167
``"cleaned_ordinance_text"`` key that will contain the cleaned
168
168
ordinance text.
169
169
"""
170
- if not doc .metadata .get ("ordinance_text" ):
170
+ if not doc .attrs .get ("ordinance_text" ):
171
171
msg = (
172
172
"Input document has no 'ordinance_text' key or string does not "
173
173
"contain information. Please run `check_for_ordinance_info` "
@@ -203,8 +203,8 @@ async def _extract_with_ngram_check(
203
203
"""Extract ordinance info from doc and validate using ngrams."""
204
204
from elm .ords .extraction .ngrams import sentence_ngram_containment
205
205
206
- source = doc .metadata .get ("source" , "Unknown" )
207
- og_text = doc .metadata ["ordinance_text" ]
206
+ source = doc .attrs .get ("source" , "Unknown" )
207
+ og_text = doc .attrs ["ordinance_text" ]
208
208
if not og_text :
209
209
msg = (
210
210
"Document missing original ordinance text! No extraction "
@@ -221,7 +221,7 @@ async def _extract_with_ngram_check(
221
221
doc = await extract_ordinance_text_with_llm (
222
222
doc , text_splitter , extractor
223
223
)
224
- cleaned_text = doc .metadata ["cleaned_ordinance_text" ]
224
+ cleaned_text = doc .attrs ["cleaned_ordinance_text" ]
225
225
if not cleaned_text :
226
226
logger .debug (
227
227
"No cleaned text found after extraction on attempt %d "
@@ -256,7 +256,7 @@ async def _extract_with_ngram_check(
256
256
source ,
257
257
)
258
258
else :
259
- doc .metadata ["cleaned_ordinance_text" ] = best_summary
259
+ doc .attrs ["cleaned_ordinance_text" ] = best_summary
260
260
msg = (
261
261
f"Ngram check failed after { num_tries } . LLM hallucination in "
262
262
"cleaned ordinance text is extremely likely! Proceed with "
@@ -294,7 +294,7 @@ async def extract_ordinance_values(doc, **kwargs):
294
294
particular, the metadata will contain an ``"ordinance_values"``
295
295
key that will contain the DataFrame with ordinance values.
296
296
"""
297
- if not doc .metadata .get ("cleaned_ordinance_text" ):
297
+ if not doc .attrs .get ("cleaned_ordinance_text" ):
298
298
msg = (
299
299
"Input document has no 'cleaned_ordinance_text' key or string "
300
300
"does not contain info. Please run "
@@ -305,6 +305,6 @@ async def extract_ordinance_values(doc, **kwargs):
305
305
return doc
306
306
307
307
parser = StructuredOrdinanceParser (** kwargs )
308
- text = doc .metadata ["cleaned_ordinance_text" ]
309
- doc .metadata ["ordinance_values" ] = await parser .parse (text )
308
+ text = doc .attrs ["cleaned_ordinance_text" ]
309
+ doc .attrs ["ordinance_values" ] = await parser .parse (text )
310
310
return doc
0 commit comments