Skip to content

Commit c80b26c

Browse files
authored
Merge pull request #51 from NREL/pp/doc_updates
Some QOL updates to the Documents class
2 parents d2e56d3 + a06bd54 commit c80b26c

19 files changed

+140
-100
lines changed

docs/source/dev/ords_architecture.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ for multiprocessing tasks.
407407
408408
content = ...
409409
doc = HTMLDocument([content])
410-
doc.text, doc.raw_pages, doc.metadata
410+
doc.text, doc.raw_pages, doc.attrs
411411
412412
--------------------------------------------------------------------------------------------------------------------------------------------------
413413

elm/ords/download.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ async def _down_select_docs_correct_content(docs, location, **kwargs):
136136
async def _contains_ords(doc, **kwargs):
137137
"""Helper coroutine that checks for ordinance info. """
138138
doc = await check_for_ordinance_info(doc, **kwargs)
139-
return doc.metadata.get("contains_ord_info", False)
139+
return doc.attrs.get("contains_ord_info", False)
140140

141141

142142
def _sort_final_ord_docs(all_ord_docs):
@@ -149,5 +149,5 @@ def _sort_final_ord_docs(all_ord_docs):
149149

150150
def _ord_doc_sorting_key(doc):
151151
"""All text sorting key"""
152-
year, month, day = doc.metadata.get("date", (-1, -1, -1))
152+
year, month, day = doc.attrs.get("date", (-1, -1, -1))
153153
return year, isinstance(doc, PDFDocument), -1 * len(doc.text), month, day

elm/ords/extraction/apply.py

+18-18
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,16 @@ async def check_for_ordinance_info(doc, text_splitter, **kwargs):
4949
snippet. Note that the snippet may contain other info as well,
5050
but should encapsulate all of the ordinance text.
5151
"""
52-
if "contains_ord_info" in doc.metadata:
52+
if "contains_ord_info" in doc.attrs:
5353
return doc
5454

5555
llm_caller = StructuredLLMCaller(**kwargs)
5656
chunks = text_splitter.split_text(doc.text)
5757
validator = OrdinanceValidator(llm_caller, chunks)
58-
doc.metadata["contains_ord_info"] = await validator.parse()
59-
if doc.metadata["contains_ord_info"]:
60-
doc.metadata["date"] = await DateExtractor(llm_caller).parse(doc)
61-
doc.metadata["ordinance_text"] = validator.ordinance_text
58+
doc.attrs["contains_ord_info"] = await validator.parse()
59+
if doc.attrs["contains_ord_info"]:
60+
doc.attrs["date"] = await DateExtractor(llm_caller).parse(doc)
61+
doc.attrs["ordinance_text"] = validator.ordinance_text
6262

6363
return doc
6464

@@ -90,18 +90,18 @@ async def extract_ordinance_text_with_llm(doc, text_splitter, extractor):
9090
-------
9191
elm.web.document.BaseDocument
9292
Document that has been parsed for ordinance text. The results of
93-
the extraction are stored in the document's metadata. In
94-
particular, the metadata will contain a
93+
the extraction are stored in the document's metadata (attrs). In
94+
particular, the metadata (attrs) will contain a
9595
``"cleaned_ordinance_text"`` key that will contain the cleaned
9696
ordinance text.
9797
"""
98-
text_chunks = text_splitter.split_text(doc.metadata["ordinance_text"])
98+
text_chunks = text_splitter.split_text(doc.attrs["ordinance_text"])
9999
ordinance_text = await extractor.check_for_restrictions(text_chunks)
100-
doc.metadata["restrictions_ordinance_text"] = ordinance_text
100+
doc.attrs["restrictions_ordinance_text"] = ordinance_text
101101

102102
text_chunks = text_splitter.split_text(ordinance_text)
103103
ordinance_text = await extractor.check_for_correct_size(text_chunks)
104-
doc.metadata["cleaned_ordinance_text"] = ordinance_text
104+
doc.attrs["cleaned_ordinance_text"] = ordinance_text
105105

106106
return doc
107107

@@ -167,7 +167,7 @@ async def extract_ordinance_text_with_ngram_validation(
167167
``"cleaned_ordinance_text"`` key that will contain the cleaned
168168
ordinance text.
169169
"""
170-
if not doc.metadata.get("ordinance_text"):
170+
if not doc.attrs.get("ordinance_text"):
171171
msg = (
172172
"Input document has no 'ordinance_text' key or string does not "
173173
"contain information. Please run `check_for_ordinance_info` "
@@ -203,8 +203,8 @@ async def _extract_with_ngram_check(
203203
"""Extract ordinance info from doc and validate using ngrams."""
204204
from elm.ords.extraction.ngrams import sentence_ngram_containment
205205

206-
source = doc.metadata.get("source", "Unknown")
207-
og_text = doc.metadata["ordinance_text"]
206+
source = doc.attrs.get("source", "Unknown")
207+
og_text = doc.attrs["ordinance_text"]
208208
if not og_text:
209209
msg = (
210210
"Document missing original ordinance text! No extraction "
@@ -221,7 +221,7 @@ async def _extract_with_ngram_check(
221221
doc = await extract_ordinance_text_with_llm(
222222
doc, text_splitter, extractor
223223
)
224-
cleaned_text = doc.metadata["cleaned_ordinance_text"]
224+
cleaned_text = doc.attrs["cleaned_ordinance_text"]
225225
if not cleaned_text:
226226
logger.debug(
227227
"No cleaned text found after extraction on attempt %d "
@@ -256,7 +256,7 @@ async def _extract_with_ngram_check(
256256
source,
257257
)
258258
else:
259-
doc.metadata["cleaned_ordinance_text"] = best_summary
259+
doc.attrs["cleaned_ordinance_text"] = best_summary
260260
msg = (
261261
f"Ngram check failed after {num_tries}. LLM hallucination in "
262262
"cleaned ordinance text is extremely likely! Proceed with "
@@ -294,7 +294,7 @@ async def extract_ordinance_values(doc, **kwargs):
294294
particular, the metadata will contain an ``"ordinance_values"``
295295
key that will contain the DataFrame with ordinance values.
296296
"""
297-
if not doc.metadata.get("cleaned_ordinance_text"):
297+
if not doc.attrs.get("cleaned_ordinance_text"):
298298
msg = (
299299
"Input document has no 'cleaned_ordinance_text' key or string "
300300
"does not contain info. Please run "
@@ -305,6 +305,6 @@ async def extract_ordinance_values(doc, **kwargs):
305305
return doc
306306

307307
parser = StructuredOrdinanceParser(**kwargs)
308-
text = doc.metadata["cleaned_ordinance_text"]
309-
doc.metadata["ordinance_values"] = await parser.parse(text)
308+
text = doc.attrs["cleaned_ordinance_text"]
309+
doc.attrs["ordinance_values"] = await parser.parse(text)
310310
return doc

elm/ords/process.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -530,8 +530,8 @@ async def process_county(
530530
await _record_time_and_usage(start_time, **kwargs)
531531
return None
532532

533-
doc.metadata["location"] = county
534-
doc.metadata["location_name"] = county.full_name
533+
doc.attrs["location"] = county
534+
doc.attrs["location_name"] = county.full_name
535535
await _record_usage(**kwargs)
536536

537537
doc = await extract_ordinance_text_with_ngram_validation(
@@ -550,7 +550,7 @@ async def process_county(
550550
"%d ordinance value(s) found for %s. Outputs are here: '%s'",
551551
ord_count,
552552
county.full_name,
553-
doc.metadata["ord_db_fp"],
553+
doc.attrs["ord_db_fp"],
554554
)
555555
else:
556556
logger.info("No ordinances found for %s.", county.full_name)
@@ -579,21 +579,21 @@ async def _record_time_and_usage(start_time, **kwargs):
579579
async def _move_file_to_out_dir(doc):
580580
"""Move PDF or HTML text file to output directory."""
581581
out_fp = await FileMover.call(doc)
582-
doc.metadata["out_fp"] = out_fp
582+
doc.attrs["out_fp"] = out_fp
583583
return doc
584584

585585

586586
async def _write_cleaned_text(doc):
587587
"""Write cleaned text to `clean_dir`."""
588588
out_fp = await CleanedFileWriter.call(doc)
589-
doc.metadata["cleaned_fp"] = out_fp
589+
doc.attrs["cleaned_fp"] = out_fp
590590
return doc
591591

592592

593593
async def _write_ord_db(doc):
594594
"""Write ordinance database to `county_dbs_dir`."""
595595
out_fp = await OrdDBFileWriter.call(doc)
596-
doc.metadata["ord_db_fp"] = out_fp
596+
doc.attrs["ord_db_fp"] = out_fp
597597
return doc
598598

599599

@@ -628,10 +628,10 @@ def _num_ords_in_doc(doc):
628628
if doc is None:
629629
return 0
630630

631-
if "ordinance_values" not in doc.metadata:
631+
if "ordinance_values" not in doc.attrs:
632632
return 0
633633

634-
ord_vals = doc.metadata["ordinance_values"]
634+
ord_vals = doc.attrs["ordinance_values"]
635635
if ord_vals.empty:
636636
return 0
637637

@@ -666,16 +666,16 @@ def _docs_to_db(docs):
666666

667667
def _db_results(doc):
668668
"""Extract results from doc metadata to DataFrame."""
669-
results = doc.metadata.get("ordinance_values")
669+
results = doc.attrs.get("ordinance_values")
670670
if results is None:
671671
return None
672672

673-
results["source"] = doc.metadata.get("source")
674-
year = doc.metadata.get("date", (None, None, None))[0]
673+
results["source"] = doc.attrs.get("source")
674+
year = doc.attrs.get("date", (None, None, None))[0]
675675
results["ord_year"] = year if year is not None and year > 0 else None
676676
results["last_updated"] = datetime.now().strftime("%m/%d/%Y")
677677

678-
location = doc.metadata["location"]
678+
location = doc.attrs["location"]
679679
results["FIPS"] = location.fips
680680
results["county"] = location.name
681681
results["state"] = location.state

elm/ords/services/threaded.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616

1717
def _move_file(doc, out_dir):
1818
"""Move a file from a temp directory to an output directory."""
19-
cached_fp = doc.metadata.get("cache_fn")
19+
cached_fp = doc.attrs.get("cache_fn")
2020
if cached_fp is None:
2121
return
2222

2323
cached_fp = Path(cached_fp)
24-
out_fn = doc.metadata.get("location_name", cached_fp.name)
24+
out_fn = doc.attrs.get("location_name", cached_fp.name)
2525
if not out_fn.endswith(cached_fp.suffix):
2626
out_fn = f"{out_fn}{cached_fp.suffix}"
2727

@@ -32,8 +32,8 @@ def _move_file(doc, out_dir):
3232

3333
def _write_cleaned_file(doc, out_dir):
3434
"""Write cleaned ordinance text to directory."""
35-
cleaned_text = doc.metadata.get("cleaned_ordinance_text")
36-
location_name = doc.metadata.get("location_name")
35+
cleaned_text = doc.attrs.get("cleaned_ordinance_text")
36+
location_name = doc.attrs.get("location_name")
3737

3838
if cleaned_text is None or location_name is None:
3939
return
@@ -46,8 +46,8 @@ def _write_cleaned_file(doc, out_dir):
4646

4747
def _write_ord_db(doc, out_dir):
4848
"""Write parsed ordinance database to directory."""
49-
ord_db = doc.metadata.get("ordinance_values")
50-
location_name = doc.metadata.get("location_name")
49+
ord_db = doc.attrs.get("ordinance_values")
50+
location_name = doc.attrs.get("location_name")
5151

5252
if ord_db is None or location_name is None:
5353
return

elm/ords/validation/location.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ async def check(self, doc, county, state):
204204
`True` if the doc contents pertain to the input county.
205205
`False` otherwise.
206206
"""
207-
source = doc.metadata.get("source")
207+
source = doc.attrs.get("source")
208208
logger.debug(
209209
"Validating document from source: %s", source or "Unknown"
210210
)
@@ -280,7 +280,7 @@ async def _validator_check_for_doc(validator, doc, score_thresh=0.8, **kwargs):
280280
"%s score is %.2f for doc from source %s (Pass: %s)",
281281
validator.__class__.__name__,
282282
score,
283-
doc.metadata.get("source", "Unknown"),
283+
doc.attrs.get("source", "Unknown"),
284284
str(score > score_thresh),
285285
)
286286
return score > score_thresh

elm/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
ELM version number
33
"""
44

5-
__version__ = "0.0.10"
5+
__version__ = "0.0.11"

elm/web/document.py

+28-10
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from functools import cached_property
66
import logging
77

8+
import pandas as pd
9+
810
from elm.utilities.parse import (
911
combine_pages,
1012
clean_headers,
@@ -39,20 +41,37 @@ class BaseDocument(ABC):
3941
.. end desc
4042
"""
4143

42-
def __init__(self, pages, metadata=None):
44+
def __init__(self, pages, attrs=None):
4345
"""
4446
4547
Parameters
4648
----------
4749
pages : iterable
4850
Iterable of strings, where each string is a page of a
4951
document.
50-
metadata : dict, optional
52+
attrs : dict, optional
5153
Optional dict containing metadata for the document.
5254
By default, ``None``.
5355
"""
5456
self.pages = remove_blank_pages(pages)
55-
self.metadata = metadata or {}
57+
self.attrs = attrs or {}
58+
59+
def __repr__(self):
60+
header = (f"{self.__class__.__name__} with {len(self.pages):,} "
61+
"pages\nAttrs:")
62+
if not self.attrs:
63+
return f"{header} None"
64+
65+
attrs = {}
66+
for k, v in self.attrs.items():
67+
if isinstance(v, pd.DataFrame):
68+
v = f"DataFrame with {len(v):,} rows"
69+
attrs[k] = v
70+
71+
indent = max(len(k) for k in attrs) + 2
72+
attrs = "\n".join([f"{k:>{indent}}:\t{v}"
73+
for k, v in attrs.items()])
74+
return f"{header}\n{attrs}"
5675

5776
@property
5877
def empty(self):
@@ -118,7 +137,7 @@ class PDFDocument(BaseDocument):
118137
def __init__(
119138
self,
120139
pages,
121-
metadata=None,
140+
attrs=None,
122141
percent_raw_pages_to_keep=25,
123142
max_raw_pages=18,
124143
num_end_pages_to_keep=2,
@@ -131,8 +150,7 @@ def __init__(
131150
pages : iterable
132151
Iterable of strings, where each string is a page of a
133152
document.
134-
metadata : str, optional
135-
metadata : dict, optional
153+
attrs : dict, optional
136154
Optional dict containing metadata for the document.
137155
By default, ``None``.
138156
percent_raw_pages_to_keep : int, optional
@@ -153,7 +171,7 @@ def __init__(
153171
to the :func:`~elm.utilities.parse.clean_headers`
154172
function. By default, ``None``.
155173
"""
156-
super().__init__(pages, metadata=metadata)
174+
super().__init__(pages, attrs=attrs)
157175
self.percent_raw_pages_to_keep = percent_raw_pages_to_keep
158176
self.max_raw_pages = min(len(self.pages), max_raw_pages)
159177
self.num_end_pages_to_keep = num_end_pages_to_keep
@@ -244,7 +262,7 @@ class HTMLDocument(BaseDocument):
244262
def __init__(
245263
self,
246264
pages,
247-
metadata=None,
265+
attrs=None,
248266
html_table_to_markdown_kwargs=None,
249267
ignore_html_links=True,
250268
text_splitter=None,
@@ -256,7 +274,7 @@ def __init__(
256274
pages : iterable
257275
Iterable of strings, where each string is a page of a
258276
document.
259-
metadata : dict, optional
277+
attrs : dict, optional
260278
Optional dict containing metadata for the document.
261279
By default, ``None``.
262280
html_table_to_markdown_kwargs : dict, optional
@@ -275,7 +293,7 @@ def __init__(
275293
By default, ``None``, which means the original pages input
276294
becomes the raw pages attribute.
277295
"""
278-
super().__init__(pages, metadata=metadata)
296+
super().__init__(pages, attrs=attrs)
279297
self.html_table_to_markdown_kwargs = deepcopy(
280298
self.HTML_TABLE_TO_MARKDOWN_KWARGS
281299
)

elm/web/file_loader.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ async def fetch(self, url):
181181
async def _fetch_doc_with_url_in_metadata(self, url):
182182
"""Fetch doc contents and add URL to metadata"""
183183
doc, raw_content = await self._fetch_doc(url)
184-
doc.metadata["source"] = url
184+
doc.attrs["source"] = url
185185
return doc, raw_content
186186

187187
async def _fetch_doc(self, url):
@@ -239,5 +239,5 @@ async def _cache_doc(self, doc, raw_content):
239239

240240
cache_fn = await self.file_cache_coroutine(doc, raw_content)
241241
if cache_fn is not None:
242-
doc.metadata["cache_fn"] = cache_fn
242+
doc.attrs["cache_fn"] = cache_fn
243243
return doc

0 commit comments

Comments
 (0)