From 312dbbba34b3c16ad64830e4c6907a19d2252557 Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 16 Sep 2025 21:25:38 -0400 Subject: [PATCH 01/13] Initial commit --- llm/scripts/backfill_summaries.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 llm/scripts/backfill_summaries.py diff --git a/llm/scripts/backfill_summaries.py b/llm/scripts/backfill_summaries.py new file mode 100644 index 000000000..5c72a10ce --- /dev/null +++ b/llm/scripts/backfill_summaries.py @@ -0,0 +1,18 @@ +import firebase_admin +from firebase_admin import firestore + +# Application Default credentials are automatically created. +app = firebase_admin.initialize_app() +db = firestore.client() + +bills_ref = db.collection("generalCourts/194/bills") +bills = bills_ref.get() +count = 0 +for bill in bills: + document = bill.to_dict() + if document.get("summary") is None or document.get("topics") is None: + # Notes: DocumentText _can_ be None + print(document.get("content", {}).get("DocumentText")) + print(document.get("content", {}).get("Title")) + print(document.get("content", {}).get("BillNumber")) + exit() From 897e30f8b1c9c93e54b43894d126b99ed9d5146a Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 23 Sep 2025 20:16:11 -0400 Subject: [PATCH 02/13] Fill out TODOs --- llm/scripts/backfill_summaries.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/llm/scripts/backfill_summaries.py b/llm/scripts/backfill_summaries.py index 5c72a10ce..44a25afad 100644 --- a/llm/scripts/backfill_summaries.py +++ b/llm/scripts/backfill_summaries.py @@ -10,9 +10,28 @@ count = 0 for bill in bills: document = bill.to_dict() - if document.get("summary") is None or document.get("topics") is None: - # Notes: DocumentText _can_ be None - print(document.get("content", {}).get("DocumentText")) - print(document.get("content", {}).get("Title")) - print(document.get("content", {}).get("BillNumber")) - exit() + document_text = 
document.get("content", {}).get("DocumentText") + document_title = document.get("content", {}).get("Title") + summary = document.get("summary") + + # No document text, skip it because we can't summarize it + if document_text is None: + print(f"{document['id']},skipped") + continue + + # If the summary is already populated move on + if summary is not None: + print(f"{document['id']},previous_summary") + continue + + # TODO: Generate the summary + print(f"{document['id']},generate_summary") + + # If the summary is already populated move on + topics = document.get("topics") + if topics is not None: + print(f"{document['id']},previous_topics") + continue + + # TODO: Populate the topics + print(f"{document['id']},generate_topics") From d0331dd06228de172eb974a0e58b89c0a9bbd4bd Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 23 Sep 2025 20:50:05 -0400 Subject: [PATCH 03/13] Move it so it works --- llm/backfill_summaries.py | 52 +++++++++++++++++++++++++++++++ llm/scripts/backfill_summaries.py | 37 ---------------------- 2 files changed, 52 insertions(+), 37 deletions(-) create mode 100644 llm/backfill_summaries.py delete mode 100644 llm/scripts/backfill_summaries.py diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py new file mode 100644 index 000000000..30dff9375 --- /dev/null +++ b/llm/backfill_summaries.py @@ -0,0 +1,52 @@ +import firebase_admin +from llm_functions import get_summary_api_function, get_tags_api_function_v2 +from firebase_admin import firestore +from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC + +# Application Default credentials are automatically created. 
+app = firebase_admin.initialize_app() +db = firestore.client() + +bills_ref = db.collection("generalCourts/194/bills") +bills = bills_ref.get() +count = 0 +for bill in bills: + document = bill.to_dict() + bill_id = document["id"] + document_text = document.get("content", {}).get("DocumentText") + document_title = document.get("content", {}).get("Title") + summary = document.get("summary") + + # No document text, skip it because we can't summarize it + if document_text is None: + print(f"{bill_id},skipped") + continue + + # If the summary is already populated move on + if summary is not None: + print(f"{bill_id},previous_summary") + continue + + summary = get_summary_api_function(bill_id, document_title, document_text) + if summary["status"] in [-1, -2]: + print(f"{bill_id},failed_summary") + continue + summary = summary["summary"] + print(f"summary: {summary}") + # bill.reference.update({"summary": summary}) + print(f"{bill_id},generate_summary") + + # If the summary is already populated move on + topics = document.get("topics") + if topics is not None: + print(f"{document['id']},previous_topics") + continue + tags = get_tags_api_function_v2(bill_id, document_title, summary) + if tags["status"] != 1: + print(f"{bill_id},failed_topics") + continue + topics_and_categories = get_categories_from_topics(tags["tags"], CATEGORY_BY_TOPIC) + print(f"topics_and_categories: {topics_and_categories}") + # bill.reference.update({"topics": topics_and_categories}) + print(f"{bill_id},generate_topics") + exit() diff --git a/llm/scripts/backfill_summaries.py b/llm/scripts/backfill_summaries.py deleted file mode 100644 index 44a25afad..000000000 --- a/llm/scripts/backfill_summaries.py +++ /dev/null @@ -1,37 +0,0 @@ -import firebase_admin -from firebase_admin import firestore - -# Application Default credentials are automatically created. 
-app = firebase_admin.initialize_app() -db = firestore.client() - -bills_ref = db.collection("generalCourts/194/bills") -bills = bills_ref.get() -count = 0 -for bill in bills: - document = bill.to_dict() - document_text = document.get("content", {}).get("DocumentText") - document_title = document.get("content", {}).get("Title") - summary = document.get("summary") - - # No document text, skip it because we can't summarize it - if document_text is None: - print(f"{document['id']},skipped") - continue - - # If the summary is already populated move on - if summary is not None: - print(f"{document['id']},previous_summary") - continue - - # TODO: Generate the summary - print(f"{document['id']},generate_summary") - - # If the summary is already populated move on - topics = document.get("topics") - if topics is not None: - print(f"{document['id']},previous_topics") - continue - - # TODO: Populate the topics - print(f"{document['id']},generate_topics") From a9c645722e5e499757d7a107c2dd69d23fcaffbb Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 23 Sep 2025 21:07:46 -0400 Subject: [PATCH 04/13] Update documentation on summary script --- llm/backfill_summaries.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index 30dff9375..549c46557 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -1,3 +1,8 @@ +# This script fills any missing 'summary' or 'topics' fields on the data model. +# The document must have a 'Title' and 'DocumentText' field to generate them. 
+# +# Developer notes: +# - you'll need to set the 'OPENAI_API_KEY' environment variable import firebase_admin from llm_functions import get_summary_api_function, get_tags_api_function_v2 from firebase_admin import firestore From d27c3fd53f3bcd01f1cf0db736d42543f7edd7fb Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 7 Oct 2025 20:51:02 -0400 Subject: [PATCH 05/13] Update --- llm/.gitignore | 3 +- llm/backfill_summaries.py | 98 +++++++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 42 deletions(-) diff --git a/llm/.gitignore b/llm/.gitignore index 0c2b6575d..a93be3ee3 100644 --- a/llm/.gitignore +++ b/llm/.gitignore @@ -1,4 +1,5 @@ venv/ __pycache__/ databases/ -.secret.local \ No newline at end of file +.secret.local +summaries-and-topics.csv \ No newline at end of file diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index 549c46557..959850196 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -12,46 +12,62 @@ app = firebase_admin.initialize_app() db = firestore.client() + +# Conceptually, we want to return a very consistent format when generated status reports. +# It would allow us to skip LLM regeneration when moving from dev to production. 
+def make_bill_summary(bill_id, status, summary, topics): + return f"{bill_id},{status},{summary},{topics}" + + bills_ref = db.collection("generalCourts/194/bills") bills = bills_ref.get() -count = 0 -for bill in bills: - document = bill.to_dict() - bill_id = document["id"] - document_text = document.get("content", {}).get("DocumentText") - document_title = document.get("content", {}).get("Title") - summary = document.get("summary") - - # No document text, skip it because we can't summarize it - if document_text is None: - print(f"{bill_id},skipped") - continue - - # If the summary is already populated move on - if summary is not None: - print(f"{bill_id},previous_summary") - continue - - summary = get_summary_api_function(bill_id, document_title, document_text) - if summary["status"] in [-1, -2]: - print(f"{bill_id},failed_summary") - continue - summary = summary["summary"] - print(f"summary: {summary}") - # bill.reference.update({"summary": summary}) - print(f"{bill_id},generate_summary") - - # If the summary is already populated move on - topics = document.get("topics") - if topics is not None: - print(f"{document['id']},previous_topics") - continue - tags = get_tags_api_function_v2(bill_id, document_title, summary) - if tags["status"] != 1: - print(f"{bill_id},failed_topics") - continue - topics_and_categories = get_categories_from_topics(tags["tags"], CATEGORY_BY_TOPIC) - print(f"topics_and_categories: {topics_and_categories}") - # bill.reference.update({"topics": topics_and_categories}) - print(f"{bill_id},generate_topics") - exit() +with open("./summaries-and-topics.csv", "w") as f: + f.write("bill_id,status,summary,topics\n") + for bill in bills: + document = bill.to_dict() + bill_id = document["id"] + document_text = document.get("content", {}).get("DocumentText") + document_title = document.get("content", {}).get("Title") + summary = document.get("summary") + + # No document text, skip it because we can't summarize it + if document_text is None: + 
f.write(make_bill_summary(bill_id, "skipped", None, None)) + f.write("\n") + continue + + # If the summary is already populated move on + if summary is not None: + f.write(make_bill_summary(bill_id, "previous_summary", None, None)) + f.write("\n") + continue + + summary = get_summary_api_function(bill_id, document_title, document_text) + if summary["status"] in [-1, -2]: + f.write(make_bill_summary(bill_id, "failed_summary", None, None)) + f.write("\n") + continue + summary = summary["summary"] + bill.reference.update({"summary": summary}) + f.write(make_bill_summary(bill_id, "generated_summary", summary, None)) + f.write("\n") + + # If the summary is already populated move on + topics = document.get("topics") + if topics is not None: + f.write(make_bill_summary(bill_id, "previous_topics", None, None)) + f.write("\n") + continue + tags = get_tags_api_function_v2(bill_id, document_title, summary) + if tags["status"] != 1: + f.write(make_bill_summary(bill_id, "failed_topics", None, None)) + f.write("\n") + continue + topics_and_categories = get_categories_from_topics( + tags["tags"], CATEGORY_BY_TOPIC + ) + bill.reference.update({"topics": topics_and_categories}) + f.write( + make_bill_summary(bill_id, "generated_topics", None, topics_and_categories) + ) + exit() From e7b721afc512921683d92316068ac984ed6586a4 Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 7 Oct 2025 20:56:01 -0400 Subject: [PATCH 06/13] Remove temporary exit --- llm/backfill_summaries.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index 959850196..f621637a4 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -16,7 +16,7 @@ # Conceptually, we want to return a very consistent format when generated status reports. # It would allow us to skip LLM regeneration when moving from dev to production. 
def make_bill_summary(bill_id, status, summary, topics): - return f"{bill_id},{status},{summary},{topics}" + return f"{bill_id},{status},{summary},{topics}\n" bills_ref = db.collection("generalCourts/194/bills") @@ -33,35 +33,29 @@ def make_bill_summary(bill_id, status, summary, topics): # No document text, skip it because we can't summarize it if document_text is None: f.write(make_bill_summary(bill_id, "skipped", None, None)) - f.write("\n") continue # If the summary is already populated move on if summary is not None: f.write(make_bill_summary(bill_id, "previous_summary", None, None)) - f.write("\n") continue summary = get_summary_api_function(bill_id, document_title, document_text) if summary["status"] in [-1, -2]: f.write(make_bill_summary(bill_id, "failed_summary", None, None)) - f.write("\n") continue summary = summary["summary"] bill.reference.update({"summary": summary}) f.write(make_bill_summary(bill_id, "generated_summary", summary, None)) - f.write("\n") # If the summary is already populated move on topics = document.get("topics") if topics is not None: f.write(make_bill_summary(bill_id, "previous_topics", None, None)) - f.write("\n") continue tags = get_tags_api_function_v2(bill_id, document_title, summary) if tags["status"] != 1: f.write(make_bill_summary(bill_id, "failed_topics", None, None)) - f.write("\n") continue topics_and_categories = get_categories_from_topics( tags["tags"], CATEGORY_BY_TOPIC @@ -70,4 +64,3 @@ def make_bill_summary(bill_id, status, summary, topics): f.write( make_bill_summary(bill_id, "generated_topics", None, topics_and_categories) ) - exit() From afe6ca16eeae20f9c5771afb1b1dd8ef1add3178 Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Wed, 15 Oct 2025 01:28:31 -0400 Subject: [PATCH 07/13] Progress --- llm/.gitignore | 6 +++++- llm/backfill_summaries.py | 1 + llm/normalize_summaries.py | 9 +++++++++ llm/test_normalize_summaries.py | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) create 
mode 100644 llm/normalize_summaries.py create mode 100644 llm/test_normalize_summaries.py diff --git a/llm/.gitignore b/llm/.gitignore index a93be3ee3..c280c8d59 100644 --- a/llm/.gitignore +++ b/llm/.gitignore @@ -2,4 +2,8 @@ venv/ __pycache__/ databases/ .secret.local -summaries-and-topics.csv \ No newline at end of file +fix_csv_summaries.py +run_normalize_summaries.py +summaries-and-topics.csv +summaries-and-topics-fixed.csv +summaries-and-topics-normalized.csv diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index f621637a4..e68516772 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -21,6 +21,7 @@ def make_bill_summary(bill_id, status, summary, topics): bills_ref = db.collection("generalCourts/194/bills") bills = bills_ref.get() +# TODO: I need to use csv.writer here to properly escape quotes with open("./summaries-and-topics.csv", "w") as f: f.write("bill_id,status,summary,topics\n") for bill in bills: diff --git a/llm/normalize_summaries.py b/llm/normalize_summaries.py new file mode 100644 index 000000000..15ae4a85f --- /dev/null +++ b/llm/normalize_summaries.py @@ -0,0 +1,9 @@ +import re + + +def normalize_summary(summary: str) -> str: + strip_summary = re.sub(r"^Summary:", "", summary) + lines = strip_summary.splitlines() + handle_list_items = [re.sub(r"^- ", "", x) for x in lines] + handle_remaining_whitespace = [x.strip() for x in handle_list_items if x.strip() != ""] + return " ".join(handle_remaining_whitespace) diff --git a/llm/test_normalize_summaries.py b/llm/test_normalize_summaries.py new file mode 100644 index 000000000..4a903df72 --- /dev/null +++ b/llm/test_normalize_summaries.py @@ -0,0 +1,32 @@ +import normalize_summaries + + +def test_normalize_summary_one(): + summary = """Summary: +- The bill allows Joe, the chief of police in Gravity, to continue working. 
+- The city can require annual health examinations + """ + assert normalize_summaries.normalize_summary( + summary + ) == "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations" + + +def test_normalize_summary_two(): + summary = """Summary: +The bill allows Joe, the chief of police in Gravity, to continue working. + """ + assert normalize_summaries.normalize_summary( + summary + ) == "The bill allows Joe, the chief of police in Gravity, to continue working." + +def test_normalize_summary_three(): + summary = "Summary: The bill allows Joe, the chief of police in Gravity, to continue working." + assert normalize_summaries.normalize_summary( + summary + ) == "The bill allows Joe, the chief of police in Gravity, to continue working." + +def test_normalize_summary_four(): + summary = "The bill allows Joe, the chief of police in Gravity, to continue working." + assert normalize_summaries.normalize_summary( + summary + ) == "The bill allows Joe, the chief of police in Gravity, to continue working." From 7127b23d078ce9922b3d10e06d6b1a0961425c13 Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 21 Oct 2025 20:13:41 -0400 Subject: [PATCH 08/13] Update with new CSV writer --- llm/backfill_summaries.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index e68516772..cbde4a98d 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -7,6 +7,8 @@ from llm_functions import get_summary_api_function, get_tags_api_function_v2 from firebase_admin import firestore from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC +import csv +from normalize_summaries import normalize_summary # Application Default credentials are automatically created. 
app = firebase_admin.initialize_app() @@ -16,14 +18,14 @@ # Conceptually, we want to return a very consistent format when generated status reports. # It would allow us to skip LLM regeneration when moving from dev to production. def make_bill_summary(bill_id, status, summary, topics): - return f"{bill_id},{status},{summary},{topics}\n" + return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"] bills_ref = db.collection("generalCourts/194/bills") bills = bills_ref.get() -# TODO: I need to use csv.writer here to properly escape quotes -with open("./summaries-and-topics.csv", "w") as f: - f.write("bill_id,status,summary,topics\n") +with open("./summaries-and-topics.csv", "w") as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow(["bill_id", "status", "summary", "topics"]) for bill in bills: document = bill.to_dict() bill_id = document["id"] @@ -33,35 +35,42 @@ def make_bill_summary(bill_id, status, summary, topics): # No document text, skip it because we can't summarize it if document_text is None: - f.write(make_bill_summary(bill_id, "skipped", None, None)) + csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None)) continue # If the summary is already populated move on if summary is not None: - f.write(make_bill_summary(bill_id, "previous_summary", None, None)) + csv_writer.write(make_bill_summary(bill_id, "previous_summary", None, None)) continue summary = get_summary_api_function(bill_id, document_title, document_text) if summary["status"] in [-1, -2]: - f.write(make_bill_summary(bill_id, "failed_summary", None, None)) + csv_writer.write(make_bill_summary(bill_id, "failed_summary", None, None)) continue - summary = summary["summary"] + # Note: `normalize_summary` does some post-processing to clean up the summaries + # As of 2025-10-21 this was necessary due to the LLM prompt + summary = normalize_summary(summary["summary"]) bill.reference.update({"summary": summary}) - f.write(make_bill_summary(bill_id, "generated_summary", 
summary, None)) - # If the summary is already populated move on + # If the topics are already populated, just make a note of it topics = document.get("topics") if topics is not None: - f.write(make_bill_summary(bill_id, "previous_topics", None, None)) - continue + csv_writer.write(make_bill_summary(bill_id, "previous_topics", None, None)) + tags = get_tags_api_function_v2(bill_id, document_title, summary) + # If the tags fail, make a note and at least write the summary for debugging if tags["status"] != 1: - f.write(make_bill_summary(bill_id, "failed_topics", None, None)) + csv_writer.write(make_bill_summary(bill_id, "failed_topics", None, None)) + csv_writer.write( + make_bill_summary(bill_id, "generated_summary", summary, None) + ) continue topics_and_categories = get_categories_from_topics( tags["tags"], CATEGORY_BY_TOPIC ) bill.reference.update({"topics": topics_and_categories}) - f.write( - make_bill_summary(bill_id, "generated_topics", None, topics_and_categories) + csv_writer.write( + make_bill_summary( + bill_id, "generated_topics", summary, topics_and_categories + ) ) From bfb587e1a929166c0d2c713f6a8cedc32af4f978 Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 21 Oct 2025 20:20:55 -0400 Subject: [PATCH 09/13] Minor writerow updates --- llm/backfill_summaries.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index cbde4a98d..0d8cffe51 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -40,12 +40,16 @@ def make_bill_summary(bill_id, status, summary, topics): # If the summary is already populated move on if summary is not None: - csv_writer.write(make_bill_summary(bill_id, "previous_summary", None, None)) + csv_writer.writerow( + make_bill_summary(bill_id, "previous_summary", None, None) + ) continue summary = get_summary_api_function(bill_id, document_title, document_text) if summary["status"] in [-1, -2]: - 
csv_writer.write(make_bill_summary(bill_id, "failed_summary", None, None)) + csv_writer.writerow( + make_bill_summary(bill_id, "failed_summary", None, None) + ) continue # Note: `normalize_summary` does some post-processing to clean up the summaries # As of 2025-10-21 this was necessary due to the LLM prompt @@ -55,13 +59,15 @@ def make_bill_summary(bill_id, status, summary, topics): # If the topics are already populated, just make a note of it topics = document.get("topics") if topics is not None: - csv_writer.write(make_bill_summary(bill_id, "previous_topics", None, None)) + csv_writer.writerow( + make_bill_summary(bill_id, "previous_topics", None, None) + ) tags = get_tags_api_function_v2(bill_id, document_title, summary) # If the tags fail, make a note and at least write the summary for debugging if tags["status"] != 1: - csv_writer.write(make_bill_summary(bill_id, "failed_topics", None, None)) - csv_writer.write( + csv_writer.writerow(make_bill_summary(bill_id, "failed_topics", None, None)) + csv_writer.writerow( make_bill_summary(bill_id, "generated_summary", summary, None) ) continue @@ -69,7 +75,7 @@ def make_bill_summary(bill_id, status, summary, topics): tags["tags"], CATEGORY_BY_TOPIC ) bill.reference.update({"topics": topics_and_categories}) - csv_writer.write( + csv_writer.writerow( make_bill_summary( bill_id, "generated_topics", summary, topics_and_categories ) From 2652ed938818def474a40fbb576e83bc8a0f333f Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 21 Oct 2025 20:23:27 -0400 Subject: [PATCH 10/13] Minor clean-up --- llm/.gitignore | 4 ---- llm/backfill_summaries.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/llm/.gitignore b/llm/.gitignore index c280c8d59..dcd265e35 100644 --- a/llm/.gitignore +++ b/llm/.gitignore @@ -2,8 +2,4 @@ venv/ __pycache__/ databases/ .secret.local -fix_csv_summaries.py -run_normalize_summaries.py summaries-and-topics.csv -summaries-and-topics-fixed.csv -summaries-and-topics-normalized.csv 
diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index 0d8cffe51..75367471f 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -77,6 +77,6 @@ def make_bill_summary(bill_id, status, summary, topics): bill.reference.update({"topics": topics_and_categories}) csv_writer.writerow( make_bill_summary( - bill_id, "generated_topics", summary, topics_and_categories + bill_id, "generated_summary_and_topics", summary, topics_and_categories ) ) From be10fb56433660d5b9065723bc870a950507275c Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 21 Oct 2025 20:44:50 -0400 Subject: [PATCH 11/13] Name the tests --- llm/test_normalize_summaries.py | 44 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/llm/test_normalize_summaries.py b/llm/test_normalize_summaries.py index 4a903df72..6e4d2c07b 100644 --- a/llm/test_normalize_summaries.py +++ b/llm/test_normalize_summaries.py @@ -1,32 +1,40 @@ import normalize_summaries -def test_normalize_summary_one(): +def test_normalize_summary_handles_summary_prefix_and_bullets(): summary = """Summary: - The bill allows Joe, the chief of police in Gravity, to continue working. - The city can require annual health examinations """ - assert normalize_summaries.normalize_summary( - summary - ) == "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations" + assert ( + normalize_summaries.normalize_summary(summary) + == "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations" + ) -def test_normalize_summary_two(): +def test_normalize_summary_handles_summary_prefix_and_no_bullets(): summary = """Summary: The bill allows Joe, the chief of police in Gravity, to continue working. """ - assert normalize_summaries.normalize_summary( - summary - ) == "The bill allows Joe, the chief of police in Gravity, to continue working." 
+ assert ( + normalize_summaries.normalize_summary(summary) + == "The bill allows Joe, the chief of police in Gravity, to continue working." + ) -def test_normalize_summary_three(): + +def test_normalize_summary_handles_summary_prefix_with_no_linebreak(): summary = "Summary: The bill allows Joe, the chief of police in Gravity, to continue working." - assert normalize_summaries.normalize_summary( - summary - ) == "The bill allows Joe, the chief of police in Gravity, to continue working." - -def test_normalize_summary_four(): - summary = "The bill allows Joe, the chief of police in Gravity, to continue working." - assert normalize_summaries.normalize_summary( - summary - ) == "The bill allows Joe, the chief of police in Gravity, to continue working." + assert ( + normalize_summaries.normalize_summary(summary) + == "The bill allows Joe, the chief of police in Gravity, to continue working." + ) + + +def test_normalize_summary_handles_bare_summary(): + summary = ( + "The bill allows Joe, the chief of police in Gravity, to continue working." + ) + assert ( + normalize_summaries.normalize_summary(summary) + == "The bill allows Joe, the chief of police in Gravity, to continue working." + ) From 00c76b1f7b01a027efa5bf23cb1350b15a2ed45e Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 28 Oct 2025 21:25:00 -0400 Subject: [PATCH 12/13] Address feedback --- llm/backfill_summaries.py | 42 +++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py index 75367471f..c5620b8e1 100644 --- a/llm/backfill_summaries.py +++ b/llm/backfill_summaries.py @@ -1,8 +1,21 @@ -# This script fills any missing 'summary' or 'topics' fields on the data model. -# The document must have a 'Title' and 'DocumentText' field to generate them. 
-# -# Developer notes: -# - you'll need to set the 'OPENAI_API_KEY' environment variable +"""This script fills any missing 'summary' or 'topics' fields on the data model. + +The document must have a 'Title' and 'DocumentText' field to generate them. The +script queries only the general court 194 bills, modifies the firebase database +in-place, and generates a CSV with a description of what happened. The header for +the CSV is `bill_id,status,summary,topics`. The possible statuses are: + +- `skipped` - the bill is missing a title or text, skip it +- `previous_summary` - the bill previously had a summary, skip it +- `failed_summary` - something went wrong when trying to summarize, skip it +- `previous_topics` - the bill previously had topics; noted, then they are regenerated +- `failed_topics` - topic generation failed; a `generated_summary` row is also written +- `generated_summary_and_topics` - the summary and topics were both generated + +Developer notes: +- you'll need to set the 'OPENAI_API_KEY' environment variable +""" + import firebase_admin from llm_functions import get_summary_api_function, get_tags_api_function_v2 from firebase_admin import firestore @@ -10,20 +23,27 @@ import csv from normalize_summaries import normalize_summary +# Module constants +FIREBASE_COLLECTION_PATH = "generalCourts/194/bills" +CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv" + # Application Default credentials are automatically created. app = firebase_admin.initialize_app() db = firestore.client() -# Conceptually, we want to return a very consistent format when generated status reports. -# It would allow us to skip LLM regeneration when moving from dev to production. def make_bill_summary(bill_id, status, summary, topics): + """Generate a row for csv.writerow + + The goal with this function is to not forget all the arguments to subsequent + csv.writerow calls. 
+ """ return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"] -bills_ref = db.collection("generalCourts/194/bills") +bills_ref = db.collection(FIREBASE_COLLECTION_PATH) bills = bills_ref.get() -with open("./summaries-and-topics.csv", "w") as csvfile: +with open(CSV_SUMMARY_OUTPUT, "w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(["bill_id", "status", "summary", "topics"]) for bill in bills: @@ -33,8 +53,8 @@ def make_bill_summary(bill_id, status, summary, topics): document_title = document.get("content", {}).get("Title") summary = document.get("summary") - # No document text, skip it because we can't summarize it - if document_text is None: + # No document text or title, skip it because we can't summarize it + if document_text is None or document_title is None: csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None)) continue From 100df2ba2d47ea9dee0e76e3fd52cacaf5fa614e Mon Sep 17 00:00:00 2001 From: Barry Moore Date: Tue, 28 Oct 2025 21:29:43 -0400 Subject: [PATCH 13/13] Address feedback --- llm/normalize_summaries.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/llm/normalize_summaries.py b/llm/normalize_summaries.py index 15ae4a85f..814b17692 100644 --- a/llm/normalize_summaries.py +++ b/llm/normalize_summaries.py @@ -1,3 +1,16 @@ +"""Normalize summary outputs from the LLM + +The summary prompt has some formatting prose that we don't want to persist into +the database. For example, it prefixes every summary with `Summary:`. We apply a +few post-processing steps to every summary to keep things uniform. The steps are: + +1. Remove leading `Summary:` from the input text +2. Split the input text on any newlines, e.g. those created by unordered lists +3. Remove leading `- ` from the split unordered lists +4. Strip surrounding whitespace and drop any blank lines +5. 
Put everything back together separated with spaces +""" + import re @@ -5,5 +18,7 @@ def normalize_summary(summary: str) -> str: strip_summary = re.sub(r"^Summary:", "", summary) lines = strip_summary.splitlines() handle_list_items = [re.sub(r"^- ", "", x) for x in lines] - handle_remaining_whitespace = [x.strip() for x in handle_list_items if x.strip() != ""] + handle_remaining_whitespace = [ + x.strip() for x in handle_list_items if x.strip() != "" + ] return " ".join(handle_remaining_whitespace)