From 312dbbba34b3c16ad64830e4c6907a19d2252557 Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 16 Sep 2025 21:25:38 -0400
Subject: [PATCH 01/13] Initial commit

---
 llm/scripts/backfill_summaries.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 llm/scripts/backfill_summaries.py

diff --git a/llm/scripts/backfill_summaries.py b/llm/scripts/backfill_summaries.py
new file mode 100644
index 000000000..5c72a10ce
--- /dev/null
+++ b/llm/scripts/backfill_summaries.py
@@ -0,0 +1,18 @@
+import firebase_admin
+from firebase_admin import firestore
+
+# Application Default credentials are automatically created.
+app = firebase_admin.initialize_app()
+db = firestore.client()
+
+bills_ref = db.collection("generalCourts/194/bills")
+bills = bills_ref.get()
+count = 0
+for bill in bills:
+    document = bill.to_dict()
+    if document.get("summary") is None or document.get("topics") is None:
+        # Notes: DocumentText _can_ be None
+        print(document.get("content", {}).get("DocumentText"))
+        print(document.get("content", {}).get("Title"))
+        print(document.get("content", {}).get("BillNumber"))
+        exit()

From 897e30f8b1c9c93e54b43894d126b99ed9d5146a Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 23 Sep 2025 20:16:11 -0400
Subject: [PATCH 02/13] Fill out TODOs

---
 llm/scripts/backfill_summaries.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/llm/scripts/backfill_summaries.py b/llm/scripts/backfill_summaries.py
index 5c72a10ce..44a25afad 100644
--- a/llm/scripts/backfill_summaries.py
+++ b/llm/scripts/backfill_summaries.py
@@ -10,9 +10,28 @@
 count = 0
 for bill in bills:
     document = bill.to_dict()
-    if document.get("summary") is None or document.get("topics") is None:
-        # Notes: DocumentText _can_ be None
-        print(document.get("content", {}).get("DocumentText"))
-        print(document.get("content", {}).get("Title"))
-        print(document.get("content", {}).get("BillNumber"))
-        exit()
+    document_text = document.get("content", {}).get("DocumentText")
+    document_title = document.get("content", {}).get("Title")
+    summary = document.get("summary")
+
+    # No document text, skip it because we can't summarize it
+    if document_text is None:
+        print(f"{document['id']},skipped")
+        continue
+
+    # If the summary is already populated move on
+    if summary is not None:
+        print(f"{document['id']},previous_summary")
+        continue
+
+    # TODO: Generate the summary
+    print(f"{document['id']},generate_summary")
+
+    # If the summary is already populated move on
+    topics = document.get("topics")
+    if topics is not None:
+        print(f"{document['id']},previous_topics")
+        continue
+
+    # TODO: Populate the topics
+    print(f"{document['id']},generate_topics")

From d0331dd06228de172eb974a0e58b89c0a9bbd4bd Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 23 Sep 2025 20:50:05 -0400
Subject: [PATCH 03/13] Move it so it works

---
 llm/backfill_summaries.py         | 52 +++++++++++++++++++++++++++++++
 llm/scripts/backfill_summaries.py | 37 ----------------------
 2 files changed, 52 insertions(+), 37 deletions(-)
 create mode 100644 llm/backfill_summaries.py
 delete mode 100644 llm/scripts/backfill_summaries.py

diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
new file mode 100644
index 000000000..30dff9375
--- /dev/null
+++ b/llm/backfill_summaries.py
@@ -0,0 +1,52 @@
+import firebase_admin
+from llm_functions import get_summary_api_function, get_tags_api_function_v2
+from firebase_admin import firestore
+from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC
+
+# Application Default credentials are automatically created.
+app = firebase_admin.initialize_app()
+db = firestore.client()
+
+bills_ref = db.collection("generalCourts/194/bills")
+bills = bills_ref.get()
+count = 0
+for bill in bills:
+    document = bill.to_dict()
+    bill_id = document["id"]
+    document_text = document.get("content", {}).get("DocumentText")
+    document_title = document.get("content", {}).get("Title")
+    summary = document.get("summary")
+
+    # No document text, skip it because we can't summarize it
+    if document_text is None:
+        print(f"{bill_id},skipped")
+        continue
+
+    # If the summary is already populated move on
+    if summary is not None:
+        print(f"{bill_id},previous_summary")
+        continue
+
+    summary = get_summary_api_function(bill_id, document_title, document_text)
+    if summary["status"] in [-1, -2]:
+        print(f"{bill_id},failed_summary")
+        continue
+    summary = summary["summary"]
+    print(f"summary: {summary}")
+    # bill.reference.update({"summary": summary})
+    print(f"{bill_id},generate_summary")
+
+    # If the summary is already populated move on
+    topics = document.get("topics")
+    if topics is not None:
+        print(f"{document['id']},previous_topics")
+        continue
+    tags = get_tags_api_function_v2(bill_id, document_title, summary)
+    if tags["status"] != 1:
+        print(f"{bill_id},failed_topics")
+        continue
+    topics_and_categories = get_categories_from_topics(tags["tags"], CATEGORY_BY_TOPIC)
+    print(f"topics_and_categories: {topics_and_categories}")
+    # bill.reference.update({"topics": topics_and_categories})
+    print(f"{bill_id},generate_topics")
+    exit()
diff --git a/llm/scripts/backfill_summaries.py b/llm/scripts/backfill_summaries.py
deleted file mode 100644
index 44a25afad..000000000
--- a/llm/scripts/backfill_summaries.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import firebase_admin
-from firebase_admin import firestore
-
-# Application Default credentials are automatically created.
-app = firebase_admin.initialize_app()
-db = firestore.client()
-
-bills_ref = db.collection("generalCourts/194/bills")
-bills = bills_ref.get()
-count = 0
-for bill in bills:
-    document = bill.to_dict()
-    document_text = document.get("content", {}).get("DocumentText")
-    document_title = document.get("content", {}).get("Title")
-    summary = document.get("summary")
-
-    # No document text, skip it because we can't summarize it
-    if document_text is None:
-        print(f"{document['id']},skipped")
-        continue
-
-    # If the summary is already populated move on
-    if summary is not None:
-        print(f"{document['id']},previous_summary")
-        continue
-
-    # TODO: Generate the summary
-    print(f"{document['id']},generate_summary")
-
-    # If the summary is already populated move on
-    topics = document.get("topics")
-    if topics is not None:
-        print(f"{document['id']},previous_topics")
-        continue
-
-    # TODO: Populate the topics
-    print(f"{document['id']},generate_topics")

From a9c645722e5e499757d7a107c2dd69d23fcaffbb Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 23 Sep 2025 21:07:46 -0400
Subject: [PATCH 04/13] Update documentation on summary script

---
 llm/backfill_summaries.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index 30dff9375..549c46557 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -1,3 +1,8 @@
+# This script fills any missing 'summary' or 'topics' fields on the data model.
+# The document must have a 'Title' and 'DocumentText' field to generate them.
+#
+# Developer notes:
+# - you'll need to set the 'OPENAI_API_KEY' environment variable
 import firebase_admin
 from llm_functions import get_summary_api_function, get_tags_api_function_v2
 from firebase_admin import firestore

From d27c3fd53f3bcd01f1cf0db736d42543f7edd7fb Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 7 Oct 2025 20:51:02 -0400
Subject: [PATCH 05/13] Update

---
 llm/.gitignore            |  3 +-
 llm/backfill_summaries.py | 98 +++++++++++++++++++++++----------------
 2 files changed, 59 insertions(+), 42 deletions(-)

diff --git a/llm/.gitignore b/llm/.gitignore
index 0c2b6575d..a93be3ee3 100644
--- a/llm/.gitignore
+++ b/llm/.gitignore
@@ -1,4 +1,5 @@
 venv/
 __pycache__/
 databases/
-.secret.local
\ No newline at end of file
+.secret.local
+summaries-and-topics.csv
\ No newline at end of file
diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index 549c46557..959850196 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -12,46 +12,62 @@
 app = firebase_admin.initialize_app()
 db = firestore.client()
 
+
+# Conceptually, we want to return a very consistent format when generated status reports.
+# It would allow us to skip LLM regeneration when moving from dev to production.
+def make_bill_summary(bill_id, status, summary, topics):
+    return f"{bill_id},{status},{summary},{topics}"
+
+
 bills_ref = db.collection("generalCourts/194/bills")
 bills = bills_ref.get()
-count = 0
-for bill in bills:
-    document = bill.to_dict()
-    bill_id = document["id"]
-    document_text = document.get("content", {}).get("DocumentText")
-    document_title = document.get("content", {}).get("Title")
-    summary = document.get("summary")
-
-    # No document text, skip it because we can't summarize it
-    if document_text is None:
-        print(f"{bill_id},skipped")
-        continue
-
-    # If the summary is already populated move on
-    if summary is not None:
-        print(f"{bill_id},previous_summary")
-        continue
-
-    summary = get_summary_api_function(bill_id, document_title, document_text)
-    if summary["status"] in [-1, -2]:
-        print(f"{bill_id},failed_summary")
-        continue
-    summary = summary["summary"]
-    print(f"summary: {summary}")
-    # bill.reference.update({"summary": summary})
-    print(f"{bill_id},generate_summary")
-
-    # If the summary is already populated move on
-    topics = document.get("topics")
-    if topics is not None:
-        print(f"{document['id']},previous_topics")
-        continue
-    tags = get_tags_api_function_v2(bill_id, document_title, summary)
-    if tags["status"] != 1:
-        print(f"{bill_id},failed_topics")
-        continue
-    topics_and_categories = get_categories_from_topics(tags["tags"], CATEGORY_BY_TOPIC)
-    print(f"topics_and_categories: {topics_and_categories}")
-    # bill.reference.update({"topics": topics_and_categories})
-    print(f"{bill_id},generate_topics")
-    exit()
+with open("./summaries-and-topics.csv", "w") as f:
+    f.write("bill_id,status,summary,topics\n")
+    for bill in bills:
+        document = bill.to_dict()
+        bill_id = document["id"]
+        document_text = document.get("content", {}).get("DocumentText")
+        document_title = document.get("content", {}).get("Title")
+        summary = document.get("summary")
+
+        # No document text, skip it because we can't summarize it
+        if document_text is None:
+            f.write(make_bill_summary(bill_id, "skipped", None, None))
+            f.write("\n")
+            continue
+
+        # If the summary is already populated move on
+        if summary is not None:
+            f.write(make_bill_summary(bill_id, "previous_summary", None, None))
+            f.write("\n")
+            continue
+
+        summary = get_summary_api_function(bill_id, document_title, document_text)
+        if summary["status"] in [-1, -2]:
+            f.write(make_bill_summary(bill_id, "failed_summary", None, None))
+            f.write("\n")
+            continue
+        summary = summary["summary"]
+        bill.reference.update({"summary": summary})
+        f.write(make_bill_summary(bill_id, "generated_summary", summary, None))
+        f.write("\n")
+
+        # If the summary is already populated move on
+        topics = document.get("topics")
+        if topics is not None:
+            f.write(make_bill_summary(bill_id, "previous_topics", None, None))
+            f.write("\n")
+            continue
+        tags = get_tags_api_function_v2(bill_id, document_title, summary)
+        if tags["status"] != 1:
+            f.write(make_bill_summary(bill_id, "failed_topics", None, None))
+            f.write("\n")
+            continue
+        topics_and_categories = get_categories_from_topics(
+            tags["tags"], CATEGORY_BY_TOPIC
+        )
+        bill.reference.update({"topics": topics_and_categories})
+        f.write(
+            make_bill_summary(bill_id, "generated_topics", None, topics_and_categories)
+        )
+        exit()

From e7b721afc512921683d92316068ac984ed6586a4 Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 7 Oct 2025 20:56:01 -0400
Subject: [PATCH 06/13] Remove temporary exit

---
 llm/backfill_summaries.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index 959850196..f621637a4 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -16,7 +16,7 @@
 # Conceptually, we want to return a very consistent format when generated status reports.
 # It would allow us to skip LLM regeneration when moving from dev to production.
 def make_bill_summary(bill_id, status, summary, topics):
-    return f"{bill_id},{status},{summary},{topics}"
+    return f"{bill_id},{status},{summary},{topics}\n"
 
 
 bills_ref = db.collection("generalCourts/194/bills")
@@ -33,35 +33,29 @@ def make_bill_summary(bill_id, status, summary, topics):
         # No document text, skip it because we can't summarize it
         if document_text is None:
             f.write(make_bill_summary(bill_id, "skipped", None, None))
-            f.write("\n")
             continue
 
         # If the summary is already populated move on
         if summary is not None:
             f.write(make_bill_summary(bill_id, "previous_summary", None, None))
-            f.write("\n")
             continue
 
         summary = get_summary_api_function(bill_id, document_title, document_text)
         if summary["status"] in [-1, -2]:
             f.write(make_bill_summary(bill_id, "failed_summary", None, None))
-            f.write("\n")
             continue
         summary = summary["summary"]
         bill.reference.update({"summary": summary})
         f.write(make_bill_summary(bill_id, "generated_summary", summary, None))
-        f.write("\n")
 
         # If the summary is already populated move on
         topics = document.get("topics")
         if topics is not None:
             f.write(make_bill_summary(bill_id, "previous_topics", None, None))
-            f.write("\n")
             continue
         tags = get_tags_api_function_v2(bill_id, document_title, summary)
         if tags["status"] != 1:
             f.write(make_bill_summary(bill_id, "failed_topics", None, None))
-            f.write("\n")
             continue
         topics_and_categories = get_categories_from_topics(
             tags["tags"], CATEGORY_BY_TOPIC
@@ -70,4 +64,3 @@ def make_bill_summary(bill_id, status, summary, topics):
         f.write(
             make_bill_summary(bill_id, "generated_topics", None, topics_and_categories)
         )
-        exit()

From afe6ca16eeae20f9c5771afb1b1dd8ef1add3178 Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Wed, 15 Oct 2025 01:28:31 -0400
Subject: [PATCH 07/13] Progress

---
 llm/.gitignore                  |  6 +++++-
 llm/backfill_summaries.py       |  1 +
 llm/normalize_summaries.py      |  9 +++++++++
 llm/test_normalize_summaries.py | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 llm/normalize_summaries.py
 create mode 100644 llm/test_normalize_summaries.py

diff --git a/llm/.gitignore b/llm/.gitignore
index a93be3ee3..c280c8d59 100644
--- a/llm/.gitignore
+++ b/llm/.gitignore
@@ -2,4 +2,8 @@ venv/
 __pycache__/
 databases/
 .secret.local
-summaries-and-topics.csv
\ No newline at end of file
+fix_csv_summaries.py
+run_normalize_summaries.py
+summaries-and-topics.csv
+summaries-and-topics-fixed.csv
+summaries-and-topics-normalized.csv
diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index f621637a4..e68516772 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -21,6 +21,7 @@ def make_bill_summary(bill_id, status, summary, topics):
 
 bills_ref = db.collection("generalCourts/194/bills")
 bills = bills_ref.get()
+# TODO: I need to use csv.writer here to properly escape quotes
 with open("./summaries-and-topics.csv", "w") as f:
     f.write("bill_id,status,summary,topics\n")
     for bill in bills:
diff --git a/llm/normalize_summaries.py b/llm/normalize_summaries.py
new file mode 100644
index 000000000..15ae4a85f
--- /dev/null
+++ b/llm/normalize_summaries.py
@@ -0,0 +1,9 @@
+import re
+
+
+def normalize_summary(summary: str) -> str:
+    strip_summary = re.sub(r"^Summary:", "", summary)
+    lines = strip_summary.splitlines()
+    handle_list_items = [re.sub(r"^- ", "", x) for x in lines]
+    handle_remaining_whitespace = [x.strip() for x in handle_list_items if x.strip() != ""]
+    return " ".join(handle_remaining_whitespace)
diff --git a/llm/test_normalize_summaries.py b/llm/test_normalize_summaries.py
new file mode 100644
index 000000000..4a903df72
--- /dev/null
+++ b/llm/test_normalize_summaries.py
@@ -0,0 +1,32 @@
+import normalize_summaries
+
+
+def test_normalize_summary_one():
+    summary = """Summary:  
+- The bill allows Joe, the chief of police in Gravity, to continue working.    
+- The city can require annual health examinations   
+    """
+    assert normalize_summaries.normalize_summary(
+        summary
+    ) == "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations"
+
+
+def test_normalize_summary_two():
+    summary = """Summary:  
+The bill allows Joe, the chief of police in Gravity, to continue working.    
+    """
+    assert normalize_summaries.normalize_summary(
+        summary
+    ) == "The bill allows Joe, the chief of police in Gravity, to continue working."
+
+def test_normalize_summary_three():
+    summary = "Summary: The bill allows Joe, the chief of police in Gravity, to continue working."
+    assert normalize_summaries.normalize_summary(
+        summary
+    ) == "The bill allows Joe, the chief of police in Gravity, to continue working."
+
+def test_normalize_summary_four():
+    summary = "The bill allows Joe, the chief of police in Gravity, to continue working."
+    assert normalize_summaries.normalize_summary(
+        summary
+    ) == "The bill allows Joe, the chief of police in Gravity, to continue working."

From 7127b23d078ce9922b3d10e06d6b1a0961425c13 Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 21 Oct 2025 20:13:41 -0400
Subject: [PATCH 08/13] Update with new CSV writer

---
 llm/backfill_summaries.py | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index e68516772..cbde4a98d 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -7,6 +7,8 @@
 from llm_functions import get_summary_api_function, get_tags_api_function_v2
 from firebase_admin import firestore
 from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC
+import csv
+from normalize_summaries import normalize_summary
 
 # Application Default credentials are automatically created.
 app = firebase_admin.initialize_app()
@@ -16,14 +18,14 @@
 # Conceptually, we want to return a very consistent format when generated status reports.
 # It would allow us to skip LLM regeneration when moving from dev to production.
 def make_bill_summary(bill_id, status, summary, topics):
-    return f"{bill_id},{status},{summary},{topics}\n"
+    return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"]
 
 
 bills_ref = db.collection("generalCourts/194/bills")
 bills = bills_ref.get()
-# TODO: I need to use csv.writer here to properly escape quotes
-with open("./summaries-and-topics.csv", "w") as f:
-    f.write("bill_id,status,summary,topics\n")
+with open("./summaries-and-topics.csv", "w") as csvfile:
+    csv_writer = csv.writer(csvfile)
+    csv_writer.writerow(["bill_id", "status", "summary", "topics"])
     for bill in bills:
         document = bill.to_dict()
         bill_id = document["id"]
@@ -33,35 +35,42 @@ def make_bill_summary(bill_id, status, summary, topics):
 
         # No document text, skip it because we can't summarize it
         if document_text is None:
-            f.write(make_bill_summary(bill_id, "skipped", None, None))
+            csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None))
             continue
 
         # If the summary is already populated move on
         if summary is not None:
-            f.write(make_bill_summary(bill_id, "previous_summary", None, None))
+            csv_writer.write(make_bill_summary(bill_id, "previous_summary", None, None))
             continue
 
         summary = get_summary_api_function(bill_id, document_title, document_text)
         if summary["status"] in [-1, -2]:
-            f.write(make_bill_summary(bill_id, "failed_summary", None, None))
+            csv_writer.write(make_bill_summary(bill_id, "failed_summary", None, None))
             continue
-        summary = summary["summary"]
+        # Note: `normalize_summary` does some post-processing to clean up the summaries
+        # As of 2025-10-21 this was necessary due to the LLM prompt
+        summary = normalize_summary(summary["summary"])
         bill.reference.update({"summary": summary})
-        f.write(make_bill_summary(bill_id, "generated_summary", summary, None))
 
-        # If the summary is already populated move on
+        # If the topics are already populated, just make a note of it
         topics = document.get("topics")
         if topics is not None:
-            f.write(make_bill_summary(bill_id, "previous_topics", None, None))
-            continue
+            csv_writer.write(make_bill_summary(bill_id, "previous_topics", None, None))
+
         tags = get_tags_api_function_v2(bill_id, document_title, summary)
+        # If the tags fail, make a note and at least write the summary for debugging
         if tags["status"] != 1:
-            f.write(make_bill_summary(bill_id, "failed_topics", None, None))
+            csv_writer.write(make_bill_summary(bill_id, "failed_topics", None, None))
+            csv_writer.write(
+                make_bill_summary(bill_id, "generated_summary", summary, None)
+            )
             continue
         topics_and_categories = get_categories_from_topics(
             tags["tags"], CATEGORY_BY_TOPIC
         )
         bill.reference.update({"topics": topics_and_categories})
-        f.write(
-            make_bill_summary(bill_id, "generated_topics", None, topics_and_categories)
+        csv_writer.write(
+            make_bill_summary(
+                bill_id, "generated_topics", summary, topics_and_categories
+            )
         )

From bfb587e1a929166c0d2c713f6a8cedc32af4f978 Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 21 Oct 2025 20:20:55 -0400
Subject: [PATCH 09/13] Minor writerow updates

---
 llm/backfill_summaries.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index cbde4a98d..0d8cffe51 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -40,12 +40,16 @@ def make_bill_summary(bill_id, status, summary, topics):
 
         # If the summary is already populated move on
         if summary is not None:
-            csv_writer.write(make_bill_summary(bill_id, "previous_summary", None, None))
+            csv_writer.writerow(
+                make_bill_summary(bill_id, "previous_summary", None, None)
+            )
             continue
 
         summary = get_summary_api_function(bill_id, document_title, document_text)
         if summary["status"] in [-1, -2]:
-            csv_writer.write(make_bill_summary(bill_id, "failed_summary", None, None))
+            csv_writer.writerow(
+                make_bill_summary(bill_id, "failed_summary", None, None)
+            )
             continue
         # Note: `normalize_summary` does some post-processing to clean up the summaries
         # As of 2025-10-21 this was necessary due to the LLM prompt
@@ -55,13 +59,15 @@ def make_bill_summary(bill_id, status, summary, topics):
         # If the topics are already populated, just make a note of it
         topics = document.get("topics")
         if topics is not None:
-            csv_writer.write(make_bill_summary(bill_id, "previous_topics", None, None))
+            csv_writer.writerow(
+                make_bill_summary(bill_id, "previous_topics", None, None)
+            )
 
         tags = get_tags_api_function_v2(bill_id, document_title, summary)
         # If the tags fail, make a note and at least write the summary for debugging
         if tags["status"] != 1:
-            csv_writer.write(make_bill_summary(bill_id, "failed_topics", None, None))
-            csv_writer.write(
+            csv_writer.writerow(make_bill_summary(bill_id, "failed_topics", None, None))
+            csv_writer.writerow(
                 make_bill_summary(bill_id, "generated_summary", summary, None)
             )
             continue
@@ -69,7 +75,7 @@ def make_bill_summary(bill_id, status, summary, topics):
             tags["tags"], CATEGORY_BY_TOPIC
         )
         bill.reference.update({"topics": topics_and_categories})
-        csv_writer.write(
+        csv_writer.writerow(
             make_bill_summary(
                 bill_id, "generated_topics", summary, topics_and_categories
             )

From 2652ed938818def474a40fbb576e83bc8a0f333f Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 21 Oct 2025 20:23:27 -0400
Subject: [PATCH 10/13] Minor clean-up

---
 llm/.gitignore            | 4 ----
 llm/backfill_summaries.py | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/llm/.gitignore b/llm/.gitignore
index c280c8d59..dcd265e35 100644
--- a/llm/.gitignore
+++ b/llm/.gitignore
@@ -2,8 +2,4 @@ venv/
 __pycache__/
 databases/
 .secret.local
-fix_csv_summaries.py
-run_normalize_summaries.py
 summaries-and-topics.csv
-summaries-and-topics-fixed.csv
-summaries-and-topics-normalized.csv
diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index 0d8cffe51..75367471f 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -77,6 +77,6 @@ def make_bill_summary(bill_id, status, summary, topics):
         bill.reference.update({"topics": topics_and_categories})
         csv_writer.writerow(
             make_bill_summary(
-                bill_id, "generated_topics", summary, topics_and_categories
+                bill_id, "generated_summary_and_topics", summary, topics_and_categories
             )
         )

From be10fb56433660d5b9065723bc870a950507275c Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 21 Oct 2025 20:44:50 -0400
Subject: [PATCH 11/13] Name the tests

---
 llm/test_normalize_summaries.py | 44 +++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/llm/test_normalize_summaries.py b/llm/test_normalize_summaries.py
index 4a903df72..6e4d2c07b 100644
--- a/llm/test_normalize_summaries.py
+++ b/llm/test_normalize_summaries.py
@@ -1,32 +1,40 @@
 import normalize_summaries
 
 
-def test_normalize_summary_one():
+def test_normalize_summary_handles_summary_prefix_and_bullets():
     summary = """Summary:  
 - The bill allows Joe, the chief of police in Gravity, to continue working.    
 - The city can require annual health examinations   
     """
-    assert normalize_summaries.normalize_summary(
-        summary
-    ) == "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations"
+    assert (
+        normalize_summaries.normalize_summary(summary)
+        == "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations"
+    )
 
 
-def test_normalize_summary_two():
+def test_normalize_summary_handles_summary_prefix_and_no_bullets():
     summary = """Summary:  
 The bill allows Joe, the chief of police in Gravity, to continue working.    
     """
-    assert normalize_summaries.normalize_summary(
-        summary
-    ) == "The bill allows Joe, the chief of police in Gravity, to continue working."
+    assert (
+        normalize_summaries.normalize_summary(summary)
+        == "The bill allows Joe, the chief of police in Gravity, to continue working."
+    )
 
-def test_normalize_summary_three():
+
+def test_normalize_summary_handles_summary_prefix_with_no_linebreak():
     summary = "Summary: The bill allows Joe, the chief of police in Gravity, to continue working."
-    assert normalize_summaries.normalize_summary(
-        summary
-    ) == "The bill allows Joe, the chief of police in Gravity, to continue working."
-
-def test_normalize_summary_four():
-    summary = "The bill allows Joe, the chief of police in Gravity, to continue working."
-    assert normalize_summaries.normalize_summary(
-        summary
-    ) == "The bill allows Joe, the chief of police in Gravity, to continue working."
+    assert (
+        normalize_summaries.normalize_summary(summary)
+        == "The bill allows Joe, the chief of police in Gravity, to continue working."
+    )
+
+
+def test_normalize_summary_handles_bare_summary():
+    summary = (
+        "The bill allows Joe, the chief of police in Gravity, to continue working."
+    )
+    assert (
+        normalize_summaries.normalize_summary(summary)
+        == "The bill allows Joe, the chief of police in Gravity, to continue working."
+    )

From 00c76b1f7b01a027efa5bf23cb1350b15a2ed45e Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 28 Oct 2025 21:25:00 -0400
Subject: [PATCH 12/13] Address feedback

---
 llm/backfill_summaries.py | 42 +++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/llm/backfill_summaries.py b/llm/backfill_summaries.py
index 75367471f..c5620b8e1 100644
--- a/llm/backfill_summaries.py
+++ b/llm/backfill_summaries.py
@@ -1,8 +1,21 @@
-# This script fills any missing 'summary' or 'topics' fields on the data model.
-# The document must have a 'Title' and 'DocumentText' field to generate them.
-#
-# Developer notes:
-# - you'll need to set the 'OPENAI_API_KEY' environment variable
+"""This script fills any missing 'summary' or 'topics' fields on the data model.
+
+The document must have a 'Title' and 'DocumentText' field to generate them. The
+script queries only the general court 194 bills, modifies the firebase database
+in-place, and generates a CSV with a description of what happened. The header for
+the CSV is `bill_id,status,summary,topics`. The possible statuses are,
+
+- `skipped` - the bill is missing a title or text, skip it
+- `previous_summary` - the bill previously had a summary, skip it
+- `failed_summary` - something went wrong when trying to summarize, skip it
+- `previous_topics` - the bill already had topics; noted, but topics are regenerated
+- `failed_topics` - topic generation failed; a `generated_summary` row is also written
+- `generated_summary_and_topics` - both the summary and topics were generated successfully
+
+Developer notes:
+- you'll need to set the 'OPENAI_API_KEY' environment variable
+"""
+
 import firebase_admin
 from llm_functions import get_summary_api_function, get_tags_api_function_v2
 from firebase_admin import firestore
@@ -10,20 +23,27 @@
 import csv
 from normalize_summaries import normalize_summary
 
+# Module constants
+FIREBASE_COLLECTION_PATH = "generalCourts/194/bills"
+CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv"
+
 # Application Default credentials are automatically created.
 app = firebase_admin.initialize_app()
 db = firestore.client()
 
 
-# Conceptually, we want to return a very consistent format when generated status reports.
-# It would allow us to skip LLM regeneration when moving from dev to production.
 def make_bill_summary(bill_id, status, summary, topics):
+    """Generate a row for csv.writerow
+
+    Taking every column as a required argument ensures none of them is forgotten in
+    subsequent csv.writerow calls.
+    """
     return [f"{bill_id}", f"{status}", f"{summary}", f"{topics}"]
 
 
-bills_ref = db.collection("generalCourts/194/bills")
+bills_ref = db.collection(FIREBASE_COLLECTION_PATH)
 bills = bills_ref.get()
-with open("./summaries-and-topics.csv", "w") as csvfile:
+with open(CSV_SUMMARY_OUTPUT, "w") as csvfile:
     csv_writer = csv.writer(csvfile)
     csv_writer.writerow(["bill_id", "status", "summary", "topics"])
     for bill in bills:
@@ -33,8 +53,8 @@ def make_bill_summary(bill_id, status, summary, topics):
         document_title = document.get("content", {}).get("Title")
         summary = document.get("summary")
 
-        # No document text, skip it because we can't summarize it
-        if document_text is None:
+        # No document text or title, skip it because we can't summarize it
+        if document_text is None or document_title is None:
             csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None))
             continue
 

From 100df2ba2d47ea9dee0e76e3fd52cacaf5fa614e Mon Sep 17 00:00:00 2001
From: Barry Moore <chiroptical@gmail.com>
Date: Tue, 28 Oct 2025 21:29:43 -0400
Subject: [PATCH 13/13] Address feedback

---
 llm/normalize_summaries.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/llm/normalize_summaries.py b/llm/normalize_summaries.py
index 15ae4a85f..814b17692 100644
--- a/llm/normalize_summaries.py
+++ b/llm/normalize_summaries.py
@@ -1,3 +1,16 @@
+"""Normalize summary outputs from the LLM
+
+The summary prompt has some formatting prose that we don't want to persist into
+the database. For example, it prefixes every summary with `Summary:`. We apply a
+few preprocessing steps to every summary to keep things uniform. The steps,
+
+1. Remove leading `Summary:` from the input text
+2. Split the input text into lines (unordered lists arrive one item per line)
+3. Remove leading `- ` from each line that is a list item
+4. Strip surrounding whitespace from each line and drop empty lines
+5. Put everything back together separated with spaces
+"""
+
 import re
 
 
@@ -5,5 +18,7 @@ def normalize_summary(summary: str) -> str:
     strip_summary = re.sub(r"^Summary:", "", summary)
     lines = strip_summary.splitlines()
     handle_list_items = [re.sub(r"^- ", "", x) for x in lines]
-    handle_remaining_whitespace = [x.strip() for x in handle_list_items if x.strip() != ""]
+    handle_remaining_whitespace = [
+        x.strip() for x in handle_list_items if x.strip() != ""
+    ]
     return " ".join(handle_remaining_whitespace)