-
-
Notifications
You must be signed in to change notification settings - Fork 145
Add summary backfill #1948
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add summary backfill #1948
Changes from 11 commits
312dbbb
897e30f
d0331dd
a9c6457
d27c3fd
e7b721a
afe6ca1
7127b23
bfb587e
2652ed9
be10fb5
00c76b1
100df2b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| venv/ | ||
| __pycache__/ | ||
| databases/ | ||
| .secret.local | ||
| .secret.local | ||
| summaries-and-topics.csv | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| # This script fills any missing 'summary' or 'topics' fields on the data model. | ||
|
||
| # The document must have a 'Title' and 'DocumentText' field to generate them. | ||
| # | ||
| # Developer notes: | ||
| # - you'll need to set the 'OPENAI_API_KEY' environment variable | ||
| import firebase_admin | ||
| from llm_functions import get_summary_api_function, get_tags_api_function_v2 | ||
| from firebase_admin import firestore | ||
| from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC | ||
| import csv | ||
| from normalize_summaries import normalize_summary | ||
|
|
||
| # Application Default credentials are automatically created. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we have docs on how to connect to the MAPLE prod firebase, assuming that's what you are doing? If so, can we link that here?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question, as far as I know yes. In
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great! Can we link here?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By |
||
# Initialize the default Firebase app; no explicit credentials or options are
# passed here.
app = firebase_admin.initialize_app()
# Firestore client that the rest of the script queries through.
db = firestore.client()
|
|
||
|
|
||
| # Conceptually, we want to return a very consistent format when generating status reports. | ||
| # It would allow us to skip LLM regeneration when moving from dev to production. | ||
def make_bill_summary(bill_id, status, summary, topics):
    """Build one report row describing what happened to a bill.

    Each of the four values is rendered with its default string formatting,
    so the row is always a list of four strings in the order
    ``bill_id, status, summary, topics`` (``None`` is rendered as ``"None"``).
    """
    fields = (bill_id, status, summary, topics)
    # format(value, "") is what an f-string placeholder with no spec performs.
    return [format(field, "") for field in fields]
|
|
||
|
|
||
# Backfill driver: walk every bill in the 194th General Court collection,
# generate a summary (and then topics) for bills that lack a summary, write
# the results back to Firestore, and log one or more status rows per bill to
# a local CSV report.
bills_ref = db.collection("generalCourts/194/bills")

bills = bills_ref.get()
# newline="" is what the csv module asks for when opening the output file
# (it prevents newline translation from corrupting row endings and embedded
# line breaks); UTF-8 is pinned so non-ASCII bill text is written the same
# way regardless of the platform's default encoding.
with open(
    "./summaries-and-topics.csv", "w", newline="", encoding="utf-8"
) as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["bill_id", "status", "summary", "topics"])
    for bill in bills:
        document = bill.to_dict()
        bill_id = document["id"]
        document_text = document.get("content", {}).get("DocumentText")
        document_title = document.get("content", {}).get("Title")
        summary = document.get("summary")

        # No document text, skip it because we can't summarize it
        if document_text is None:
            csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None))
            continue

        # If the summary is already populated move on
        if summary is not None:
            csv_writer.writerow(
                make_bill_summary(bill_id, "previous_summary", None, None)
            )
            continue

        summary = get_summary_api_function(bill_id, document_title, document_text)
        # Status codes -1 and -2 are treated as summary-generation failures.
        if summary["status"] in [-1, -2]:
            csv_writer.writerow(
                make_bill_summary(bill_id, "failed_summary", None, None)
            )
            continue
        # Note: `normalize_summary` does some post-processing to clean up the summaries
        # As of 2025-10-21 this was necessary due to the LLM prompt
        # NOTE(review): the PR discussion agreed that this normalization also
        # needs to be applied in the production code path (when run as a
        # lambda); that was left for a follow-up issue/PR.
        summary = normalize_summary(summary["summary"])
        bill.reference.update({"summary": summary})

        # If the topics are already populated, just make a note of it
        # NOTE(review): there is no `continue` here, so existing topics are
        # regenerated and overwritten below - confirm this is intended.
        topics = document.get("topics")
        if topics is not None:
            csv_writer.writerow(
                make_bill_summary(bill_id, "previous_topics", None, None)
            )

        tags = get_tags_api_function_v2(bill_id, document_title, summary)
        # If the tags fail, make a note and at least write the summary for debugging
        if tags["status"] != 1:
            csv_writer.writerow(make_bill_summary(bill_id, "failed_topics", None, None))
            csv_writer.writerow(
                make_bill_summary(bill_id, "generated_summary", summary, None)
            )
            continue
        topics_and_categories = get_categories_from_topics(
            tags["tags"], CATEGORY_BY_TOPIC
        )
        bill.reference.update({"topics": topics_and_categories})
        csv_writer.writerow(
            make_bill_summary(
                bill_id, "generated_summary_and_topics", summary, topics_and_categories
            )
        )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| import re | ||
nesanders marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
def normalize_summary(summary: str) -> str:
    """Flatten an LLM-generated summary into a single line of plain text.

    The clean-up removes a leading ``Summary:`` label, removes a leading
    ``- `` bullet marker from each line, drops blank lines, and joins the
    remaining lines with single spaces.

    Leading whitespace is ignored when looking for the label and the bullet
    markers, so an indented ``Summary:`` header or indented list items are
    normalized the same way as unindented ones.

    Args:
        summary: Raw summary text, possibly multi-line.

    Returns:
        The summary as one space-joined line; an empty string if nothing
        but whitespace remains.
    """
    # Left-strip first: the pattern is anchored at the very start of the
    # text, so any leading blank line or indentation would otherwise hide
    # the label.
    without_label = re.sub(r"^Summary:", "", summary.lstrip())

    cleaned_lines = []
    for line in without_label.splitlines():
        # Left-strip before matching so indented bullets are recognized too.
        text = re.sub(r"^- ", "", line.lstrip()).strip()
        if text:
            cleaned_lines.append(text)

    return " ".join(cleaned_lines)
nesanders marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| import normalize_summaries | ||
|
|
||
|
|
||
def test_normalize_summary_handles_summary_prefix_and_bullets():
    """A 'Summary:' header followed by bullet lines becomes one joined line."""
    raw = """Summary:
- The bill allows Joe, the chief of police in Gravity, to continue working.
- The city can require annual health examinations
"""
    expected = "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations"
    actual = normalize_summaries.normalize_summary(raw)
    assert actual == expected
|
|
||
|
|
||
def test_normalize_summary_handles_summary_prefix_and_no_bullets():
    """A 'Summary:' header followed by a plain line keeps only that line."""
    raw = """Summary:
The bill allows Joe, the chief of police in Gravity, to continue working.
"""
    expected = (
        "The bill allows Joe, the chief of police in Gravity, to continue working."
    )
    actual = normalize_summaries.normalize_summary(raw)
    assert actual == expected
|
|
||
|
|
||
def test_normalize_summary_handles_summary_prefix_with_no_linebreak():
    """A 'Summary:' label on the same line as the text is removed."""
    raw = "Summary: The bill allows Joe, the chief of police in Gravity, to continue working."
    expected = (
        "The bill allows Joe, the chief of police in Gravity, to continue working."
    )
    actual = normalize_summaries.normalize_summary(raw)
    assert actual == expected
|
|
||
|
|
||
def test_normalize_summary_handles_bare_summary():
    """Text with no label and no bullets passes through unchanged."""
    sentence = (
        "The bill allows Joe, the chief of police in Gravity, to continue working."
    )
    actual = normalize_summaries.normalize_summary(sentence)
    assert (
        actual
        == "The bill allows Joe, the chief of police in Gravity, to continue working."
    )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is generated by running
llm/backfill_summaries.py and I assume we don't want to accidentally commit that.