Text cleaning v2 #295

Draft · wants to merge 4 commits into main
6 changes: 3 additions & 3 deletions civiclens/collect/move_data_from_api_to_database.py
@@ -19,7 +19,7 @@
    DATABASE_USER,
    REG_GOV_API_KEY,
)
-from civiclens.utils.text import clean_text
+from civiclens.utils.text import parse_html


# Configure logging
@@ -525,7 +525,7 @@ def clean_document_data(document_data: json) -> None:
    Clean document data in place; run cleaning code on summary
    """
    if document_data["summary"] is not None:
-        document_data["summary"] = clean_text(document_data["summary"])
+        document_data["summary"] = parse_html(document_data["summary"])


def check_CFR_data(document_data: json) -> bool:
@@ -850,7 +850,7 @@ def clean_comment_data(comment_data: json) -> None:

    # clean the text
    if comment_text_attributes["comment"] is not None:
-        comment_text_attributes["comment"] = clean_text(
+        comment_text_attributes["comment"] = parse_html(
            comment_text_attributes["comment"]
        )

6 changes: 5 additions & 1 deletion civiclens/nlp/comments.py
@@ -7,6 +7,7 @@

from civiclens.nlp.tools import RepComments
from civiclens.utils.database_access import Database, pull_data
+from civiclens.utils.text import parse_html


def get_doc_comments(id: str) -> pl.DataFrame:
@@ -41,7 +42,10 @@ def get_doc_comments(id: str) -> pl.DataFrame:
    # TODO create clusters column in comment table and delete these lines
    rows = filtered_df.shape[0]
    filtered_df = filtered_df.with_columns(
-        pl.Series("cluster", [None] * rows).cast(pl.Utf8)
+        pl.Series("cluster", [None] * rows).cast(pl.Utf8),
+        pl.col("comment")
+        .apply(parse_html, return_dtype=pl.Utf8)
+        .alias("comment"),
    )
return filtered_df

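A quick reviewer's sketch of the new expression, outside the diff: pl.Expr.apply runs parse_html element-wise over the comment column. The toy frame below is illustrative only, and newer polars releases rename apply to map_elements.

import polars as pl

from civiclens.utils.text import parse_html

# toy stand-in for the frame pulled in get_doc_comments
df = pl.DataFrame({"comment": ["&quot;Hi&quot;", "This <br/>has <b>tags</b>"]})

cleaned = df.with_columns(
    pl.col("comment")
    .apply(parse_html, return_dtype=pl.Utf8)  # element-wise HTML cleanup
    .alias("comment")  # overwrite the original column
)
print(cleaned["comment"].to_list())  # ['"Hi"', 'This has tags']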
4 changes: 2 additions & 2 deletions civiclens/nlp/topics.py
@@ -10,7 +10,7 @@

from civiclens.nlp.models import label_model, label_tokenizer
from civiclens.nlp.tools import Comment, RepComments
-from civiclens.utils.text import clean_text, regex_tokenize
+from civiclens.utils.text import regex_tokenize


def stopwords(model_path: Path) -> set[str]:
@@ -51,7 +51,7 @@ def _process_text(
        docs = []
        document_ids = {}
        for idx, comment in enumerate(comments):
-            docs.append(self.tokenizer(clean_text(comment.text).lower()))
+            docs.append(self.tokenizer(comment.text.lower()))
            document_ids[idx] = comment.id

# remove numbers, 2 character tokens, and stop words
30 changes: 21 additions & 9 deletions civiclens/tests/test_utils.py
@@ -5,7 +5,7 @@
import pytest

from civiclens.utils.database_access import pull_data
-from civiclens.utils.text import clean_text
+from civiclens.utils.text import parse_html


BASE_DIR = Path(__file__).resolve().parent
@@ -47,13 +47,25 @@ def test_bad_query():
pull_data(conn, "SELECT data FROM not_a_table", return_type="list")


-def test_clean_string():
-    dirty = "<br/> Here's some text. ndash Also more text"
-    clean = "Here's some text. Also more text"
-    assert clean_text(dirty) == clean
+def test_encode_string():
+    # mojibake: the UTF-8 bytes of “AOs” mis-decoded as latin-1
+    dirty = "(â\x80\x9cAOsâ\x80\x9d)"
+    clean = "(“AOs”)"
+    assert parse_html(dirty) == clean


-def test_clean_user_regex():
-    dirty = "The cat is home"
-    clean = "The dog is home"
-    assert clean_text(dirty, patterns=[(r"cat", "dog")]) == clean
+def test_remove_html_entities():
+    dirty = "&quot;Family Sponsor Immigration Act of 2002,&quot;"
+    clean = '"Family Sponsor Immigration Act of 2002,"'
+    assert parse_html(dirty) == clean


+def test_remove_html_tags():
+    dirty = "This <br/>has some <b>tags<span>"
+    clean = "This has some tags"
+    assert parse_html(dirty) == clean


+def test_other_quote_tags():
+    dirty = "ldquothe black dogrdquo"
+    clean = "'the black dog'"
+    assert parse_html(dirty) == clean
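The new tests can be run in isolation with pytest, for example:

pytest civiclens/tests/test_utils.py -k "html or encode or quote"

(the -k expression is illustrative; any selector matching the new test names works).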
40 changes: 14 additions & 26 deletions civiclens/utils/text.py
@@ -1,5 +1,7 @@
+import html
import re
-from typing import Optional
+
+from django.utils.html import strip_tags


def regex_tokenize(text: str, pattern: str = r"\W+"):
@@ -16,37 +18,23 @@ def regex_tokenize(text: str, pattern: str = r"\W+"):
return re.split(pattern, text)


-def clean_text(text: str, patterns: Optional[list[tuple]] = None) -> str:
-    r"""
-    String cleaning function for comments.
+def parse_html(text: str) -> str:
+    """
+    Re-encodes Regulations.gov text as UTF-8; removes HTML entities and tags.

-    Args:
-        text (str): comment text
-        patterns (list[str]): optional list of regular expression patterns
-            to pass in (eg. [(r'\w+', "-")])
+    Args:
+        text (str): string to be cleaned

     Returns:
-        Cleaned version of text
+        Text cleaned of HTML entities and tags
     """
-    if patterns is None:
-        patterns = []
-
-    text = re.sub(r"&#39;", "'", text)  # this replaces html entity with '
-    text = re.sub(r"&rdquo;", '"', text)  # this replaces html entity with "
-    text = re.sub(r"&amp;", "&", text)  # this replaces html entity with &
-    text = re.sub(r"â", "", text)
-    text = re.sub(r"<br\s*/?>", "", text)
-
-    text = re.sub(r"<\s*br\s*/>", " ", text)
-    text = re.sub(r"[^a-zA-Z0-9.'\"\?\: -]", "", text)
-    text = re.sub(r"\w*ndash\w*", "", text)
+    # repair mojibake: reinterpret latin-1-decoded bytes as UTF-8
+    utf_text = text.encode("latin1").decode("utf-8")
+    # unescape HTML entities, then strip any remaining tags
+    clean_text = strip_tags(html.unescape(utf_text))

-    if patterns:
-        for pattern, replacement in patterns:
-            text = re.sub(pattern, replacement, text)
+    # replace stray quote entities (ldquo/rdquo/lsquo/rsquo) not parsed above
+    clean_text = re.sub(r"(rs|ls|rd|ld)?quo", "'", clean_text)

-    # remove extra whitespace
-    return re.sub(r"\s+", " ", text).strip()
+    return clean_text


def truncate(text: str, num_words: int) -> str:
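One caveat worth noting on the latin-1 round-trip: it assumes the input is ASCII or mojibake. A string that already contains characters outside latin-1 (properly decoded curly quotes, for instance) makes text.encode("latin1") raise UnicodeEncodeError, and genuine latin-1 bytes that are not valid UTF-8 make the decode raise. A defensive variant (an assumption for discussion, not part of this PR) could fall back to the original text:

import html
import re

from django.utils.html import strip_tags


def safe_parse_html(text: str) -> str:
    # hypothetical variant: only apply the mojibake repair when the
    # round-trip succeeds; otherwise keep the text as-is
    try:
        text = text.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass
    clean = strip_tags(html.unescape(text))
    return re.sub(r"(rs|ls|rd|ld)?quo", "'", clean)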