Text cleaning v2 #295

Draft · wants to merge 4 commits into main
6 changes: 3 additions & 3 deletions civiclens/collect/move_data_from_api_to_database.py
@@ -19,7 +19,7 @@
    DATABASE_USER,
    REG_GOV_API_KEY,
)
-from civiclens.utils.text import clean_text
+from civiclens.utils.text import parse_html


# Configure logging
@@ -525,7 +525,7 @@ def clean_document_data(document_data: json) -> None:
    Clean document data in place; run cleaning code on summary
    """
    if document_data["summary"] is not None:
-        document_data["summary"] = clean_text(document_data["summary"])
+        document_data["summary"] = parse_html(document_data["summary"])


def check_CFR_data(document_data: json) -> bool:
@@ -850,7 +850,7 @@ def clean_comment_data(comment_data: json) -> None:

    # clean the text
    if comment_text_attributes["comment"] is not None:
-        comment_text_attributes["comment"] = clean_text(
+        comment_text_attributes["comment"] = parse_html(
            comment_text_attributes["comment"]
        )

6 changes: 5 additions & 1 deletion civiclens/nlp/comments.py
@@ -7,6 +7,7 @@

from civiclens.nlp.tools import RepComments
from civiclens.utils.database_access import Database, pull_data
+from civiclens.utils.text import parse_html


def get_doc_comments(id: str) -> pl.DataFrame:
@@ -41,7 +42,10 @@ def get_doc_comments(id: str) -> pl.DataFrame:
    # TODO create clusters column in comment table and delete these lines
    rows = filtered_df.shape[0]
    filtered_df = filtered_df.with_columns(
-        pl.Series("cluster", [None] * rows).cast(pl.Utf8)
+        pl.Series("cluster", [None] * rows).cast(pl.Utf8),
+        pl.col("comment")
+        .apply(parse_html, return_dtype=pl.Utf8)
+        .alias("comment"),
    )
return filtered_df

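A quick reviewer's sketch of the new expression, outside the diff: pl.Expr.apply runs parse_html element-wise over the comment column. The toy frame below is illustrative only, and newer polars releases rename apply to map_elements.

import polars as pl

from civiclens.utils.text import parse_html

# toy stand-in for the frame pulled in get_doc_comments
df = pl.DataFrame({"comment": ["&quot;Hi&quot;", "This <br/>has <b>tags</b>"]})

cleaned = df.with_columns(
    pl.col("comment")
    .apply(parse_html, return_dtype=pl.Utf8)  # element-wise HTML cleanup
    .alias("comment")  # overwrite the original column
)
print(cleaned["comment"].to_list())  # ['"Hi"', 'This has tags']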
4 changes: 2 additions & 2 deletions civiclens/nlp/topics.py
@@ -10,7 +10,7 @@

from civiclens.nlp.models import label_model, label_tokenizer
from civiclens.nlp.tools import Comment, RepComments
-from civiclens.utils.text import clean_text, regex_tokenize
+from civiclens.utils.text import regex_tokenize


def stopwords(model_path: Path) -> set[str]:
@@ -51,7 +51,7 @@ def _process_text(
        docs = []
        document_ids = {}
        for idx, comment in enumerate(comments):
-            docs.append(self.tokenizer(clean_text(comment.text).lower()))
+            docs.append(self.tokenizer(comment.text.lower()))
            document_ids[idx] = comment.id

# remove numbers, 2 character tokens, and stop words
30 changes: 21 additions & 9 deletions civiclens/tests/test_utils.py
@@ -5,7 +5,7 @@
import pytest

from civiclens.utils.database_access import pull_data
-from civiclens.utils.text import clean_text
+from civiclens.utils.text import parse_html


BASE_DIR = Path(__file__).resolve().parent
@@ -47,13 +47,25 @@ def test_bad_query():
pull_data(conn, "SELECT data FROM not_a_table", return_type="list")


-def test_clean_string():
-    dirty = "<br/> Here's some text. ndash Also more text"
-    clean = "Here's some text. Also more text"
-    assert clean_text(dirty) == clean
+def test_encode_string():
+    # mojibake: the UTF-8 bytes of “AOs” mis-decoded as latin-1
+    dirty = "(â\x80\x9cAOsâ\x80\x9d)"
+    clean = "(“AOs”)"
+    assert parse_html(dirty) == clean


-def test_clean_user_regex():
-    dirty = "The cat is home"
-    clean = "The dog is home"
-    assert clean_text(dirty, patterns=[(r"cat", "dog")]) == clean
+def test_remove_html_entities():
+    dirty = "&quot;Family Sponsor Immigration Act of 2002,&quot;"
+    clean = '"Family Sponsor Immigration Act of 2002,"'
+    assert parse_html(dirty) == clean


+def test_remove_html_tags():
+    dirty = "This <br/>has some <b>tags<span>"
+    clean = "This has some tags"
+    assert parse_html(dirty) == clean


+def test_other_quote_tags():
+    dirty = "ldquothe black dogrdquo"
+    clean = "'the black dog'"
+    assert parse_html(dirty) == clean
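The new tests can be run in isolation with pytest, for example:

pytest civiclens/tests/test_utils.py -k "html or encode or quote"

(the -k expression is illustrative; any selector matching the new test names works).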
40 changes: 14 additions & 26 deletions civiclens/utils/text.py
@@ -1,5 +1,7 @@
+import html
import re
-from typing import Optional
+
+from django.utils.html import strip_tags


def regex_tokenize(text: str, pattern: str = r"\W+"):
@@ -16,37 +18,23 @@ def regex_tokenize(text: str, pattern: str = r"\W+"):
return re.split(pattern, text)


-def clean_text(text: str, patterns: Optional[list[tuple]] = None) -> str:
-    r"""
-    String cleaning function for comments.
+def parse_html(text: str) -> str:
+    """
+    Re-encodes Regulations.gov text as UTF-8; removes HTML entities and tags.

-    Args:
-        text (str): comment text
-        patterns (list[str]): optional list of regular expression patterns
-            to pass in (eg. [(r'\w+', "-")])
+    Args:
+        text (str): string to be cleaned

     Returns:
-        Cleaned version of text
+        Text cleaned of HTML entities and tags
     """
-    if patterns is None:
-        patterns = []
-
-    text = re.sub(r"&#39;", "'", text)  # this replaces html entity with '
-    text = re.sub(r"&rdquo;", '"', text)  # this replaces html entity with "
-    text = re.sub(r"&amp;", "&", text)  # this replaces html entity with &
-    text = re.sub(r"â", "", text)
-    text = re.sub(r"<br\s*/?>", "", text)
-
-    text = re.sub(r"<\s*br\s*/>", " ", text)
-    text = re.sub(r"[^a-zA-Z0-9.'\"\?\: -]", "", text)
-    text = re.sub(r"\w*ndash\w*", "", text)
+    # repair mojibake: reinterpret latin-1-decoded bytes as UTF-8
+    utf_text = text.encode("latin1").decode("utf-8")
+    # unescape HTML entities, then strip any remaining tags
+    clean_text = strip_tags(html.unescape(utf_text))

-    if patterns:
-        for pattern, replacement in patterns:
-            text = re.sub(pattern, replacement, text)
+    # replace stray quote entities (ldquo/rdquo/lsquo/rsquo) not parsed above
+    clean_text = re.sub(r"(rs|ls|rd|ld)?quo", "'", clean_text)

-    # remove extra whitespace
-    return re.sub(r"\s+", " ", text).strip()
+    return clean_text


def truncate(text: str, num_words: int) -> str:
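One caveat worth noting on the latin-1 round-trip: it assumes the input is ASCII or mojibake. A string that already contains characters outside latin-1 (properly decoded curly quotes, for instance) makes text.encode("latin1") raise UnicodeEncodeError, and genuine latin-1 bytes that are not valid UTF-8 make the decode raise. A defensive variant (an assumption for discussion, not part of this PR) could fall back to the original text:

import html
import re

from django.utils.html import strip_tags


def safe_parse_html(text: str) -> str:
    # hypothetical variant: only apply the mojibake repair when the
    # round-trip succeeds; otherwise keep the text as-is
    try:
        text = text.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass
    clean = strip_tags(html.unescape(text))
    return re.sub(r"(rs|ls|rd|ld)?quo", "'", clean)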