Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve ad detection with normalization #327

Merged
merged 4 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def remove_stopwords(plaintext: str, stopwords: list[str]) -> str:

if len(stopwords) == 0:
logging.warning("Stop words list empty")

return plaintext

if "groupe verlaine" in plaintext:
logging.info(f"special groupe verlaine case")
plaintext = replace_word_with_context(plaintext, word="groupe verlaine")
Expand All @@ -135,10 +136,19 @@ def remove_stopwords(plaintext: str, stopwords: list[str]) -> str:
plaintext = replace_word_with_context(plaintext, word="fleuron industrie", length_to_remove=150)

for word in stopwords:
logging.info(f"Test {word}")
plaintext = plaintext.replace(word, '')

return plaintext

def get_detected_keywords(plaitext_without_stopwords: str, keywords_dict):
    """Collect every entry of ``keywords_dict`` whose keyword occurs in the
    stop-word-free plaintext.

    Returns a list of ``{"keyword": ..., "category": ...}`` dicts, one per match.
    """
    return [
        {"keyword": entry["keyword"], "category": entry["category"]}
        for entry in keywords_dict
        if is_word_in_sentence(entry["keyword"], plaitext_without_stopwords)
    ]

@sentry_sdk.trace
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime, stop_words: List[str] = []):
keywords_with_timestamp = []
Expand All @@ -150,11 +160,8 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s

for theme, keywords_dict in THEME_KEYWORDS.items():
logging.debug(f"searching {theme} for {keywords_dict}")
matching_words = []
for keyword_dict in keywords_dict:
if is_word_in_sentence(keyword_dict["keyword"], plaitext_without_stopwords):
matching_words.append({"keyword": keyword_dict["keyword"], "category": keyword_dict["category"]})

matching_words = get_detected_keywords(plaitext_without_stopwords, keywords_dict)

if matching_words:
logging.debug(f"theme found : {theme} with word {matching_words}")

Expand Down
20 changes: 14 additions & 6 deletions quotaclimat/data_processing/mediatree/stop_word/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ def get_all_stop_word(session: Session, offset: int = 0, batch_size: int = 50000
func.timezone('UTC', Stop_Word.created_at).label('created_at'),
func.timezone('UTC', Stop_Word.updated_at).label('updated_at')
).select_from(Stop_Word) \
.order_by(Stop_Word.count.desc(), Stop_Word.created_at) \
.order_by(Stop_Word.count.desc(),
func.length(Stop_Word.context).desc(),
Stop_Word.created_at
) \
.limit(batch_size).offset(offset)

if validated_only:
Expand Down Expand Up @@ -130,7 +133,7 @@ def get_all_repetitive_context_advertising_for_a_keyword(
end_date_sql = get_date_sql_query(end_date) #"'2024-12-19 00:00:00.000 +01:00'"
sql_query = f"""
SELECT SUBSTRING("context_keyword",0,{total_length}) AS "context",
MAX("keyword_id") AS "keyword_id",
MAX("keyword_id") AS "keyword_id", -- enable to check directly on mediatree service
COUNT(*) AS "count"
FROM (
SELECT
Expand All @@ -139,10 +142,14 @@ def get_all_repetitive_context_advertising_for_a_keyword(
"public"."keywords"."start" AS "start_default_time",
"public"."keywords"."theme" AS "theme",
"public"."keywords"."id" AS "keyword_id",
SUBSTRING(
REPLACE("public"."keywords"."plaintext",'{MEDIATREE_TRANSCRIPTION_PROBLEM}',''), -- mediatree transcription pollution
GREATEST(POSITION('{escaped_keyword}' IN "public"."keywords"."plaintext") - {before_context}, 1), -- start position
LEAST({after_context}, LENGTH( "public"."keywords"."plaintext")) -- length of the context
TRIM(
REGEXP_REPLACE(
SUBSTRING(
REPLACE("public"."keywords"."plaintext",'{MEDIATREE_TRANSCRIPTION_PROBLEM}',''), -- mediatree transcription pollution
GREATEST(POSITION('{escaped_keyword}' IN REPLACE("public"."keywords"."plaintext",'{MEDIATREE_TRANSCRIPTION_PROBLEM}','')) - {before_context}, 1), -- start position
LEAST({after_context}, LENGTH( REPLACE("public"."keywords"."plaintext",'<unk> ',''))) -- length of the context
)
,'^\w{{1,2}}\s+|\s+\w{{1,2}}\s*$', '', 'g') -- removes 1-2 letter words at boundaries
) AS "context_keyword",
"public"."keywords"."keywords_with_timestamp" AS "keywords_with_timestamp",
"public"."keywords"."number_of_keywords" AS "number_of_keywords",
Expand Down Expand Up @@ -170,6 +177,7 @@ def get_all_repetitive_context_advertising_for_a_keyword(
result = [dict(row) for row in result.mappings()]
# add metadata to result for all rows
for row in result:
row["context"] = row["context"].strip()
row["id"] = get_consistent_hash(row["context"]) # to avoid adding duplicates
row["keyword"] = keyword
row["channel_title"] = channel_title
Expand Down
2 changes: 1 addition & 1 deletion test/sitemap/test_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_remove_stopwords_energie():
output = remove_stopwords(plaintext,STOP_WORDS)
# plantext does not contain photovoltaïque
assert "photovoltaïque" not in output
assert "rénovation énergétique" not in output
assert "rénovation énergetique" not in output
assert "chauffage" not in output

def test_remove_stopwords_fleuron():
Expand Down
8 changes: 4 additions & 4 deletions test/sitemap/test_update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,9 +815,9 @@ def test_update_only_keywords_that_includes_some_keywords():
"id": "test2",
"keyword": "climatique",
"channel_title": "TF1",
"context": "lacieux selon les experts question climatique en fait elle dépasse la question ",
"context": "lacieux selon les experts question climatique en fait elle dépasse la question",
"count": 19,
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question "),
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question"),
}
]
save_append_stop_word(conn, stop_word_to_save)
Expand Down Expand Up @@ -1020,9 +1020,9 @@ def test_update_nothing_because_no_keywords_are_included():
"id": "test2",
"keyword": "climatique",
"channel_title": "TF1",
"context": "lacieux selon les experts question climatique en fait elle dépasse la question ",
"context": "lacieux selon les experts question climatique en fait elle dépasse la question",
"count": 19,
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question "),
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question"),
}
]
save_append_stop_word(conn, stop_word_to_save)
Expand Down
34 changes: 17 additions & 17 deletions test/stop_word/test_stop_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ def test_stop_word_get_all_repetitive_context_advertising_for_a_keyword_default(
'keyword_id': 'f9761d34d1e9adfc44bab9ad220e216b1a9a6a0aca44c39c5fab5115fe098d79',
'start_date': start_date,
"channel_title": "France 2",
"context": " avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa",
'id': '4bd208a8e3b14f2ac46e272647729f05fb7588e427ce12d99bde6d5698415970',
"context": "avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pas",
'id': get_consistent_hash("avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pas"),
"count": 20 # min number of repetition
}
]
Expand All @@ -90,7 +90,7 @@ def test_stop_word_get_all_repetitive_context_advertising_for_a_keyword_utf8_min
'start_date': start_date,
'context': "agroécologie végétation dans l' antre des las vegas raiders c' est ici que se j",
'count': 1,
'id': '06130961a8c4556edfd80084d9cf413819b8ba2d91dc8f90cca888585fac8adc',
'id': get_consistent_hash("agroécologie végétation dans l' antre des las vegas raiders c' est ici que se j"),
'keyword': 'agroécologie',
},
{
Expand All @@ -116,8 +116,7 @@ def test_stop_word_get_all_repetitive_context_advertising_for_a_keyword_utf8_min
'channel_title': 'TF1',
'keyword_id': '1571457f2fb35ff37ca3cb9eaa9040606497baaf5e6ad5d6a42c69b12c596596',
'start_date': start_date,
'context': "climatique agroécologie moment-là parce que l' éblouissement au "
'balcon de bucki',
'context': "climatique agroécologie moment-là parce que l' éblouissement au balcon de bucki",
'count': 1,
'keyword': 'agroécologie',
"id" : get_consistent_hash("climatique agroécologie moment-là parce que l' éblouissement au balcon de bucki")
Expand Down Expand Up @@ -186,24 +185,25 @@ def test_stop_word_get_repetitive_context_advertising():
}
]
)

context1 = "avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pas"
context2 = "lacieux selon les experts question climatique en fait elle dépasse la question"
excepted = [
{
"keyword_id": "f9761d34d1e9adfc44bab9ad220e216b1a9a6a0aca44c39c5fab5115fe098d79",
"keyword": "replantation",
"channel_title": "France 2",
"context": " avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa",
"context": context1,
"count": 20,
"id" : get_consistent_hash(" avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa"),
"id" : get_consistent_hash(context1),
'start_date': start_date,
},
{
"keyword_id": "f9761d34d1e9adfc44bab9ad220e216b1a9a6a0aca44c39c5fab5115fe098d79",
"keyword": "climatique",
"channel_title": "TF1",
"context": "lacieux selon les experts question climatique en fait elle dépasse la question ",
"context": context2,
"count": 20,
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question "),
"id" : get_consistent_hash(context2),
'start_date': start_date,
}
]
Expand All @@ -221,18 +221,18 @@ def test_stop_word_save_append_stop_word():
"id": "test1",
"keyword": "replantation",
"channel_title": "France 2",
"context": " avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa",
"context": "avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pas",
"count": 20,
"id" : get_consistent_hash(" avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa"),
"id" : get_consistent_hash("avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pas"),
},
{
"keyword_id": "fake_id",
"id": "test2",
"keyword": "climatique",
"channel_title": "TF1",
"context": "lacieux selon les experts question climatique en fait elle dépasse la question ",
"context": "lacieux selon les experts question climatique en fait elle dépasse la question",
"count": 19,
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question "),
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question"),
},
{
"keyword_id": "fake_id",
Expand All @@ -256,8 +256,8 @@ def test_stop_word_save_append_stop_word():
assert stop_words[1].count == 19
assert stop_words[0].channel_title == "France 2"
assert stop_words[1].channel_title == "TF1"
assert stop_words[0].context == " avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa"
assert stop_words[1].context == "lacieux selon les experts question climatique en fait elle dépasse la question "
assert stop_words[0].context == "avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pas"
assert stop_words[1].context == "lacieux selon les experts question climatique en fait elle dépasse la question"
assert stop_words[2].context == "empty_keyword"
assert stop_words[2].keyword == None

Expand All @@ -272,7 +272,7 @@ def test_stop_word_main():

def test_stop_word_is_already_known_stop_word():

context1_avait= " avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa"
context1_avait= "avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa"
context2_avais= " avais promis de lancer un plan de replantation euh hélas pas pu tout s' est pa"
assert is_already_known_stop_word(context1_avait, context2_avais) == True

Expand Down