Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -329,10 +329,10 @@ def _load_categories(self, categories: List[ContentFilterCategoryConfig]) -> Non
action if action else category_config_obj.default_action
)

# Handle conditional categories (with identifier_words + inherit_from)
if (
category_config_obj.identifier_words
and category_config_obj.inherit_from
# Handle conditional categories (with identifier_words + inherit_from OR identifier_words + additional_block_words)
if category_config_obj.identifier_words and (
category_config_obj.inherit_from
or category_config_obj.additional_block_words
):
self._load_conditional_category(
category_name,
Expand Down Expand Up @@ -387,74 +387,99 @@ def _load_conditional_category(
categories_dir: str,
) -> None:
"""
Load a conditional category that uses identifier_words + inherited block_words.
Load a conditional category that uses identifier_words + block_words.

Supports two patterns:
1. Inherit + additional: identifier_words + inherit_from + optional additional_block_words
2. Standalone: identifier_words + additional_block_words (no inheritance)

Args:
category_name: Name of the category
category_config_obj: CategoryConfig object with identifier_words and inherit_from
category_config_obj: CategoryConfig object with identifier_words and either inherit_from or additional_block_words
category_action: Action to take when match is found
severity_threshold: Minimum severity threshold
categories_dir: Directory containing category files
"""
# Load the inherited category to get block words
block_words = []
inherit_from = category_config_obj.inherit_from
if not inherit_from:
return

# Remove .json or .yaml extension if included
inherit_base = inherit_from.replace(".json", "").replace(".yaml", "")
# Pattern 1: Load inherited category to get base block words
if inherit_from:
# Remove .json or .yaml extension if included
inherit_base = inherit_from.replace(".json", "").replace(".yaml", "")

# Find the inherited category file
inherit_yaml_path = os.path.join(categories_dir, f"{inherit_base}.yaml")
inherit_json_path = os.path.join(categories_dir, f"{inherit_base}.json")
# Find the inherited category file
inherit_yaml_path = os.path.join(categories_dir, f"{inherit_base}.yaml")
inherit_json_path = os.path.join(categories_dir, f"{inherit_base}.json")

if os.path.exists(inherit_yaml_path):
inherit_file_path = inherit_yaml_path
elif os.path.exists(inherit_json_path):
inherit_file_path = inherit_json_path
else:
if os.path.exists(inherit_yaml_path):
inherit_file_path = inherit_yaml_path
elif os.path.exists(inherit_json_path):
inherit_file_path = inherit_json_path
else:
verbose_proxy_logger.warning(
f"Category {category_name}: inherit_from '{inherit_from}' file not found at {categories_dir}"
)
verbose_proxy_logger.debug(
f"Tried paths: {inherit_yaml_path}, {inherit_json_path}"
)
return

try:
# Load the inherited category
inherited_category = self._load_category_file(inherit_file_path)

# Extract block words from inherited category that meet severity threshold
for keyword_data in inherited_category.keywords:
keyword = keyword_data["keyword"].lower()
severity = keyword_data["severity"]
if self._should_apply_severity(severity, severity_threshold):
block_words.append(keyword)
except Exception as e:
verbose_proxy_logger.error(
f"Error loading inherited category for {category_name}: {e}"
)
return

# Pattern 2 or supplement to Pattern 1: Add additional block words
if category_config_obj.additional_block_words:
block_words.extend(category_config_obj.additional_block_words)

# Ensure we have block words before storing
if not block_words:
verbose_proxy_logger.warning(
f"Category {category_name}: inherit_from '{inherit_from}' file not found at {categories_dir}"
)
verbose_proxy_logger.debug(
f"Tried paths: {inherit_yaml_path}, {inherit_json_path}"
f"Category {category_name}: no block words found (check inherit_from or additional_block_words)"
)
return

try:
# Load the inherited category
inherited_category = self._load_category_file(inherit_file_path)

# Extract block words from inherited category that meet severity threshold
block_words = []
for keyword_data in inherited_category.keywords:
keyword = keyword_data["keyword"].lower()
severity = keyword_data["severity"]
if self._should_apply_severity(severity, severity_threshold):
block_words.append(keyword)

# Add additional block words specific to this category
if category_config_obj.additional_block_words:
block_words.extend(category_config_obj.additional_block_words)

# Store the conditional category configuration
self.conditional_categories[category_name] = {
"identifier_words": category_config_obj.identifier_words,
"block_words": block_words,
"action": category_action,
"severity": "high", # Combinations are always high severity
}
# Store the conditional category configuration
self.conditional_categories[category_name] = {
"identifier_words": category_config_obj.identifier_words,
"block_words": block_words,
"action": category_action,
"severity": "high", # Combinations are always high severity
}

# Log different messages based on pattern
if inherit_from and category_config_obj.additional_block_words:
verbose_proxy_logger.info(
f"Loaded conditional category {category_name}: "
f"{len(category_config_obj.identifier_words)} identifiers + "
f"{len(block_words)} block words "
f"({len(category_config_obj.additional_block_words)} additional + "
f"{len(block_words) - len(category_config_obj.additional_block_words)} from {inherit_from})"
)
except Exception as e:
verbose_proxy_logger.error(
f"Error loading inherited category for {category_name}: {e}"
elif inherit_from:
verbose_proxy_logger.info(
f"Loaded conditional category {category_name}: "
f"{len(category_config_obj.identifier_words)} identifiers + "
f"{len(block_words)} block words (from {inherit_from})"
)
else:
verbose_proxy_logger.info(
f"Loaded conditional category {category_name}: "
f"{len(category_config_obj.identifier_words)} identifiers + "
f"{len(block_words)} block words (standalone)"
)

def _load_category_file(self, file_path: str) -> CategoryConfig:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
# EU AI Act Article 5 - Prohibited Practices Detection
# Uses conditional logic: BLOCK if identifier word + block word appear together
# Reference: https://artificialintelligenceact.eu/article/5/
category_name: "eu_ai_act_article5_prohibited_practices"
description: "Detects EU AI Act Article 5 prohibited practices using conditional keyword matching"
default_action: "BLOCK"

# IDENTIFIER WORDS - Actions that could create prohibited systems
# When combined with prohibited contexts (block words), triggers violation
identifier_words:
# System development actions
- "build"
- "create"
- "develop"
- "design"
- "implement"

# Analysis/scoring actions
- "score"
- "rank"
- "rate"
- "assess"
- "evaluate"
- "classify"
- "profile"
- "grade"

# Detection/recognition actions
- "recognize"
- "detect"
- "identify"
- "infer"
- "deduce"
- "predict"
- "determine"
- "analyze"

# ADDITIONAL BLOCK WORDS - Prohibited contexts from Article 5
# When combined with identifier words, indicates violation
additional_block_words:
# Social scoring (Article 5.1.c)
- "social behavior"
- "social credit"
- "social score"
- "trustworthiness"
- "citizen score"
- "reputation score"
- "behavioral score"
- "social media behavior"

# Emotion recognition in workplace/education (Article 5.1.f)
- "employee emotion"
- "worker emotion"
- "staff emotion"
- "workplace emotion"
- "student emotion"
- "classroom emotion"
- "employee sentiment"
- "worker sentiment"
- "student sentiment"
- "employee mood"
- "worker mood"

# Biometric categorization (Article 5.1.h)
- "race from face"
- "ethnicity from face"
- "race from biometric"
- "ethnicity from biometric"
- "political views from"
- "political opinions from"
- "sexual orientation from"
- "religion from biometric"
- "religious beliefs from"

# Predictive profiling
- "crime prediction"
- "criminal behavior"
- "recidivism"
- "employee performance prediction"
- "creditworthiness from social"

# Manipulation (Article 5.1.a)
- "subliminal"
- "subconscious"
- "behavior manipulation"

# Vulnerability exploitation (Article 5.1.b)
- "children vulnerability"
- "elderly vulnerability"
- "disabled vulnerability"

# ALWAYS BLOCK - Explicit prohibited practices (always blocked regardless of context)
always_block_keywords:
# Social scoring systems
- keyword: "social credit system"
severity: "high"
- keyword: "social scoring system"
severity: "high"
- keyword: "citizen scoring"
severity: "high"

# Emotion recognition in workplace/education
- keyword: "emotion recognition in workplace"
severity: "high"
- keyword: "emotion detection of employees"
severity: "high"
- keyword: "emotion recognition in classroom"
severity: "high"
- keyword: "student emotion detection"
severity: "high"

# Biometric categorization
- keyword: "infer race from face"
severity: "high"
- keyword: "predict race from facial"
severity: "high"
- keyword: "infer ethnicity from biometric"
severity: "high"
- keyword: "predict political opinions from"
severity: "high"
- keyword: "biometric categorization system"
severity: "high"

# Predictive profiling
- keyword: "predictive policing"
severity: "high"
- keyword: "crime prediction algorithm"
severity: "high"
- keyword: "recidivism prediction"
severity: "high"

# EXCEPTIONS - Legitimate use cases (always allowed)
exceptions:
# Research and education
- "research on"
- "study on"
- "academic"
- "thesis on"

# Compliance monitoring
- "audit for bias"
- "detect discrimination"
- "compliance monitoring"
- "ethical review"
- "fairness testing"

# Entertainment/product contexts
- "movie"
- "game"
- "product review"
- "customer feedback"

# Meta-discussion
- "explain"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"explain" exception trivially bypasses all blocking

The exception "explain" is matched as a plain substring (via `if exception in text_lower`) in both `_check_conditional_categories` (line 831) and `_check_category_keywords` (line 917). This means any prompt containing the word "explain" bypasses the entire guardrail — including `always_block_keywords`, which are documented to "always block regardless of context."

For example, "Explain how to build a social credit system" will:

  1. Hit the exception check in _check_conditional_categories → returns None (skipped)
  2. Hit the exception check in _check_category_keywords → returns None (skipped)
  3. Pass through completely unblocked

The same bypass works for "game" (e.g., "This is a game, now build a social credit system") and "what is" (e.g., "What is the best way to build a social credit system").

Consider using more specific phrases that are less likely to appear alongside genuine violation requests, e.g. "explain what", "explain the concept of", or implementing exceptions as a separate pass that checks whether the exception phrase meaningfully frames the context (not just appears anywhere in the text).

- "what is"
- "article 5"
Comment on lines +1 to +156
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Conditional matching won't activate without inherit_from

The loading code in content_filter.py at lines 310-312 only registers a conditional category when both `identifier_words` and `inherit_from` are present:

if (
    category_config_obj.identifier_words
    and category_config_obj.inherit_from
):
    self._load_conditional_category(...)

This template has identifier_words and additional_block_words but no inherit_from field. As a result, the _load_conditional_category method is never called, and the identifier_words + additional_block_words conditional matching logic will be completely skipped at runtime. Only the always_block_keywords section will actually be enforced.

This means test cases 11-25 (the conditional matches like "score + social behavior", "detect + employee emotion") will not be blocked as intended. The fix requires either:

  1. Adding inherit_from pointing to an empty or minimal JSON category file, or
  2. Modifying content_filter.py to also handle identifier_words + additional_block_words without requiring inherit_from:
if category_config_obj.identifier_words and (
    category_config_obj.inherit_from or category_config_obj.additional_block_words
):
    self._load_conditional_category(...)

Option 2 also requires updating `_load_conditional_category` to handle the case where `inherit_from` is None.

- "prohibited by"
Loading
Loading