Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions chromadb/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@
"UpdateMetadata",
]
# Metadata key that user-supplied metadata is not allowed to contain.
META_KEY_CHROMA_DOCUMENT = "chroma:document"

# Operator spellings that may not be reused as metadata keys.
RESERVED_OPERATORS = [
    # logical combinators
    "$and",
    "$or",
    # inclusion / exclusion
    "$in",
    "$nin",
    # ordering comparisons
    "$gt",
    "$gte",
    "$lt",
    "$lte",
    # (in)equality
    "$ne",
    "$eq",
]

# Generic type parameter used by the alias below.
T = TypeVar("T")
# Either a single value of type T or a list of values of type T.
OneOrMany = Union[T, List[T]]

Expand Down Expand Up @@ -682,6 +687,18 @@ def validate_metadata(metadata: Metadata) -> Metadata:
raise ValueError(
f"Expected metadata to not contain the reserved key {META_KEY_CHROMA_DOCUMENT}"
)
# Add validation for operator names used as metadata keys
if key in RESERVED_OPERATORS:
raise ValueError(
f"Metadata key '{key}' conflicts with a reserved operator name. "
f"Please use a different key name."
)
# Validate that strings starting with $ aren't used for metadata keys
if isinstance(key, str) and key.startswith("$"):
raise ValueError(
f"Metadata key '{key}' starts with '$', which is reserved for operators. "
f"Please use a different key name."
)
if not isinstance(key, str):
raise TypeError(
f"Expected metadata key to be a str, got {key} which is a {type(key).__name__}"
Expand Down
11 changes: 11 additions & 0 deletions chromadb/segment/impl/metadata/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,17 @@ def _where_map_criterion(
for w in cast(Sequence[Where], v)
]
clause.append(reduce(lambda x, y: x | y, criteria))
# Add explicit checks for $in and $nin operators at top level
elif k == "$in" or k == "$nin":
# Handle $in and $nin operators when they are at the root of a where clause
# This ensures they are treated as operators, not metadata keys
expr = cast(Union[LiteralValue, Dict[WhereOperator, LiteralValue]], v)
# We pass an empty string as the key and the operator as a parameter to _value_criterion
if isinstance(v, list):
clause.append(_value_criterion("", v, cast(InclusionExclusionOperator, k), q, metadata_t, embeddings_t))
else:
# Handle other value types
clause.append(_value_criterion("", [v], cast(InclusionExclusionOperator, k), q, metadata_t, embeddings_t))
else:
expr = cast(Union[LiteralValue, Dict[WhereOperator, LiteralValue]], v)
clause.append(_where_clause(k, expr, q, metadata_t, embeddings_t))
Expand Down
68 changes: 68 additions & 0 deletions test_reproduce/test_complex_operator_bug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import pytest
import chromadb
import json

def test_complex_metadata_operator_bug():
    """
    Reproduce metadata filtering with a metadata key spelled like an operator.

    Four documents are added to a fresh collection; two of them carry a
    metadata key literally named "$in". A ``$and`` query then uses "$in" both
    as a metadata key and as an operator, and the outcome is printed. A second
    reference query without the "$in" key is run and printed for comparison.

    This function only prints diagnostics: it makes no assertions, and any
    exception raised by the first query is caught and reported on stdout.
    """
    collection = chromadb.Client().create_collection("test_collection_complex")

    # Setup test documents
    seed_metadatas = [
        {"$in": "value1", "category": "A"},   # key collides with an operator name
        {"category": "A", "tag": "special"},  # ordinary metadata, category A
        {"$in": "value3", "category": "B"},   # key collides, different category
        {"category": "B", "tag": "special"},  # ordinary metadata, category B
    ]
    collection.add(
        ids=["id1", "id2", "id3", "id4"],
        documents=["doc1", "doc2", "doc3", "doc4"],
        metadatas=seed_metadatas,
    )

    print("=== Test: Complex filtering showing operator name ambiguity bug ===")

    # The ambiguous query: "$in" appears once as a METADATA KEY and once as an
    # OPERATOR applied to "tag", combined with a plain equality on "category".
    ambiguous_where = {
        "$and": [
            {"$in": "value3"},
            {"category": "B"},
            {"tag": {"$in": ["special"]}},
        ]
    }
    # Same filter minus the "$in"-as-key clause.
    reference_where = {
        "$and": [
            {"category": "B"},
            {"tag": {"$in": ["special"]}},
        ]
    }

    try:
        bug_result = collection.get(where=ambiguous_where)

        print("Result of query trying to find documents with key '$in'='value3' AND category='B' AND tag in ['special']:")
        print(json.dumps(bug_result, indent=2))

        # NOTE(review): the messages below treat "only id4" as the correct
        # answer, but id4's metadata has no "$in" key, so a strict reading of
        # the $and filter would match nothing -- confirm the intended result.
        found_ids = bug_result["ids"]
        matched_only_id4 = len(found_ids) == 1 and found_ids[0] == "id4"
        if matched_only_id4:
            print("✅ Query worked correctly, found the right document")
        else:
            print(f"❌ BUG CONFIRMED: Query returned incorrect results: {found_ids}")
            print(" Expected only id4, as it's the only one with category B AND tag 'special'")
            if len(found_ids) == 0:
                print(" Got no results - system likely confused by '$in' appearing both as metadata key and operator")

    except Exception as e:
        # Deliberately broad: this is a reproduction script, so any failure of
        # the ambiguous query is reported rather than propagated.
        print(f"❌ Error with complex query: {e}")
        print("BUG CONFIRMED: System cannot handle complex queries with operator names as metadata keys")

    # Compare with a reference query that doesn't use "$in" as a metadata key
    ref_result = collection.get(where=reference_where)

    print("\nReference query without using '$in' as metadata key:")
    print(json.dumps(ref_result, indent=2))

# Allow running this reproduction directly as a script, without pytest.
if __name__ == "__main__":
    test_complex_metadata_operator_bug()
Loading