diff --git a/chromadb/api/types.py b/chromadb/api/types.py
index 3cfc2c84301..965e4bd0ec0 100644
--- a/chromadb/api/types.py
+++ b/chromadb/api/types.py
@@ -42,6 +42,11 @@
     "UpdateMetadata",
 ]
 META_KEY_CHROMA_DOCUMENT = "chroma:document"
+# Define reserved operator names
+RESERVED_OPERATORS = [
+    "$and", "$or", "$in", "$nin", "$gt", "$gte", "$lt", "$lte", "$ne", "$eq"
+]
+
 T = TypeVar("T")
 OneOrMany = Union[T, List[T]]
 
@@ -682,6 +687,18 @@ def validate_metadata(metadata: Metadata) -> Metadata:
             raise ValueError(
                 f"Expected metadata to not contain the reserved key {META_KEY_CHROMA_DOCUMENT}"
             )
+        # Reject keys that are exactly a where-operator name (specific message).
+        if key in RESERVED_OPERATORS:
+            raise ValueError(
+                f"Metadata key '{key}' conflicts with a reserved operator name. "
+                "Please use a different key name."
+            )
+        # Reject any other '$'-prefixed key; the prefix is reserved for operators.
+        if isinstance(key, str) and key.startswith("$"):
+            raise ValueError(
+                f"Metadata key '{key}' starts with '$', which is reserved for operators. "
+                "Please use a different key name."
+            )
         if not isinstance(key, str):
             raise TypeError(
                 f"Expected metadata key to be a str, got {key} which is a {type(key).__name__}"
diff --git a/chromadb/segment/impl/metadata/sqlite.py b/chromadb/segment/impl/metadata/sqlite.py
index 22544f1948f..dc46c68436f 100644
--- a/chromadb/segment/impl/metadata/sqlite.py
+++ b/chromadb/segment/impl/metadata/sqlite.py
@@ -538,6 +538,22 @@ def _where_map_criterion(
                     for w in cast(Sequence[Where], v)
                 ]
                 clause.append(reduce(lambda x, y: x | y, criteria))
+            # Treat a bare top-level $in / $nin as an operator, never as a metadata key.
+            elif k in ("$in", "$nin"):
+                # A scalar operand is wrapped so the operator always receives a list.
+                values = v if isinstance(v, list) else [v]
+                # NOTE(review): the key argument is an empty string here; confirm that
+                # _value_criterion accepts this argument list and an empty key.
+                clause.append(
+                    _value_criterion(
+                        "",
+                        values,
+                        cast(InclusionExclusionOperator, k),
+                        q,
+                        metadata_t,
+                        embeddings_t,
+                    )
+                )
             else:
                 expr = cast(Union[LiteralValue, Dict[WhereOperator, LiteralValue]], v)
                 clause.append(_where_clause(k, expr, q, metadata_t, embeddings_t))
diff --git a/test_reproduce/test_complex_operator_bug.py b/test_reproduce/test_complex_operator_bug.py
new file mode 100644
index 00000000000..43579b722f4
--- /dev/null
+++ b/test_reproduce/test_complex_operator_bug.py
@@ -0,0 +1,44 @@
+"""Regression test: where-operator names must not be usable as metadata keys."""
+import chromadb
+import pytest
+
+
+def test_complex_metadata_operator_bug():
+    """Check reserved-key rejection and normal use of $in as an operator.
+
+    Metadata whose key is an operator name (here "$in") must be rejected with
+    ValueError when added, so a where clause can never be ambiguous between
+    "$in" the metadata key and "$in" the operator. With only valid keys
+    stored, "$in" inside "$and" must filter as an ordinary operator.
+    """
+    client = chromadb.Client()
+    collection = client.create_collection("test_collection_complex")
+
+    # Operator-named keys are refused at write time.
+    with pytest.raises(ValueError):
+        collection.add(
+            ids=["bad1"],
+            documents=["bad doc"],
+            metadatas=[{"$in": "value1", "category": "A"}],
+        )
+
+    collection.add(
+        ids=["id1", "id2", "id3", "id4"],
+        documents=["doc1", "doc2", "doc3", "doc4"],
+        metadatas=[
+            {"category": "A"},
+            {"category": "A", "tag": "special"},
+            {"category": "B"},
+            {"category": "B", "tag": "special"},
+        ],
+    )
+
+    # Only id4 has category B and a tag contained in ["special"].
+    result = collection.get(
+        where={"$and": [{"category": "B"}, {"tag": {"$in": ["special"]}}]}
+    )
+    assert result["ids"] == ["id4"]
+
+
+if __name__ == "__main__":
+    test_complex_metadata_operator_bug()