diff --git a/test/collection/test_config.py b/test/collection/test_config.py index f789d39f8..2b0cada35 100644 --- a/test/collection/test_config.py +++ b/test/collection/test_config.py @@ -262,11 +262,29 @@ def test_basic_config(): Configure.Vectorizer.text2vec_transformers( pooling_strategy="cls", vectorize_collection_name=False, + inference_url="https://api.transformers.com", ), { "text2vec-transformers": { "vectorizeClassName": False, "poolingStrategy": "cls", + "inferenceUrl": "https://api.transformers.com", + } + }, + ), + ( + Configure.Vectorizer.text2vec_voyageai( + vectorize_collection_name=False, + model="voyage-large-2", + truncate=False, + base_url="https://voyage.made-up.com", + ), + { + "text2vec-voyageai": { + "vectorizeClassName": False, + "model": "voyage-large-2", + "baseURL": "https://voyage.made-up.com", + "truncate": False, } }, ), @@ -293,6 +311,23 @@ def test_basic_config(): } }, ), + ( + Configure.Vectorizer.multi2vec_palm( + image_fields=["image"], + text_fields=["text"], + project_id="project", + location="us-central1", + ), + { + "multi2vec-palm": { + "imageFields": ["image"], + "textFields": ["text"], + "projectId": "project", + "location": "us-central1", + "vectorizeClassName": True, + } + }, + ), ( Configure.Vectorizer.multi2vec_clip( image_fields=[Multi2VecField(name="image")], @@ -535,6 +570,10 @@ def test_config_with_vectorizer_and_properties( Configure.Generative.anyscale(), {"generative-anyscale": {}}, ), + ( + Configure.Generative.mistral(temperature=0.5, max_tokens=100, model="model"), + {"generative-mistral": {"temperature": 0.5, "maxTokens": 100, "model": "model"}}, + ), ( Configure.Generative.openai( model="gpt-4", @@ -700,7 +739,7 @@ def test_config_with_generative( def test_config_with_reranker( reranker_config: _RerankerConfigCreate, expected_mc: dict, -): +) -> None: config = _CollectionConfigCreate(name="test", reranker_config=reranker_config) assert config._to_dict() == { **DEFAULTS, @@ -710,7 +749,7 @@ def 
test_config_with_reranker( } -def test_config_with_properties(): +def test_config_with_properties() -> None: config = _CollectionConfigCreate( name="test", description="test", @@ -1046,6 +1085,25 @@ def test_vector_config_flat_pq() -> None: } }, ), + ( + [ + Configure.NamedVectors.text2vec_voyageai( + name="test", source_properties=["prop"], truncate=True + ) + ], + { + "test": { + "vectorizer": { + "text2vec-voyageai": { + "properties": ["prop"], + "vectorizeClassName": True, + "truncate": True, + } + }, + "vectorIndexType": "hnsw", + } + }, + ), ( [ Configure.NamedVectors.img2vec_neural( @@ -1085,6 +1143,31 @@ def test_vector_config_flat_pq() -> None: } }, ), + ( + [ + Configure.NamedVectors.multi2vec_palm( + name="test", + image_fields=["image"], + text_fields=["text"], + project_id="project", + location="us-central1", + ) + ], + { + "test": { + "vectorizer": { + "multi2vec-palm": { + "imageFields": ["image"], + "textFields": ["text"], + "projectId": "project", + "location": "us-central1", + "vectorizeClassName": True, + } + }, + "vectorIndexType": "hnsw", + } + }, + ), ( [ Configure.NamedVectors.multi2vec_bind( diff --git a/weaviate/collections/classes/config.py b/weaviate/collections/classes/config.py index 7f02599b3..bd3b4a28e 100644 --- a/weaviate/collections/classes/config.py +++ b/weaviate/collections/classes/config.py @@ -164,6 +164,7 @@ class GenerativeSearches(str, Enum): PALM = "generative-palm" AWS = "generative-aws" ANYSCALE = "generative-anyscale" + MISTRAL = "generative-mistral" class Rerankers(str, Enum): @@ -367,6 +368,15 @@ class _GenerativeAnyscale(_GenerativeConfigCreate): model: Optional[str] +class _GenerativeMistral(_GenerativeConfigCreate): + generative: GenerativeSearches = Field( + default=GenerativeSearches.MISTRAL, frozen=True, exclude=True + ) + temperature: Optional[float] + model: Optional[str] + maxTokens: Optional[int] + + class _GenerativeOpenAIConfigBase(_GenerativeConfigCreate): generative: GenerativeSearches = Field( 
default=GenerativeSearches.OPENAI, frozen=True, exclude=True @@ -464,6 +474,14 @@ def anyscale( ) -> _GenerativeConfigCreate: return _GenerativeAnyscale(model=model, temperature=temperature) + @staticmethod + def mistral( + model: Optional[str] = None, + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + ) -> _GenerativeConfigCreate: + return _GenerativeMistral(model=model, temperature=temperature, maxTokens=max_tokens) + @staticmethod def openai( model: Optional[str] = None, diff --git a/weaviate/collections/classes/config_named_vectors.py b/weaviate/collections/classes/config_named_vectors.py index be765cbf1..ceef721f7 100644 --- a/weaviate/collections/classes/config_named_vectors.py +++ b/weaviate/collections/classes/config_named_vectors.py @@ -17,6 +17,7 @@ _Img2VecNeuralConfigCreate, _Multi2VecBindConfigCreate, _Multi2VecClipConfigCreate, + _Multi2VecPalmConfig, _Ref2VecCentroidConfigCreate, _Text2VecAWSConfigCreate, _Text2VecAzureOpenAIConfigCreate, @@ -28,6 +29,7 @@ _Text2VecOpenAIConfigCreate, _Text2VecPalmConfigCreate, _Text2VecTransformersConfigCreate, + _Text2VecVoyageConfigCreate, _VectorizerConfigCreate, AWSModel, AWSService, @@ -38,6 +40,7 @@ OpenAIModel, OpenAIType, Vectorizers, + VoyageModel, _map_multi2vec_fields, ) @@ -319,10 +322,54 @@ def img2vec_neural( def multi2vec_clip( name: str, *, + vector_index_config: Optional[_VectorIndexConfigCreate] = None, + vectorize_collection_name: bool = True, image_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, text_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, + interference_url: Optional[str] = None, + ) -> _NamedVectorConfigCreate: + """Create a named vector using the `multi2vec_clip` model. + + See the [documentation](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all) + for detailed usage. + + Arguments: + `name` + The name of the named vector. 
+ `vector_index_config` + The configuration for Weaviate's vector index. Use wvc.config.Configure.VectorIndex to create a vector index configuration. None by default + `vectorize_collection_name` + Whether to vectorize the collection name. Defaults to `True`. + `image_fields` + The image fields to use in vectorization. + `text_fields` + The text fields to use in vectorization. + `inference_url` + The inference url to use where API requests should go. Defaults to `None`, which uses the server-defined default. + """ + return _NamedVectorConfigCreate( + name=name, + vectorizer=_Multi2VecClipConfigCreate( + imageFields=_map_multi2vec_fields(image_fields), + textFields=_map_multi2vec_fields(text_fields), + vectorizeClassName=vectorize_collection_name, + inferenceUrl=interference_url, + ), + vector_index_config=vector_index_config, + ) + + @staticmethod + def multi2vec_palm( + name: str, + *, vector_index_config: Optional[_VectorIndexConfigCreate] = None, vectorize_collection_name: bool = True, + location: str, + project_id: str, + image_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, + text_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, + dimensions: Optional[int] = None, + model_id: Optional[str] = None, ) -> _NamedVectorConfigCreate: """Create a named vector using the `multi2vec_clip` model. @@ -332,18 +379,32 @@ def multi2vec_clip( Arguments: `name` The name of the named vector. - `source_properties` - Which properties should be included when vectorizing. By default all text properties are included. `vector_index_config` The configuration for Weaviate's vector index. Use wvc.config.Configure.VectorIndex to create a vector index configuration. None by default `vectorize_collection_name` Whether to vectorize the collection name. Defaults to `True`. + `location` + Where the model runs. REQUIRED. + `project_id` + The project ID to use, REQUIRED. + `image_fields` + The image fields to use in vectorization. 
+ `text_fields` + The text fields to use in vectorization. + `dimensions` + The number of dimensions to use. Defaults to `None`, which uses the server-defined default. + `model_id` + The model ID to use. Defaults to `None`, which uses the server-defined default. """ return _NamedVectorConfigCreate( name=name, - vectorizer=_Multi2VecClipConfigCreate( + vectorizer=_Multi2VecPalmConfig( + projectId=project_id, + location=location, imageFields=_map_multi2vec_fields(image_fields), textFields=_map_multi2vec_fields(text_fields), + dimensions=dimensions, + modelId=model_id, vectorizeClassName=vectorize_collection_name, ), vector_index_config=vector_index_config, @@ -371,8 +432,6 @@ def multi2vec_bind( Arguments: `name` The name of the named vector. - `source_properties` - Which properties should be included when vectorizing. By default all text properties are included. `vector_index_config` The configuration for Weaviate's vector index. Use wvc.config.Configure.VectorIndex to create a vector index configuration. None by default `vectorize_collection_name` @@ -409,8 +468,8 @@ def ref2vec_centroid( Arguments: `name` The name of the named vector. - `source_properties` - Which properties should be included when vectorizing. By default all text properties are included. + `reference_properties` + The reference properties to use in vectorization, REQUIRED. `vector_index_config` The configuration for Weaviate's vector index. Use wvc.config.Configure.VectorIndex to create a vector index configuration. None by default `vectorize_collection_name` @@ -617,6 +676,9 @@ def text2vec_transformers( vector_index_config: Optional[_VectorIndexConfigCreate] = None, vectorize_collection_name: bool = True, pooling_strategy: Literal["masked_mean", "cls"] = "masked_mean", + inference_url: Optional[str] = None, + passage_inference_url: Optional[str] = None, + query_inference_url: Optional[str] = None, ) -> _NamedVectorConfigCreate: """Create a named vector using the `text2vec_transformers` model. 
@@ -634,6 +696,12 @@ def text2vec_transformers( Whether to vectorize the collection name. Defaults to `True`. `pooling_strategy` The pooling strategy to use. Defaults to `masked_mean`. + `inference_url` + The inferenceUrl to use where API requests should go. You can use either this OR passage/query_inference_url. Defaults to `None`, which uses the server-defined default. + `passage_inference_url` + The inferenceUrl to use where passage API requests should go. You can use either this and query_inference_url OR inference_url. Defaults to `None`, which uses the server-defined default. + `query_inference_url` + The inferenceUrl to use where query API requests should go. You can use either this and passage_inference_url OR inference_url. Defaults to `None`, which uses the server-defined default. """ return _NamedVectorConfigCreate( name=name, @@ -641,6 +709,9 @@ def text2vec_transformers( vectorizer=_Text2VecTransformersConfigCreate( poolingStrategy=pooling_strategy, vectorizeClassName=vectorize_collection_name, + inferenceUrl=inference_url, + passageInferenceUrl=passage_inference_url, + queryInferenceUrl=query_inference_url, ), vector_index_config=vector_index_config, ) @@ -683,6 +754,52 @@ def text2vec_jinaai( vector_index_config=vector_index_config, ) + @staticmethod + def text2vec_voyageai( + name: str, + *, + source_properties: Optional[List[str]] = None, + vector_index_config: Optional[_VectorIndexConfigCreate] = None, + vectorize_collection_name: bool = True, + model: Optional[Union[VoyageModel, str]] = None, + base_url: Optional[str] = None, + truncate: Optional[bool] = None, + ) -> _NamedVectorConfigCreate: + """Create a named vector using the `text2vec-voyageai` model. + + See the [documentation](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-voyageai) + for detailed usage. + + Arguments: + `name` + The name of the named vector. + `source_properties` + Which properties should be included when vectorizing.
By default all text properties are included. + `vector_index_config` + The configuration for Weaviate's vector index. Use wvc.config.Configure.VectorIndex to create a vector index configuration. None by default + `vectorize_collection_name` + Whether to vectorize the collection name. Defaults to `True`. + `model` + The model to use. Defaults to `None`, which uses the server-defined default. + See the + [documentation](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-voyageai#available-models) for more details. + `base_url` + The base URL to use where API requests should go. Defaults to `None`, which uses the server-defined default. + `truncate` + Whether to truncate the input texts to fit within the context length. Defaults to `None`, which uses the server-defined default. + """ + return _NamedVectorConfigCreate( + name=name, + source_properties=source_properties, + vectorizer=_Text2VecVoyageConfigCreate( + model=model, + vectorizeClassName=vectorize_collection_name, + baseURL=base_url, + truncate=truncate, + ), + vector_index_config=vector_index_config, + ) + class _NamedVectorsUpdate: @staticmethod diff --git a/weaviate/collections/classes/config_vectorizers.py b/weaviate/collections/classes/config_vectorizers.py index 5aad11a1f..69f1c2095 100644 --- a/weaviate/collections/classes/config_vectorizers.py +++ b/weaviate/collections/classes/config_vectorizers.py @@ -24,6 +24,7 @@ "text-embedding-3-small", "text-embedding-3-large", "text-embedding-ada-002" ] JinaModel: TypeAlias = Literal["jina-embeddings-v2-base-en", "jina-embeddings-v2-small-en"] +VoyageModel: TypeAlias = Literal["voyage-large-2", "voyage-code-2", "voyage-2"] AWSModel: TypeAlias = Literal[ "amazon.titan-embed-text-v1", "cohere.embed-english-v3", @@ -62,10 +63,14 @@ class Vectorizers(str, Enum): Weaviate module backed by Transformers text-based embedding models. `TEXT2VEC_JINAAI` Weaviate module backed by Jina AI text-based embedding models.
+ `TEXT2VEC_VOYAGEAI` + Weaviate module backed by Voyage AI text-based embedding models. `IMG2VEC_NEURAL` Weaviate module backed by a ResNet-50 neural network for images. `MULTI2VEC_CLIP` Weaviate module backed by a Sentence-BERT CLIP model for images and text. + `MULTI2VEC_PALM` + Weaviate module backed by a palm model for images and text. `MULTI2VEC_BIND` Weaviate module backed by the ImageBind model for images, text, audio, depth, IMU, thermal, and video. `REF2VEC_CENTROID` @@ -82,9 +87,11 @@ class Vectorizers(str, Enum): TEXT2VEC_PALM = "text2vec-palm" TEXT2VEC_TRANSFORMERS = "text2vec-transformers" TEXT2VEC_JINAAI = "text2vec-jinaai" + TEXT2VEC_VOYAGEAI = "text2vec-voyageai" IMG2VEC_NEURAL = "img2vec-neural" MULTI2VEC_CLIP = "multi2vec-clip" MULTI2VEC_BIND = "multi2vec-bind" + MULTI2VEC_PALM = "multi2vec-palm" REF2VEC_CENTROID = "ref2vec-centroid" @@ -260,6 +267,9 @@ class _Text2VecTransformersConfig(_ConfigCreateModel): ) poolingStrategy: Literal["masked_mean", "cls"] vectorizeClassName: bool + inferenceUrl: Optional[str] + passageInferenceUrl: Optional[str] + queryInferenceUrl: Optional[str] class _Text2VecTransformersConfigCreate(_Text2VecTransformersConfig, _VectorizerConfigCreate): @@ -285,6 +295,20 @@ class _Text2VecJinaConfigCreate(_Text2VecJinaConfig, _VectorizerConfigCreate): pass +class _Text2VecVoyageConfig(_ConfigCreateModel): + vectorizer: Vectorizers = Field( + default=Vectorizers.TEXT2VEC_VOYAGEAI, frozen=True, exclude=True + ) + model: Optional[str] + baseURL: Optional[str] + truncate: Optional[bool] + vectorizeClassName: bool + + +class _Text2VecVoyageConfigCreate(_Text2VecVoyageConfig, _VectorizerConfigCreate): + pass + + class _Img2VecNeuralConfig(_ConfigCreateModel): vectorizer: Vectorizers = Field(default=Vectorizers.IMG2VEC_NEURAL, frozen=True, exclude=True) imageFields: List[str] @@ -324,12 +348,22 @@ def _to_dict(self) -> Dict[str, Any]: class _Multi2VecClipConfig(_Multi2VecBase): vectorizer: Vectorizers = 
Field(default=Vectorizers.MULTI2VEC_CLIP, frozen=True, exclude=True) + inferenceUrl: Optional[str] class _Multi2VecClipConfigCreate(_Multi2VecClipConfig, _VectorizerConfigCreate): pass +class _Multi2VecPalmConfig(_Multi2VecBase, _VectorizerConfigCreate): + vectorizer: Vectorizers = Field(default=Vectorizers.MULTI2VEC_PALM, frozen=True, exclude=True) + projectId: str + location: Optional[str] + modelId: Optional[str] + dimensions: Optional[int] + vectorizeClassName: bool + + class _Multi2VecBindConfig(_Multi2VecBase): vectorizer: Vectorizers = Field(default=Vectorizers.MULTI2VEC_BIND, frozen=True, exclude=True) audioFields: Optional[List[Multi2VecField]] @@ -396,6 +430,7 @@ def img2vec_neural( def multi2vec_clip( image_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, text_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, + interference_url: Optional[str] = None, vectorize_collection_name: bool = True, ) -> _VectorizerConfigCreate: """Create a `_Multi2VecClipConfigCreate` object for use when vectorizing using the `multi2vec-clip` model. @@ -408,6 +443,8 @@ def multi2vec_clip( The image fields to use in vectorization. `text_fields` The text fields to use in vectorization. + `inference_url` + The inference url to use where API requests should go. Defaults to `None`, which uses the server-defined default. `vectorize_collection_name` Whether to vectorize the collection name. Defaults to `True`. 
@@ -418,6 +455,7 @@ def multi2vec_clip( imageFields=_map_multi2vec_fields(image_fields), textFields=_map_multi2vec_fields(text_fields), vectorizeClassName=vectorize_collection_name, + inferenceUrl=interference_url, ) @staticmethod @@ -745,10 +783,58 @@ def text2vec_palm( vectorizeClassName=vectorize_collection_name, ) + @staticmethod + def multi2vec_palm( + *, + location: str, + project_id: str, + image_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, + text_fields: Optional[Union[List[str], List[Multi2VecField]]] = None, + dimensions: Optional[int] = None, + model_id: Optional[str] = None, + vectorize_collection_name: bool = True, + ) -> _VectorizerConfigCreate: + """Create a `_Multi2VecPalmConfig` object for use when vectorizing using the `multi2vec-palm` model. + + See the [documentation](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/multi2vec-palm) + for detailed usage. + + Arguments: + `location` + Where the model runs. REQUIRED. + `project_id` + The project ID to use, REQUIRED. + `image_fields` + The image fields to use in vectorization. + `text_fields` + The text fields to use in vectorization. + `dimensions` + The number of dimensions to use. Defaults to `None`, which uses the server-defined default. + `model_id` + The model ID to use. Defaults to `None`, which uses the server-defined default. + `vectorize_collection_name` + Whether to vectorize the collection name. Defaults to `True`. + + Raises: + `pydantic.ValidationError` if the arguments fail validation, e.g. `project_id` is not a string.
+ """ + return _Multi2VecPalmConfig( + projectId=project_id, + location=location, + imageFields=_map_multi2vec_fields(image_fields), + textFields=_map_multi2vec_fields(text_fields), + dimensions=dimensions, + modelId=model_id, + vectorizeClassName=vectorize_collection_name, + ) + @staticmethod def text2vec_transformers( pooling_strategy: Literal["masked_mean", "cls"] = "masked_mean", vectorize_collection_name: bool = True, + inference_url: Optional[str] = None, + passage_inference_url: Optional[str] = None, + query_inference_url: Optional[str] = None, ) -> _VectorizerConfigCreate: """Create a `_Text2VecTransformersConfigCreate` object for use when vectorizing using the `text2vec-transformers` model. @@ -760,6 +846,12 @@ def text2vec_transformers( The pooling strategy to use. Defaults to `masked_mean`. `vectorize_collection_name` Whether to vectorize the collection name. Defaults to `True`. + `inference_url` + The inference url to use where API requests should go. You can use either this OR passage/query_inference_url. Defaults to `None`, which uses the server-defined default. + `passage_inference_url` + The inference url to use where passage API requests should go. You can use either this and query_inference_url OR inference_url. Defaults to `None`, which uses the server-defined default. + `query_inference_url` + The inference url to use where query API requests should go. You can use either this and passage_inference_url OR inference_url. Defaults to `None`, which uses the server-defined default. Raises: `pydantic.ValidationError` if `pooling_strategy` is not a valid value from the `PoolingStrategy` type. 
@@ -767,6 +859,9 @@ def text2vec_transformers( return _Text2VecTransformersConfigCreate( poolingStrategy=pooling_strategy, vectorizeClassName=vectorize_collection_name, + inferenceUrl=inference_url, + passageInferenceUrl=passage_inference_url, + queryInferenceUrl=query_inference_url, ) @staticmethod @@ -788,3 +883,35 @@ def text2vec_jinaai( Whether to vectorize the collection name. Defaults to `True`. """ return _Text2VecJinaConfigCreate(model=model, vectorizeClassName=vectorize_collection_name) + + @staticmethod + def text2vec_voyageai( + *, + model: Optional[Union[VoyageModel, str]] = None, + base_url: Optional[str] = None, + truncate: Optional[bool] = None, + vectorize_collection_name: bool = True, + ) -> _VectorizerConfigCreate: + """Create a `_Text2VecVoyageConfigCreate` object for use when vectorizing using the `text2vec-voyageai` model. + + See the [documentation](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-voyageai) + for detailed usage. + + Arguments: + `model` + The model to use. Defaults to `None`, which uses the server-defined default. + See the + [documentation](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-voyageai#available-models) for more details. + `base_url` + The base URL to use where API requests should go. Defaults to `None`, which uses the server-defined default. + `truncate` + Whether to truncate the input texts to fit within the context length. Defaults to `None`, which uses the server-defined default. + `vectorize_collection_name` + Whether to vectorize the collection name. Defaults to `True`. + """ + return _Text2VecVoyageConfigCreate( + model=model, + baseURL=base_url, + truncate=truncate, + vectorizeClassName=vectorize_collection_name, + )