
Commit

add suffix to artifact_location
AnandInguva committed Dec 8, 2023
1 parent 816174a commit cfb1883
Showing 2 changed files with 33 additions and 37 deletions.
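
The change itself is mechanical: instead of writing MLTransform artifacts directly into the shared self.artifact_location (a temporary directory for local tests, or a fixed GCS prefix), each test now appends a uuid.uuid4().hex suffix so repeated or concurrent runs never collide on the same artifact directory. A minimal sketch of the pattern, reusing the 'sentence-transformers/all-MiniLM-L6-v2' model that appears below; the import paths for MLTransform and SentenceTransformerEmbeddings are assumptions, since the test file's own import block is not shown in these hunks:

import os
import tempfile
import uuid

import apache_beam as beam
# Assumed import paths; this file's import block is not shown in the diff.
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings

# Base location shared by the test class; could equally be a GCS prefix.
base_location = tempfile.mkdtemp()
# Unique per-run suffix: parallel or repeated test runs get their own artifact dir.
artifact_location = os.path.join(base_location, uuid.uuid4().hex)

embedding_config = SentenceTransformerEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2', columns=['x'])

# One pipeline writes the embedding artifacts under the unique location ...
with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'x': 'This is an example sentence'}])
        | MLTransform(write_artifact_location=artifact_location).with_transform(
            embedding_config))

# ... and a later pipeline can replay exactly those artifacts by reading them back.
with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'x': 'Each sentence is converted'}])
        | MLTransform(read_artifact_location=artifact_location))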
@@ -14,9 +14,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+import os
import shutil
import tempfile
import unittest
+import uuid

import numpy as np
from parameterized import parameterized
@@ -55,24 +57,12 @@
([{
test_query_column: test_query,
}], DEFAULT_MODEL_NAME, [0.13]),
-(
-[{
-test_query_column: 'query: how much protein should a female eat',
-},
-{
-test_query_column: (
-"passage: As a general guideline, the CDC's "
-"average requirement of protein for women "
-"ages 19 to 70 is 46 grams per day. But, "
-"as you can see from this chart, you'll need "
-"to increase that if you're expecting or training"
-" for a marathon. Check out the chart below "
-"to see how much protein "
-"you should be eating each day.")
-}],
-'intfloat/e5-base-v2',
-# this model requires inputs to be specified as query: and passage:
-[0.1, 0.1]),
+([{
+test_query_column: 'This is an example sentence',
+}, {
+test_query_column: ("Each sentence is converted")
+}],
+'sentence-transformers/all-MiniLM-L6-v2', [0.15, 0.14]),
]


@@ -88,6 +78,7 @@ def tearDown(self) -> None:

def test_sentence_transformer_embeddings(self):
model_name = DEFAULT_MODEL_NAME
+artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex)
embedding_config = SentenceTransformerEmbeddings(
model_name=model_name, columns=[test_query_column])
with beam.Pipeline() as pipeline:
@@ -96,9 +87,9 @@ def test_sentence_transformer_embeddings(self):
| "CreateData" >> beam.Create([{
test_query_column: test_query
}])
| "MLTransform" >> MLTransform(
write_artifact_location=self.artifact_location).with_transform(
embedding_config))
| "MLTransform" >>
MLTransform(write_artifact_location=artifact_location).with_transform(
embedding_config))

def assert_element(element):
assert len(element[test_query_column]) == 768
@@ -107,6 +98,7 @@ def assert_element(element):

@unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.')
def test_embeddings_with_scale_to_0_1(self):
+artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex)
model_name = DEFAULT_MODEL_NAME
embedding_config = SentenceTransformerEmbeddings(
model_name=model_name,
@@ -118,10 +110,10 @@ def test_embeddings_with_scale_to_0_1(self):
| "CreateData" >> beam.Create([{
test_query_column: test_query
}])
| "MLTransform" >> MLTransform(
write_artifact_location=self.artifact_location).with_transform(
embedding_config).with_transform(
ScaleTo01(columns=[test_query_column])))
| "MLTransform" >>
MLTransform(write_artifact_location=artifact_location).with_transform(
embedding_config).with_transform(
ScaleTo01(columns=[test_query_column])))

def assert_element(element):
assert max(element.feature_1) == 1
@@ -134,13 +126,14 @@ def test_embeddings_with_read_artifact_location(
embedding_config = SentenceTransformerEmbeddings(
model_name=model_name, columns=[test_query_column])

+artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex)
with beam.Pipeline() as p:
result_pcoll = (
p
| "CreateData" >> beam.Create(inputs)
| "MLTransform" >> MLTransform(
write_artifact_location=self.artifact_location).with_transform(
embedding_config))
| "MLTransform" >>
MLTransform(write_artifact_location=artifact_location).with_transform(
embedding_config))
max_ele_pcoll = (
result_pcoll
| beam.Map(lambda x: round(max(x[test_query_column]), 2)))
@@ -152,7 +145,7 @@ def test_embeddings_with_read_artifact_location(
p
| "CreateData" >> beam.Create(inputs)
| "MLTransform" >>
-MLTransform(read_artifact_location=self.artifact_location))
+MLTransform(read_artifact_location=artifact_location))
max_ele_pcoll = (
result_pcoll
| beam.Map(lambda x: round(max(x[test_query_column]), 2)))
@@ -161,6 +154,7 @@ def test_embeddings_with_read_artifact_location(

def test_sentence_transformer_with_int_data_types(self):
model_name = DEFAULT_MODEL_NAME
+artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex)
embedding_config = SentenceTransformerEmbeddings(
model_name=model_name, columns=[test_query_column])
with self.assertRaises(TypeError):
@@ -171,12 +165,13 @@ def test_sentence_transformer_with_int_data_types(self):
test_query_column: 1
}])
| "MLTransform" >> MLTransform(
-write_artifact_location=self.artifact_location).with_transform(
+write_artifact_location=artifact_location).with_transform(
embedding_config))

@parameterized.expand(_parameterized_inputs)
def test_with_gcs_artifact_location(self, inputs, model_name, output):
-artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers')
+artifact_location = os.path.join(
+'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex)
embedding_config = SentenceTransformerEmbeddings(
model_name=model_name, columns=[test_query_column])

@@ -207,7 +202,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output):

def test_embeddings_with_inference_args(self):
model_name = DEFAULT_MODEL_NAME
-
+artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex)
inference_args = {'convert_to_numpy': False}
embedding_config = SentenceTransformerEmbeddings(
model_name=model_name,
@@ -219,9 +214,9 @@ def test_embeddings_with_inference_args(self):
| "CreateData" >> beam.Create([{
test_query_column: test_query
}])
| "MLTransform" >> MLTransform(
write_artifact_location=self.artifact_location).with_transform(
embedding_config))
| "MLTransform" >>
MLTransform(write_artifact_location=artifact_location).with_transform(
embedding_config))

def assert_element(element):
assert type(element) == torch.Tensor
@@ -233,14 +228,15 @@ def assert_element(element):

def test_mltransform_to_ptransform_with_vertex(self):
model_name = ''
+artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex)
transforms = [
SentenceTransformerEmbeddings(columns=['x'], model_name=model_name),
SentenceTransformerEmbeddings(
columns=['y', 'z'], model_name=model_name)
]
ptransform_mapper = base._MLTransformToPTransformMapper(
transforms=transforms,
-artifact_location=self.artifact_location,
+artifact_location=artifact_location,
artifact_mode=None)

ptransform_list = ptransform_mapper.create_and_save_ptransform_list()
@@ -119,7 +119,7 @@ def __init__(
model_name: The name of the Vertex AI Text Embedding model.
columns: The columns containing the text to be embedded.
task_type: The downstream task for the embeddings.
-Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT,
+Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT,
SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING.
title: Identifier of the text content.
project: The default GCP project for API calls.
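
For orientation, this docstring belongs to Beam's Vertex AI text-embedding transform, and task_type is forwarded to the Vertex AI model along with the text in the configured columns. A hedged usage sketch follows; the VertexAITextEmbeddings class name, its module path, the 'textembedding-gecko@003' model id, and the bucket/project placeholders are assumptions rather than something shown in this hunk:

import os
import uuid

import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransform
# Assumed module/class for the transform whose docstring is edited above.
from apache_beam.ml.transforms.embeddings.vertex_ai import VertexAITextEmbeddings

# Same uuid-suffix pattern as the test changes: a unique artifact prefix per run.
artifact_location = os.path.join(
    'gs://<your-bucket>/mltransform-artifacts', uuid.uuid4().hex)

embedding_config = VertexAITextEmbeddings(
    model_name='textembedding-gecko@003',  # assumed Vertex AI embedding model id
    columns=['text'],
    task_type='RETRIEVAL_DOCUMENT',  # one of the valid values listed above
    project='<your-gcp-project>')

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'text': 'Beam MLTransform can call Vertex AI for embeddings.'}])
        | MLTransform(write_artifact_location=artifact_location).with_transform(
            embedding_config))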
