feat: add Pandas DataFrame support to TabularDataset #1185

Merged
12 commits merged on May 5, 2022
97 changes: 97 additions & 0 deletions google/cloud/aiplatform/datasets/tabular_dataset.py
@@ -19,12 +19,18 @@

from google.auth import credentials as auth_credentials

from google.cloud import bigquery
from google.cloud.aiplatform import base
from google.cloud.aiplatform import datasets
from google.cloud.aiplatform.datasets import _datasources
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import schema
from google.cloud.aiplatform import utils

_AUTOML_TRAINING_MIN_ROWS = 1000

_LOGGER = base.Logger(__name__)


class TabularDataset(datasets._ColumnNamesDataset):
"""Managed tabular dataset resource for Vertex AI."""
@@ -146,6 +152,97 @@ def create(
create_request_timeout=create_request_timeout,
)

@classmethod
def create_from_dataframe(
cls,
df_source: "pd.DataFrame", # noqa: F821 - skip check for undefined name 'pd'
staging_path: str,
display_name: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> "TabularDataset":
"""Creates a new tabular dataset from a Pandas DataFrame.

Args:
df_source (pd.DataFrame):
Required. Pandas DataFrame containing the source data for
ingestion as a TabularDataset.
staging_path (str):
Required. The BigQuery table to stage the data
for Vertex. Because Vertex maintains a reference to this source
to create the Vertex Dataset, this BigQuery table should
not be deleted. Example: `bq://my-project.my-dataset.my-table`.
If the provided BigQuery table doesn't exist, this method will
create the table. If the provided BigQuery table already exists,
and the schemas of the BigQuery table and your DataFrame match,
this method will append the data in your local DataFrame to the table.
display_name (str):
Optional. The user-defined name of the Dataset.
The name can be up to 128 characters long and can consist
of any UTF-8 characters.
project (str):
Optional. Project to upload this dataset to. Overrides project set in
aiplatform.init.
location (str):
Optional. Location to upload this dataset to. Overrides location set in
aiplatform.init.
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to upload this dataset. Overrides
credentials set in aiplatform.init.
"""

Review thread on `staging_path`:

Member
It seems the location requirements should also be documented, or a reference to this documentation should be provided: https://cloud.google.com/vertex-ai/docs/general/locations#bq-locations

If possible they should be validated, though not as a hard requirement. Is it possible for the dataset create to fail because of the regional requirements?

Contributor Author
Good point, I just tested it and it can fail if the dataset location doesn't match the project location or the service doesn't have the right access to the dataset. I'll update the docstring to link to that page.

In terms of validating, the BQ client throws this error: google.api_core.exceptions.FailedPrecondition: 400 BigQuery Dataset location eu must be in the same location as the service location us-central1.

Do you think we should validate as well or let the BQ client handle validation? If we do validation, we'd need to use the BQ client to check the location of the provided BQ dataset string.

Member
Agree with relying on BQ client.

if len(df_source) < _AUTOML_TRAINING_MIN_ROWS:
_LOGGER.info(
"Your DataFrame has %s rows and AutoML requires %s rows to train on tabular data. You can still train a custom model once your dataset has been uploaded to Vertex, but you will not be able to use AutoML for training."
% (len(df_source), _AUTOML_TRAINING_MIN_ROWS),
)

try:
import pyarrow # noqa: F401 - skip check for 'pyarrow' which is required when using 'google.cloud.bigquery'
except ImportError:
raise ImportError(
"Pyarrow is not installed. Please install pyarrow to use the BigQuery client."
)

bigquery_client = bigquery.Client(
project=project or initializer.global_config.project,
credentials=credentials or initializer.global_config.credentials,
)

if staging_path.startswith("bq://"):
bq_staging_path = staging_path[len("bq://") :]
else:
raise ValueError(
"Only BigQuery staging paths are supported. Provide a staging path in the format `bq://your-project.your-dataset.your-table`."
)

try:
parquet_options = bigquery.format_options.ParquetOptions()
parquet_options.enable_list_inference = True

job_config = bigquery.LoadJobConfig(
source_format=bigquery.SourceFormat.PARQUET,
parquet_options=parquet_options,
)

Review thread on the `LoadJobConfig`:

Member
Will this config infer all the types? I see the enable_list_inference, but I couldn't find a reference in the BQ docs for non-list type inference.

Contributor Author (@sararob, Apr 27, 2022)
Yes, this will infer the data types from the DF. From the BQ docs:

            The destination table to use for loading the data. If it is an
            existing table, the schema of the :class:`~pandas.DataFrame`
            must match the schema of the destination table. If the table
            does not yet exist, the schema is inferred from the
            :class:`~pandas.DataFrame`.

I added a bq_schema param to give the user the option to override the data type autodetection, but I think I may need to add more client-side validation on that.

Member
I think we can rely on BQ client validation.
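As a rough sketch of the schema-override idea discussed above (not part of this diff; the column names and types are assumptions), an explicit schema could be passed to the load job config so BigQuery skips type auto-detection for those columns:

```python
from google.cloud import bigquery

# Hypothetical explicit schema overriding BigQuery's type auto-detection
# for the listed columns when loading the DataFrame into the staging table.
explicit_schema = [
    bigquery.SchemaField("int64_col", "INTEGER"),
    bigquery.SchemaField("string_col", "STRING"),
    bigquery.SchemaField("bool_col", "BOOLEAN"),
]

parquet_options = bigquery.format_options.ParquetOptions()
parquet_options.enable_list_inference = True

job_config = bigquery.LoadJobConfig(
    schema=explicit_schema,
    source_format=bigquery.SourceFormat.PARQUET,
    parquet_options=parquet_options,
)
```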

job = bigquery_client.load_table_from_dataframe(
dataframe=df_source, destination=bq_staging_path, job_config=job_config
)

job.result()

finally:
dataset_from_dataframe = cls.create(
display_name=display_name,
bq_source=staging_path,
project=project,
location=location,
credentials=credentials,
)

return dataset_from_dataframe

def import_data(self):
raise NotImplementedError(
f"{self.__class__.__name__} class does not support 'import_data'"
133 changes: 133 additions & 0 deletions tests/unit/aiplatform/test_datasets.py
@@ -22,6 +22,7 @@
from unittest import mock
from importlib import reload
from unittest.mock import patch
import pandas as pd

from google.api_core import operation
from google.auth.exceptions import GoogleAuthError
@@ -147,6 +148,30 @@

_TEST_LABELS = {"my_key": "my_value"}

# create_from_dataframe
Review thread:

Member
Preference for an integration test as well.

Contributor Author
Added 2 integration tests: one with the bq_schema param and one without.
_TEST_INVALID_SOURCE_URI_BQ = "my-project.my-dataset.table"

_TEST_BOOL_COL = "bool_col"
_TEST_BOOL_ARR_COL = "bool_array_col"
_TEST_DOUBLE_COL = "double_col"
_TEST_DOUBLE_ARR_COL = "double_array_col"
_TEST_INT_COL = "int64_col"
_TEST_INT_ARR_COL = "int64_array_col"
_TEST_STR_COL = "string_col"
_TEST_STR_ARR_COL = "string_array_col"
_TEST_BYTES_COL = "bytes_col"
_TEST_DF_COLUMN_NAMES = [
_TEST_BOOL_COL,
_TEST_BOOL_ARR_COL,
_TEST_DOUBLE_COL,
_TEST_DOUBLE_ARR_COL,
_TEST_INT_COL,
_TEST_INT_ARR_COL,
_TEST_STR_COL,
_TEST_STR_ARR_COL,
_TEST_BYTES_COL,
]


@pytest.fixture
def get_dataset_mock():
@@ -1378,6 +1403,114 @@ def test_create_dataset_with_labels(self, create_dataset_mock, sync):
timeout=None,
)

@pytest.mark.usefixtures("get_dataset_tabular_bq_mock")
@pytest.mark.parametrize(
"source_df",
[
pd.DataFrame(
data=[
[
False,
[True, False],
1.2,
[1.2, 3.4],
1,
[1, 2],
"test",
["test1", "test2"],
b"1",
],
[
True,
[True, True],
2.2,
[2.2, 4.4],
2,
[2, 3],
"test1",
["test2", "test3"],
b"0",
],
],
columns=_TEST_DF_COLUMN_NAMES,
),
],
)
@pytest.mark.parametrize("sync", [True, False])
def test_create_dataset_tabular_from_dataframe(
self, create_dataset_mock, source_df, bq_client_mock, sync
):
aiplatform.init(project=_TEST_PROJECT)

dataset_from_df = datasets.TabularDataset.create_from_dataframe(
display_name=_TEST_DISPLAY_NAME,
df_source=source_df,
staging_path=_TEST_SOURCE_URI_BQ,
)

if not sync:
dataset_from_df.wait()

assert dataset_from_df.metadata_schema_uri == _TEST_METADATA_SCHEMA_URI_TABULAR

expected_dataset = gca_dataset.Dataset(
display_name=_TEST_DISPLAY_NAME,
metadata_schema_uri=_TEST_METADATA_SCHEMA_URI_TABULAR,
metadata=_TEST_METADATA_TABULAR_BQ,
)

create_dataset_mock.assert_called_once_with(
parent=_TEST_PARENT,
dataset=expected_dataset,
metadata=_TEST_REQUEST_METADATA,
timeout=None,
)

@pytest.mark.usefixtures("get_dataset_tabular_bq_mock")
@pytest.mark.parametrize(
"source_df",
[
pd.DataFrame(
data=[
[
False,
[True, False],
1.2,
[1.2, 3.4],
1,
[1, 2],
"test",
["test1", "test2"],
b"1",
],
[
True,
[True, True],
2.2,
[2.2, 4.4],
2,
[2, 3],
"test1",
["test2", "test3"],
b"0",
],
],
columns=_TEST_DF_COLUMN_NAMES,
),
],
)
@pytest.mark.parametrize("sync", [True, False])
def test_create_dataset_tabular_from_dataframe_with_invalid_bq_uri(
self, create_dataset_mock, source_df, bq_client_mock, sync
):
aiplatform.init(project=_TEST_PROJECT)
with pytest.raises(ValueError):
datasets.TabularDataset.create_from_dataframe(
display_name=_TEST_DISPLAY_NAME,
df_source=source_df,
staging_path=_TEST_INVALID_SOURCE_URI_BQ,
)


class TestTextDataset:
def setup_method(self):