Merged

41 commits
28c3a59
Basic PUT operation. Currently this never executes because the server
Nov 15, 2022
1b245b1
Bump Spark CLI service protocol version being used.
Nov 15, 2022
1239def
Log when attempting a staging operation
Nov 15, 2022
b605cce
Fix failing unit tests since function signature for ExecuteResponse c…
Nov 16, 2022
3ed84d8
Add e2e test for put.
Nov 16, 2022
57b8a34
Bail on tests if staging_ingestion_user is not set
Nov 16, 2022
7812278
Black client.py
Nov 16, 2022
6b76439
Add unit test that sanity checks _handle_staging_operation is called
Nov 17, 2022
3df7c89
Fix imports so that this module can be run independently:
Nov 17, 2022
8f0a02e
Implement GET operation
Nov 17, 2022
55525cb
Refactor client.py into distinct methods for each ingestion command type
Nov 23, 2022
157ac3d
Update pypoetry so I can develop on Python 3.10
Nov 23, 2022
0739ccc
Applied PR feedback around explicit response codes.
Nov 23, 2022
d3a3651
Applying PR feedback
Nov 23, 2022
72f917e
PR feedback
Nov 23, 2022
fba64b7
Black client.py
Nov 23, 2022
c27a3d6
Refactor e2e test to use a single test for PUT, GET, and REMOVE
Nov 23, 2022
19ca706
Make REMOVE command work
Nov 23, 2022
0167bd9
These methods don't need to know the `operation`
Nov 23, 2022
85e4d7c
Remove single quote that broke query
Nov 23, 2022
713002d
Remove unneeded argument
Nov 23, 2022
fc06ef8
Expect operation to succeed
Nov 23, 2022
cafa17d
Black PySQLStagingIngestionTestSuite only
Nov 23, 2022
a508a1c
Tidy up comments in e2e test
Nov 23, 2022
ce80df0
Basic e2e test scaffolded in. Currently fails.
Nov 23, 2022
36885a4
Only allow ingestion commands when base_uploads_path is specified
Nov 23, 2022
c0c09d4
Restrict local file operations to descendants of uploads_base_path
Nov 23, 2022
f612795
Remove per PR feedback
Dec 20, 2022
e609ef3
Add check for null local_file per PR feedback
Dec 20, 2022
cdbe2d6
Open output stream _after_ successful HTTP request
Dec 20, 2022
34a0362
Resolve relative paths before comparing row.localFile to uploads_base…
Dec 20, 2022
c8a64c7
Add test that PUT fails if file exists in staging location and OVERWR…
Dec 20, 2022
d48d3f3
Add tests: operations fail to modify another user's staging location
Dec 20, 2022
e0037e0
Add test that ingestion command fails if local file is blank
Dec 20, 2022
3fa5d84
Add test that invalid staging path will fail at server
Dec 20, 2022
4824b68
Basic usage example (needs tweaking)
Dec 22, 2022
469f35f
Add samples of GET and REMOVE
Dec 22, 2022
bdb948a
Refactor to allow uploads_base_path to be either a single string object
Dec 28, 2022
0261b7a
Refactor uploads_base_path to staging_allowed_local_path
Dec 29, 2022
00d8a49
Fix mypy static type failures
Dec 30, 2022
7a602e6
Black src files
Dec 30, 2022
7 changes: 7 additions & 0 deletions CONTRIBUTING.md
@@ -112,6 +112,7 @@ export access_token=""
There are several e2e test suites available:
- `PySQLCoreTestSuite`
- `PySQLLargeQueriesSuite`
- `PySQLStagingIngestionTestSuite`
- `PySQLRetryTestSuite.HTTP503Suite` **[not documented]**
- `PySQLRetryTestSuite.HTTP429Suite` **[not documented]**
- `PySQLUnityCatalogTestSuite` **[not documented]**
@@ -122,6 +123,12 @@ To execute the core test suite:
poetry run python -m pytest tests/e2e/driver_tests.py::PySQLCoreTestSuite
```

The `PySQLCoreTestSuite` namespace contains tests for all of the connector's basic features and behaviours. This is the default namespace where tests should be written unless they require specially configured clusters or take an especially long time to execute by design.

The `PySQLLargeQueriesSuite` namespace contains long-running query tests and is kept separate. In general, if the `PySQLCoreTestSuite` passes then these tests will as well.

The `PySQLStagingIngestionTestSuite` namespace requires a cluster running DBR version > 13.x which supports staging ingestion commands.
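For example, to run this suite (the email address below is a placeholder for the user associated with your access token):

```bash
export staging_ingestion_user="some.user@example.com"
poetry run python -m pytest tests/e2e/driver_tests.py::PySQLStagingIngestionTestSuite
```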

The suites marked `[not documented]` require additional configuration which will be documented at a later time.
### Code formatting

57 changes: 57 additions & 0 deletions src/databricks/sql/client.py
@@ -2,6 +2,7 @@

import pandas
import pyarrow
import requests

from databricks.sql import __version__
from databricks.sql import *
@@ -297,6 +298,58 @@ def _check_not_closed(self):
if not self.open:
raise Error("Attempting operation on closed cursor")

def _handle_staging_operation(self):
"""Make HTTP request using instructions provided by server"""

row = self.active_result_set.fetchone()
Collaborator:

I know self.active_result_set is introduced in this PR, so this is merely a generic question rather than one specific to staging.

If we are using a field member self.active_result_set for keeping state, that means we won't be able to support multi-threading in an application which concurrently uses pysql. Is this understanding correct?

Contributor Author:

I'm confused by the first part of your question:

I know self.active_result_set is introduced in this PR.

I don't believe this is correct. active_result_set has been present since the first version of this library. It's present on main right now.

if we are using a field member self.active_result_set for keeping a state that means we won't be able to support multi threading in an application which concurrently uses pysql

You're pulling on a valid thread. But I disagree with this assessment. In general pysql works fine with multi-threading. In fact, multi-threading is required if you want to cancel a running query (which is reflected in PySQLCoreTestSuite.test_cancel_during_execute).

The specific scenario where active_result_set state would affect multi-threaded applications is if multiple threads are working with the same cursor. Is that a desirable usage pattern? I think there is usually one cursor per thread, in which case there's no issue with shared state.
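To illustrate the one-cursor-per-thread pattern described here, a minimal sketch (the connection arguments are placeholders, and this snippet is not part of the PR):

```python
import threading

import databricks.sql

def run_query(connection, query):
    # Each thread opens its own cursor, so no active_result_set
    # state is ever shared between threads.
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        print(cursor.fetchall())
    finally:
        cursor.close()

connection = databricks.sql.connect(
    server_hostname="...", http_path="...", access_token="..."
)

threads = [
    threading.Thread(target=run_query, args=(connection, query))
    for query in ("SELECT 1", "SELECT 2")
]
for t in threads:
    t.start()
for t in threads:
    t.join()

connection.close()
```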


# TODO: Handle headers. What format will gateway send? json? plaintext?
operation, presigned_url, local_file, headers = (
row.operation,
row.presignedUrl,
row.localFile,
None,
)

operation_map = {
"PUT": requests.put,
"GET": requests.get,
}

if operation not in operation_map:
raise Error(
"Operation {} is not supported. Supported operations are {}".format(
operation, ",".join(operation_map.keys())
)
)

req_func = operation_map[operation]

if local_file:
raw_data = open(local_file, "rb")

Collaborator:

local_file is super interesting here; I bet customers might try different schemes: dbfs://bla, https://bla, files://bla.

Each of those schemes has a related "open" function, and the client side shall try to understand their ask and support/decline their request.

I would also like to know how many of those schemes we plan to support in the near term and how this function would grow.

Contributor Author:

Preliminary spec is to only support upload of local files. Nothing in dbfs or from an arbitrary URL. That restriction isn't implemented here because it's part of a separate ticket. The basic idea is that uploads will only be possible when a user configures an uploadsBasePath pointing to a mounted volume.

I agree that we need a way to hook this behaviour for other file origins, however. I'm going to noodle how we can make this sufficiently generic for now.

cc: @moderakh

Collaborator:

Ingestion only supports uploading from the local file system, not from anywhere else. Anything else must fail.
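A sketch of the kind of local-path restriction described in this thread, following the commit history ("Restrict local file operations to descendants of uploads_base_path", later renamed to staging_allowed_local_path). The function name and error text here are illustrative, not the merged implementation:

```python
import os

from databricks.sql import Error

def validate_staging_path(local_file: str, staging_allowed_local_path: str) -> None:
    # Resolve symlinks and relative segments before comparing, so a
    # path like /allowed/../secret cannot escape the allowed directory.
    allowed = os.path.realpath(staging_allowed_local_path)
    target = os.path.realpath(local_file)

    if os.path.commonpath([allowed, target]) != allowed:
        raise Error(
            "Local file operations are restricted to paths within the "
            "configured staging_allowed_local_path"
        )
```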

else:
raw_data = None

rq_func_args = dict(url=presigned_url, data=raw_data)

logger.debug(
"Attempting staging operation: {} - {}".format(operation, local_file)
)

# Call the function
resp = req_func(**rq_func_args)

if resp.status_code != 200:
raise Error(
"Staging operation over HTTP was unsuccessful: {}-{}".format(
resp.status_code, resp.text
)
)

if operation == "GET":
with open(local_file, "wb") as fp:
fp.write(resp.content)

def execute(
self, operation: str, parameters: Optional[Dict[str, str]] = None
) -> "Cursor":
@@ -331,6 +384,10 @@ def execute(
self.buffer_size_bytes,
self.arraysize,
)

if execute_response.is_staging_operation:
Contributor Author:

Question for reviewers: is there any specifically desired end-state for the cursor after a staging operation? Maybe we return a new NamedTuple StagingOperationResult with properties of .successful:boolean and perhaps a copy of the operation and localFile that were used?

Collaborator:

I don't quite get this question, but the cursor for now will return just one row and we should have reached the end of this cursor.

Collaborator:

@susodapop could you please explain with a code sample how this will provide a different experience to the end user?
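As a sketch of the end-user experience in question (mirroring the e2e tests below; the hostname, path, token, and file locations are placeholders):

```python
import databricks.sql

connection = databricks.sql.connect(
    server_hostname="...", http_path="...", access_token="..."
)
cursor = connection.cursor()

# Upload a local file into the user's staging location.
cursor.execute(
    "PUT '/tmp/file1.csv' INTO 'stage://tmp/some.user@example.com/file1.csv' OVERWRITE"
)

# Fetch it back. The HTTP transfer happens inside execute(); the
# single-row result set is consumed by _handle_staging_operation.
cursor.execute(
    "GET 'stage://tmp/some.user@example.com/file1.csv' TO '/tmp/file1_copy.csv'"
)

connection.close()
```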

self._handle_staging_operation()

return self

def executemany(self, operation, seq_of_parameters):
5 changes: 4 additions & 1 deletion src/databricks/sql/thrift_backend.py
@@ -452,7 +452,7 @@ def open_session(self, session_configuration, catalog, schema):
initial_namespace = None

open_session_req = ttypes.TOpenSessionReq(
client_protocol_i64=ttypes.TProtocolVersion.SPARK_CLI_SERVICE_PROTOCOL_V6,
client_protocol_i64=ttypes.TProtocolVersion.SPARK_CLI_SERVICE_PROTOCOL_V7,
client_protocol=None,
initialNamespace=initial_namespace,
canUseMultipleCatalogs=True,
@@ -733,6 +733,8 @@ def _results_message_to_execute_response(self, resp, operation_state):
.to_pybytes()
)
lz4_compressed = t_result_set_metadata_resp.lz4Compressed
# TODO: will this fail if metadata doesn't include `isStagingOperation`?
is_staging_operation = t_result_set_metadata_resp.isStagingOperation
if direct_results and direct_results.resultSet:
assert direct_results.resultSet.results.startRowOffset == 0
assert direct_results.resultSetMetadata
@@ -752,6 +754,7 @@
has_been_closed_server_side=has_been_closed_server_side,
has_more_rows=has_more_rows,
lz4_compressed=lz4_compressed,
is_staging_operation=is_staging_operation,
command_handle=resp.operationHandle,
description=description,
arrow_schema_bytes=schema_bytes,
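Regarding the TODO above: a defensive read of the new metadata field could fall back to False when a server omits it. A sketch, not the merged code:

```python
# Treat a missing or null isStagingOperation field as False so that
# responses from older servers keep working.
is_staging_operation = bool(
    getattr(t_result_set_metadata_resp, "isStagingOperation", False)
)
```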
2 changes: 1 addition & 1 deletion src/databricks/sql/utils.py
@@ -40,7 +40,7 @@ def remaining_rows(self) -> pyarrow.Table:

ExecuteResponse = namedtuple(
"ExecuteResponse",
"status has_been_closed_server_side has_more_rows description lz4_compressed "
"status has_been_closed_server_side has_more_rows description lz4_compressed is_staging_operation "
"command_handle arrow_queue arrow_schema_bytes",
)

55 changes: 55 additions & 0 deletions tests/e2e/driver_tests.py
@@ -5,6 +5,7 @@
import logging
import os
import sys
import tempfile
import threading
import time
from unittest import loader, skipIf, skipUnless, TestCase
@@ -14,6 +15,7 @@
import pyarrow
import pytz
import thrift
import pytest

import databricks.sql as sql
from databricks.sql import STRING, BINARY, NUMBER, DATETIME, DATE, DatabaseError, Error, OperationalError
@@ -630,6 +632,59 @@ def test_initial_namespace(self):
cursor.execute("select current_database()")
self.assertEqual(cursor.fetchone()[0], table_name)

class PySQLStagingIngestionTestSuite(PySQLTestCase):
"""Simple namespace for ingestion tests. These should be run against DBR >13.x

In addition to connection credentials (host, path, token) this suite requires an env var
named staging_ingestion_user"""

staging_ingestion_user = os.getenv("staging_ingestion_user")

if staging_ingestion_user is None:
raise ValueError("To run these tests you must designate a `staging_ingestion_user` environment variable. This will be the user associated with the personal access token.")

def test_staging_ingestion_put_and_get(self):

fh, temp_path = tempfile.mkstemp()

original_text = "hello world!".encode("utf-8")

with open(fh, 'wb') as fp:
fp.write(original_text)

with self.connection() as conn:
cursor = conn.cursor()
query = f"PUT '{temp_path}' INTO 'stage://tmp/{self.staging_ingestion_user}/tmp/11/15/file1.csv' OVERWRITE"
cursor.execute(query)

# TODO: What is the acceptance test for a successful staging operation?
# For now, let's GET the file back and compare it to the original

new_fh, new_temp_path = tempfile.mkstemp()

with self.connection() as conn:
cursor = conn.cursor()
query = f"GET 'stage://tmp/{self.staging_ingestion_user}/tmp/11/15/file1.csv' TO '{new_temp_path}'"
cursor.execute(query)

with open(new_fh, 'rb') as fp:
fetched_text = fp.read()

assert fetched_text == original_text

os.remove(temp_path)
os.remove(new_temp_path)

def test_staging_ingestion_delete(self):

# Test stub to be completed when we implement DELETE. We need to guarantee this file exists before we attempt to remove it.

with self.connection() as conn:
cursor = conn.cursor()
query = f"REMOVE 'stage://tmp/{self.staging_ingestion_user}/tmp/11/15/file1.csv''"
with pytest.raises(Error):
cursor.execute(query)


def main(cli_args):
global get_args_from_env
6 changes: 4 additions & 2 deletions tests/unit/test_fetches.py
@@ -41,7 +41,8 @@ def make_dummy_result_set_from_initial_results(initial_results):
lz4_compressed=Mock(),
command_handle=None,
arrow_queue=arrow_queue,
arrow_schema_bytes=schema.serialize().to_pybytes()))
arrow_schema_bytes=schema.serialize().to_pybytes(),
is_staging_operation=False))
num_cols = len(initial_results[0]) if initial_results else 0
rs.description = [(f'col{col_id}', 'integer', None, None, None, None, None)
for col_id in range(num_cols)]
@@ -75,7 +76,8 @@ def fetch_results(op_handle, max_rows, max_bytes, expected_row_start_offset, lz4
lz4_compressed=Mock(),
command_handle=None,
arrow_queue=None,
arrow_schema_bytes=None))
arrow_schema_bytes=None,
is_staging_operation=False))
return rs

def assertEqualRowValues(self, actual, expected):
21 changes: 18 additions & 3 deletions tests/unit/tests.py
@@ -12,9 +12,9 @@
from databricks.sql import InterfaceError, DatabaseError, Error, NotSupportedError
from databricks.sql.types import Row

from test_fetches import FetchTests
from test_thrift_backend import ThriftBackendTestSuite
from test_arrow_queue import ArrowQueueSuite
from tests.unit.test_fetches import FetchTests
from tests.unit.test_thrift_backend import ThriftBackendTestSuite
from tests.unit.test_arrow_queue import ArrowQueueSuite


class ClientTestSuite(unittest.TestCase):
@@ -534,6 +534,21 @@ def test_cursor_keeps_connection_alive(self, mock_client_class):
self.assertEqual(instance.close_session.call_count, 0)
cursor.close()

@patch("%s.client.ThriftBackend" % PACKAGE_NAME)
@patch("%s.client.Cursor._handle_staging_operation" % PACKAGE_NAME)
@patch("%s.utils.ExecuteResponse" % PACKAGE_NAME)
def test_staging_operation_response_is_handled(self, mock_execute_response, mock_handle_staging_operation, mock_client_class):
# If server sets ExecuteResponse.is_staging_operation True then _handle_staging_operation should be called

mock_execute_response.is_staging_operation = True

connection = databricks.sql.connect(**self.DUMMY_CONNECTION_ARGS)
cursor = connection.cursor()
cursor.execute("Text of some staging operation command;")
connection.close()

mock_handle_staging_operation.assert_called_once_with()


if __name__ == '__main__':
suite = unittest.TestLoader().loadTestsFromModule(sys.modules[__name__])