diff --git a/.github/workflows/ibis-ci.yml b/.github/workflows/ibis-ci.yml index 1bca2996b..318e67234 100644 --- a/.github/workflows/ibis-ci.yml +++ b/.github/workflows/ibis-ci.yml @@ -74,7 +74,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.AWS_REGION }} AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} - run: poetry run pytest -m "not bigquery and not snowflake and not canner and not s3_file and not gcs_file and not athena and not redshift" + run: poetry run pytest -m "not bigquery and not snowflake and not canner and not s3_file and not gcs_file and not athena and not redshift and not databricks" - name: Test bigquery if need if: contains(github.event.pull_request.labels.*.name, 'bigquery') env: diff --git a/ibis-server/app/model/__init__.py b/ibis-server/app/model/__init__.py index 224cf879c..c8f62d06b 100644 --- a/ibis-server/app/model/__init__.py +++ b/ibis-server/app/model/__init__.py @@ -67,6 +67,10 @@ class QuerySnowflakeDTO(QueryDTO): connection_info: SnowflakeConnectionInfo = connection_info_field +class QueryDatabricksDTO(QueryDTO): + connection_info: DatabricksConnectionInfo = connection_info_field + + class QueryTrinoDTO(QueryDTO): connection_info: ConnectionUrl | TrinoConnectionInfo = connection_info_field @@ -360,6 +364,24 @@ class SnowflakeConnectionInfo(BaseConnectionInfo): ) +class DatabricksConnectionInfo(BaseConnectionInfo): + server_hostname: SecretStr = Field( + alias="serverHostname", + description="the server hostname of your Databricks instance", + examples=["dbc-xxxxxxxx-xxxx.cloud.databricks.com"], + ) + http_path: SecretStr = Field( + alias="httpPath", + description="the HTTP path of your Databricks SQL warehouse", + examples=["/sql/1.0/warehouses/xxxxxxxx"], + ) + access_token: SecretStr = Field( + alias="accessToken", + description="the access token for your Databricks instance", + examples=["XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"], + ) + + class TrinoConnectionInfo(BaseConnectionInfo): host: 
SecretStr = Field( description="the hostname of your database", examples=["localhost"] @@ -485,6 +507,7 @@ class GcsFileConnectionInfo(BaseConnectionInfo): | RedshiftConnectionInfo | RedshiftIAMConnectionInfo | SnowflakeConnectionInfo + | DatabricksConnectionInfo | TrinoConnectionInfo | LocalFileConnectionInfo | S3FileConnectionInfo diff --git a/ibis-server/app/model/data_source.py b/ibis-server/app/model/data_source.py index a944b3aeb..e26c20cf9 100644 --- a/ibis-server/app/model/data_source.py +++ b/ibis-server/app/model/data_source.py @@ -20,6 +20,7 @@ ClickHouseConnectionInfo, ConnectionInfo, ConnectionUrl, + DatabricksConnectionInfo, GcsFileConnectionInfo, LocalFileConnectionInfo, MinioFileConnectionInfo, @@ -31,6 +32,7 @@ QueryBigQueryDTO, QueryCannerDTO, QueryClickHouseDTO, + QueryDatabricksDTO, QueryDTO, QueryGcsFileDTO, QueryLocalFileDTO, @@ -71,6 +73,7 @@ class DataSource(StrEnum): s3_file = auto() minio_file = auto() gcs_file = auto() + databricks = auto() def get_connection(self, info: ConnectionInfo) -> BaseBackend: try: @@ -176,6 +179,8 @@ def _build_connection_info(self, data: dict) -> ConnectionInfo: return MinioFileConnectionInfo.model_validate(data) case DataSource.gcs_file: return GcsFileConnectionInfo.model_validate(data) + case DataSource.databricks: + return DatabricksConnectionInfo.model_validate(data) case _: raise NotImplementedError(f"Unsupported data source: {self}") @@ -225,6 +230,7 @@ class DataSourceExtension(Enum): s3_file = QueryS3FileDTO minio_file = QueryMinioFileDTO gcs_file = QueryGcsFileDTO + databricks = QueryDatabricksDTO def __init__(self, dto: QueryDTO): self.dto = dto @@ -408,6 +414,14 @@ def get_trino_connection(info: TrinoConnectionInfo) -> BaseBackend: **info.kwargs if info.kwargs else dict(), ) + @staticmethod + def get_databricks_connection(info: DatabricksConnectionInfo) -> BaseBackend: + return ibis.databricks.connect( + server_hostname=info.server_hostname.get_secret_value(), + 
http_path=info.http_path.get_secret_value(), + access_token=info.access_token.get_secret_value(), + ) + @staticmethod def _create_ssl_context(info: ConnectionInfo) -> ssl.SSLContext | None: ssl_mode = ( diff --git a/ibis-server/app/model/metadata/databricks.py b/ibis-server/app/model/metadata/databricks.py new file mode 100644 index 000000000..0b8548892 --- /dev/null +++ b/ibis-server/app/model/metadata/databricks.py @@ -0,0 +1,187 @@ +from loguru import logger + +from app.model import DatabricksConnectionInfo +from app.model.data_source import DataSource +from app.model.metadata.dto import ( + Column, + Constraint, + ConstraintType, + RustWrenEngineColumnType, + Table, + TableProperties, +) +from app.model.metadata.metadata import Metadata + +# https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-datatypes +DATABRICKS_TYPE_MAPPING = { + "bigint": RustWrenEngineColumnType.BIGINT, + "binary": RustWrenEngineColumnType.BYTEA, + "boolean": RustWrenEngineColumnType.BOOL, + "date": RustWrenEngineColumnType.DATE, + "decimal": RustWrenEngineColumnType.DECIMAL, + "double": RustWrenEngineColumnType.DOUBLE, + "float": RustWrenEngineColumnType.FLOAT, + "int": RustWrenEngineColumnType.INTEGER, + "smallint": RustWrenEngineColumnType.SMALLINT, + "string": RustWrenEngineColumnType.STRING, + "timestamp": RustWrenEngineColumnType.TIMESTAMP, + "timestamp_ntz": RustWrenEngineColumnType.TIMESTAMP, + "tinyint": RustWrenEngineColumnType.TINYINT, + "variant": RustWrenEngineColumnType.VARIANT, + "object": RustWrenEngineColumnType.JSON, +} + + +class DatabricksMetadata(Metadata): + def __init__(self, connection_info: DatabricksConnectionInfo): + super().__init__(connection_info) + self.connection = DataSource.databricks.get_connection(connection_info) + + def get_table_list(self) -> list[Table]: + sql = """ + SELECT + c.TABLE_CATALOG AS TABLE_CATALOG, + c.TABLE_SCHEMA AS TABLE_SCHEMA, + c.TABLE_NAME AS TABLE_NAME, + c.COLUMN_NAME AS COLUMN_NAME, + c.DATA_TYPE AS DATA_TYPE, + 
c.IS_NULLABLE AS IS_NULLABLE, + c.COMMENT AS COLUMN_COMMENT, + t.COMMENT AS TABLE_COMMENT + FROM + INFORMATION_SCHEMA.COLUMNS c + JOIN + INFORMATION_SCHEMA.TABLES t + ON c.TABLE_SCHEMA = t.TABLE_SCHEMA + AND c.TABLE_NAME = t.TABLE_NAME + AND c.TABLE_CATALOG = t.TABLE_CATALOG + WHERE + c.TABLE_SCHEMA NOT IN ('information_schema') + """ + response = self.connection.sql(sql).to_pandas().to_dict(orient="records") + + unique_tables = {} + for row in response: + # generate unique table name + schema_table = self._format_compact_table_name( + row["TABLE_CATALOG"], row["TABLE_SCHEMA"], row["TABLE_NAME"] + ) + # init table if not exists + if schema_table not in unique_tables: + unique_tables[schema_table] = Table( + name=schema_table, + description=row["TABLE_COMMENT"], + columns=[], + properties=TableProperties( + schema=row["TABLE_SCHEMA"], + catalog=row["TABLE_CATALOG"], + table=row["TABLE_NAME"], + ), + primaryKey="", + ) + + # table exists, and add column to the table + data_type = row["DATA_TYPE"].lower() + if data_type.startswith(("array", "map", "struct")): + col_type = data_type + else: + col_type = self._transform_column_type(row["DATA_TYPE"]) + + unique_tables[schema_table].columns.append( + Column( + name=row["COLUMN_NAME"], + type=col_type, + notNull=row["IS_NULLABLE"].lower() == "no", + description=row["COLUMN_COMMENT"], + properties=None, + ) + ) + return list(unique_tables.values()) + + def get_constraints(self) -> list[Constraint]: + sql = """ + SELECT + tc.table_catalog, + tc.table_schema, + tc.table_name, + kcu.column_name, + ccu.table_catalog AS foreign_table_catalog, + ccu.table_schema AS foreign_table_schema, + ccu.table_name AS foreign_table_name, + ccu.column_name AS foreign_column_name + FROM information_schema.table_constraints AS tc + JOIN information_schema.key_column_usage AS kcu + ON tc.constraint_name = kcu.constraint_name + AND tc.constraint_schema = kcu.constraint_schema + AND tc.table_catalog = kcu.table_catalog + AND tc.table_schema = 
kcu.table_schema + AND tc.table_name = kcu.table_name + JOIN information_schema.constraint_column_usage AS ccu + ON ccu.constraint_name = tc.constraint_name + AND ccu.constraint_catalog = tc.constraint_catalog + AND ccu.constraint_schema = tc.constraint_schema + WHERE tc.constraint_type = 'FOREIGN KEY' + """ + res = self.connection.sql(sql).to_pandas().to_dict(orient="records") + constraints = [] + for row in res: + constraints.append( + Constraint( + constraintName=self._format_constraint_name( + row["table_name"], + row["column_name"], + row["foreign_table_name"], + row["foreign_column_name"], + ), + constraintTable=self._format_compact_table_name( + row["table_catalog"], row["table_schema"], row["table_name"] + ), + constraintColumn=row["column_name"], + constraintedTable=self._format_compact_table_name( + row["foreign_table_catalog"], + row["foreign_table_schema"], + row["foreign_table_name"], + ), + constraintedColumn=row["foreign_column_name"], + constraintType=ConstraintType.FOREIGN_KEY, + ) + ) + return constraints + + def get_version(self) -> str: + return ( + self.connection.sql("SELECT current_version().dbsql_version") + .to_pandas() + .iloc[0, 0] + ) + + def _format_constraint_name( + self, table_name, column_name, foreign_table_name, foreign_column_name + ): + return f"{table_name}_{column_name}_{foreign_table_name}_{foreign_column_name}" + + def _format_compact_table_name(self, catalog: str, schema: str, table: str): + return f"{catalog}.{schema}.{table}" + + def _transform_column_type(self, data_type: str) -> RustWrenEngineColumnType: + # Convert to lowercase for comparison + normalized_type = data_type.lower() + + if normalized_type.startswith("decimal"): + return RustWrenEngineColumnType.DECIMAL + + if normalized_type.startswith("geography"): + return RustWrenEngineColumnType.GEOGRAPHY + + if normalized_type.startswith("geometry"): + return RustWrenEngineColumnType.GEOMETRY + + # Use the module-level mapping table + mapped_type = 
DATABRICKS_TYPE_MAPPING.get( + normalized_type, RustWrenEngineColumnType.UNKNOWN + ) + + if mapped_type == RustWrenEngineColumnType.UNKNOWN: + logger.warning(f"Unknown Databricks data type: {data_type}") + + return mapped_type diff --git a/ibis-server/app/model/metadata/factory.py b/ibis-server/app/model/metadata/factory.py index 911d42747..cfd2f974b 100644 --- a/ibis-server/app/model/metadata/factory.py +++ b/ibis-server/app/model/metadata/factory.py @@ -3,6 +3,7 @@ from app.model.metadata.bigquery import BigQueryMetadata from app.model.metadata.canner import CannerMetadata from app.model.metadata.clickhouse import ClickHouseMetadata +from app.model.metadata.databricks import DatabricksMetadata from app.model.metadata.metadata import Metadata from app.model.metadata.mssql import MSSQLMetadata from app.model.metadata.mysql import MySQLMetadata @@ -35,6 +36,7 @@ DataSource.s3_file: S3FileMetadata, DataSource.minio_file: MinioFileMetadata, DataSource.gcs_file: GcsFileMetadata, + DataSource.databricks: DatabricksMetadata, } diff --git a/ibis-server/poetry.lock b/ibis-server/poetry.lock index 201fed48a..87d5225ba 100644 --- a/ibis-server/poetry.lock +++ b/ibis-server/poetry.lock @@ -1138,6 +1138,33 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "databricks-sql-connector" +version = "4.1.4" +description = "Databricks SQL Connector for Python" +optional = false +python-versions = "<4.0.0,>=3.8.0" +groups = ["main"] +files = [ + {file = "databricks_sql_connector-4.1.4-py3-none-any.whl", hash = "sha256:cabe1640412c240b328291d7155c280570892961ce56d0529593f354e9958727"}, + {file = "databricks_sql_connector-4.1.4.tar.gz", hash = "sha256:803428c3a7d63b0fbb3bbf3f7ab9bf96f062f6a989f7df7a92529d7b0bbc8a14"}, +] + +[package.dependencies] +lz4 = ">=4.0.2,<5.0.0" +oauthlib = ">=3.1.0,<4.0.0" +openpyxl = ">=3.0.10,<4.0.0" +pandas = {version = ">=1.2.5,<2.3.0", markers = "python_version >= 
\"3.8\" and python_version < \"3.13\""} +pyarrow = {version = ">=14.0.1", optional = true, markers = "python_version >= \"3.8\" and python_version < \"3.13\" and extra == \"pyarrow\""} +pyjwt = ">=2.0.0,<3.0.0" +python-dateutil = ">=2.8.0,<3.0.0" +requests = ">=2.18.1,<3.0.0" +thrift = ">=0.16.0,<0.21.0" +urllib3 = ">=1.26" + +[package.extras] +pyarrow = ["pyarrow (>=14.0.1) ; python_version >= \"3.8\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\""] + [[package]] name = "datafusion" version = "47.0.0" @@ -1354,6 +1381,18 @@ files = [ dnspython = ">=2.0.0" idna = ">=2.0.0" +[[package]] +name = "et-xmlfile" +version = "2.0.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, +] + [[package]] name = "executing" version = "2.2.1" @@ -3993,6 +4032,21 @@ files = [ {file = "opendal-0.46.0.tar.gz", hash = "sha256:334aa4c5b3cc0776598ef8d3c154f074f6a9d87981b951d70db1407efed3b06c"}, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "opentelemetry-api" version = "1.37.0" @@ -6988,6 +7042,25 @@ test-module-import = ["httpx"] trino = ["trino"] weaviate = ["weaviate-client (>=4.5.4,<5.0.0)"] +[[package]] +name = "thrift" +version = 
"0.20.0" +description = "Python bindings for the Apache Thrift RPC system" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "thrift-0.20.0.tar.gz", hash = "sha256:4dd662eadf6b8aebe8a41729527bd69adf6ceaa2a8681cbef64d1273b3e8feba"}, +] + +[package.dependencies] +six = ">=1.7.2" + +[package.extras] +all = ["tornado (>=4.0)", "twisted"] +tornado = ["tornado (>=4.0)"] +twisted = ["twisted"] + [[package]] name = "tinycss2" version = "1.4.0" @@ -7943,4 +8016,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.12" -content-hash = "086f8f8ecc86ae518732a6f95857f396a3336aac342d0d5bb7d4136f8ff770c2" +content-hash = "a6031c4395bd88833b09e4991d06e0c507d7a208322750ddd63903dd66484cf1" diff --git a/ibis-server/pyproject.toml b/ibis-server/pyproject.toml index 971c05461..ddb19e44a 100644 --- a/ibis-server/pyproject.toml +++ b/ibis-server/pyproject.toml @@ -20,6 +20,7 @@ ibis-framework = { git = "https://github.com/Canner/ibis.git", branch = "canner/ "postgres", "snowflake", "trino", + "databricks", ] } google-auth = "2.38.0" httpx = "0.28.1" @@ -48,6 +49,7 @@ jinja2 = ">=3.1.6" redshift_connector = "2.1.7" datafusion = "^47.0.0, <49.0.0" starlette = "^0.49.1" +databricks-sql-connector = { version = "^4.0.1", extras = ["pyarrow"] } [tool.poetry.group.jupyter] optional = true @@ -98,6 +100,7 @@ markers = [ "redshift: mark a test as a redshift test", "snowflake: mark a test as a snowflake test", "trino: mark a test as a trino test", + "databricks: mark a test as a databricks test", "local_file: mark a test as a local file test", "s3_file: mark a test as a s3 file test", "minio_file: mark a test as a minio file test", diff --git a/ibis-server/resources/function_list/databricks.csv b/ibis-server/resources/function_list/databricks.csv new file mode 100644 index 000000000..627870fa1 --- /dev/null +++ b/ibis-server/resources/function_list/databricks.csv @@ -0,0 
+1,492 @@ +function_type,name,return_type,param_names,param_types,description +scalar,add_months,date,,,"add_months(start_date, num_months) - Returns the date that is `num_months` after `start_date`." +scalar,aes_decrypt,binary,,,"aes_decrypt(expr, key[, mode[, padding[, aad]]]) - Returns a decrypted value of `expr` using AES in `mode` with `padding`. Key lengths of 16, 24 and 32 bits are supported. Supported combinations of (`mode`, `padding`) are ('ECB', 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional additional authenticated data (AAD) is only supported for GCM. If provided for encryption, the identical AAD value must be provided for decryption. The default mode is GCM." +scalar,aes_encrypt,binary,,,"aes_encrypt(expr, key[, mode[, padding[, iv[, aad]]]]) - Returns an encrypted value of `expr` using AES in given `mode` with the specified `padding`. Key lengths of 16, 24 and 32 bits are supported. Supported combinations of (`mode`, `padding`) are ('ECB', 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional initialization vectors (IVs) are only supported for CBC and GCM modes. These must be 16 bytes for CBC and 12 bytes for GCM. If not provided, a random vector will be generated and prepended to the output. Optional additional authenticated data (AAD) is only supported for GCM. If provided for encryption, the identical AAD value must be provided for decryption. The default mode is GCM." +scalar,aggregate,any,,,"aggregate(expr, start, merge, finish) - Applies a binary operator to an initial state and all elements in the array, and reduces this to a single state. The final state is converted into the final result by applying a finish function." +scalar,ai_analyze_sentiment,varchar,,,"ai_analyze_sentiment(text) - Analyzes the sentiment of the provided text, categorizing it as positive, negative, neutral or mixed." +scalar,ai_classify,varchar,,,"ai_classify(text, labels) - Classify the input text into one of the labels. 
The labels must be strings with semantic meanings." +scalar,ai_complete,varchar,,,"ai_complete(endpoint, request, modelParameters, responseFormat, skipOnError, showDetails, files) - Invokes an existing Databricks Model Serving chat endpoint and returns its response." +scalar,ai_embed,array,,,"ai_embed(endpoint, request, skipOnError, showDetails) - Invokes an existing Databricks Model Serving embedding endpoint and returns its response." +scalar,ai_extract,varchar,,,"ai_extract(text, labels) - Extracts the provided text for the given labels." +scalar,ai_extract_table_data,any,,,"ai_extract_table_data(blob, schema[, text, options]) - Returns a struct representing the structured data matching the given schema from the BinaryType blob value." +scalar,ai_extract_table_schema,any,,,"ai_extract_table_schema(blob[, text, options]) - Aggregate function to return the schema of a BinaryType column in the schema DDL format." +scalar,ai_fix_grammar,varchar,,,ai_fix_grammar(text) - Corrects grammatical errors in the provided text. +scalar,ai_forecast,any,,,N/A. +scalar,ai_gen,varchar,,,ai_gen(prompt) Generates human-like text based on the input prompt. +scalar,ai_generate_text,varchar,,,"ai_generate_text(prompt, model_name, [, param1, value1, [, param2, value2] ...]) - Calls an LLM selected by the user with the given prompt and returns the text generated by the model." +scalar,ai_mask,varchar,,,"ai_mask(text, labels) - Masks the provided text for the given labels." +scalar,ai_parse,varchar,,,"ai_parse(blob[, options]) - Returns a VARIANT value with the contextual layout metadata from the given `blob`" +scalar,ai_parse_document,varchar,,,"ai_parse_document(blob[, options]) - Returns a VARIANT value with the document parse result from the given `blob`" +scalar,ai_query,any,,,"ai_query(endpointName, request, returnType)Invokes an existing Databricks Model Serving endpoint and parses and returns its response." 
+scalar,ai_similarity,double,,,"ai_similarity(text, text) - Outputs a similarity score in the range of [0, 1] between the two input texts. 1 means two texts are identical, 0 means two texts are completely different." +scalar,ai_summarize,varchar,,,"ai_summarize(text[, max_words]) - Summarizes the provided text. An optional `max_words` parameter can be used to specify the maximum number of words in the summary. The default value of `max_words` is 50. If set to 0, there is no limit." +scalar,ai_top_drivers,any,,,ai_top_drivers() - Returns the key drivers for a key performance indicator by running a segmentation analysis model. +scalar,ai_translate,varchar,,,"ai_translate(text, to_lang) - Translates the input text to a specified target language." +aggregate,and,boolean,,,expr1 and expr2 - Logical AND. +aggregate,any,boolean,,,any(expr) - Returns true if at least one value of `expr` is true. +aggregate,any_value,same_as_input,,,Returns an arbitrary value from the group +aggregate,approx_count_distinct,bigint,,,Approximate count of distinct values +aggregate,approx_percentile,same_as_input,,,"approx_percentile(col, percentage [, accuracy]) - Returns the approximate `percentile` of the numeric or ansi interval column `col` which is the smallest value in the ordered `col` values (sorted from least to greatest) such that no more than `percentage` of `col` values is less than the value or equal to that value. The value of percentage must be between 0.0 and 1.0. The `accuracy` parameter (default: 10000) is a positive numeric literal which controls approximation accuracy at the cost of memory. Higher value of `accuracy` yields better accuracy, `1.0/accuracy` is the relative error of the approximation. When `percentage` is an array, each value of the percentage array must be between 0.0 and 1.0. In this case, returns the approximate percentile array of column `col` at the given percentage array." 
+aggregate,approx_top_k,any,,,"approx_top_k(col, k , [maxItemsTracked]) - Returns the top `k` most frequently occurring item values in a column `col` along with their approximate counts. The error in each count may be up to `2.0 * numRows / maxItemsTracked` where `numRows` is the total number of rows. `k` (default: 10) and `maxItemsTracked` (default: 10000) are both integer parameters. Higher values of `maxItemsTracked` provide better accuracy at the cost of increased memory usage. Columns that have fewer than `maxItemsTracked` distinct items will yield exact item counts. NULL values are included as their own value in the results." +scalar,array,array,,,"array(expr, ...) - Returns an array with the given elements." +scalar,array_compact,same_as_input,,,array_compact(array) - Removes null values from the array. +scalar,array_insert,same_as_input,,,"array_insert(x, pos, val) - Places val into index pos of array x. Array indices start at 1. The maximum negative index is -1 for which the function inserts new element after the current last element. Index above array size appends the array, or prepends the array if index is negative, with 'null' elements." +scalar,array_size,bigint,,,array_size(expr) - Returns the size of an array. The function returns null for null input. +scalar,arrays_zip,array,,,"arrays_zip(a1, a2, ...) - Returns a merged array of structs in which the N-th struct contains all N-th values of input arrays." +scalar,assert_true,any,,,"assert_true(expr [, message]) - Throws an exception if `expr` is not true." +scalar,base64,varchar,,,base64(bin) - Converts the argument from a binary `bin` to a base 64 string. +scalar,between,boolean,,,input [NOT] between lower AND upper - evaluate if `input` is [not] in between `lower` and `upper` +scalar,bigint,bigint,,,bigint(expr) - Casts the value `expr` to the target data type `bigint`. +scalar,bin,varchar,,,bin(expr) - Returns the string representation of the long value `expr` represented in binary. 
+scalar,binary,binary,,,binary(expr) - Casts the value `expr` to the target data type `binary`. +scalar,bit_count,bigint,,,"bit_count(expr) - Returns the number of bits that are set in the argument expr as an unsigned 64-bit integer, or NULL if the argument is NULL." +scalar,bit_get,bigint,,,"bit_get(expr, pos) - Returns the value of the bit (0 or 1) at the specified position. The positions are numbered from right to left, starting at zero. The position argument cannot be negative." +scalar,bit_reverse,same_as_input,,,bit_reverse(expr) - Returns the value obtained by reversing the order of the bits in the two's complement binary representation of the specified integral value. +scalar,bitmap_bit_position,any,,,bitmap_bit_position(child) - Returns the bit position for the given input child expression. +scalar,bitmap_bucket_number,any,,,bitmap_bucket_number(child) - Returns the bucket number for the given input child expression. +aggregate,bitmap_construct_agg,any,,,bitmap_construct_agg(child) - Returns a bitmap with the positions of the bits set from all the values from the child expression. The child expression will most likely be bitmap_bit_position(). +scalar,bitmap_count,any,,,bitmap_count(child) - Returns the number of set bits in the child bitmap. +aggregate,bitmap_or_agg,any,,,bitmap_or_agg(child) - Returns a bitmap that is the bitwise OR of all of the bitmaps from the child expression. The input should be bitmaps created from bitmap_construct_agg(). +scalar,boolean,boolean,,,boolean(expr) - Casts the value `expr` to the target data type `boolean`. +scalar,bround,same_as_input,,,"bround(expr, d) - Returns `expr` rounded to `d` decimal places using HALF_EVEN rounding mode." +scalar,case,same_as_input,,,"CASE expr1 WHEN expr2 THEN expr3 [WHEN expr4 THEN expr5]* [ELSE expr6] END - When `expr1` = `expr2`, returns `expr3`; when `expr1` = `expr4`, return `expr5`; else return `expr6`." 
+scalar,cast,any,,,cast(expr AS type) - Casts the value `expr` to the target data type `type`. `expr` :: `type` alternative casting syntax is also supported. +scalar,ceiling,same_as_input,,,"ceiling(expr[, scale]) - Returns the smallest number after rounding up that is not smaller than `expr`. An optional `scale` parameter can be specified to control the rounding behavior." +scalar,char,varchar,,,char(expr) - Returns the ASCII character having the binary equivalent to `expr`. If n is larger than 256 the result is equivalent to chr(n % 256) +scalar,charindex,bigint,,,"charindex(substr, str[, pos]) - Returns the position of the first occurrence of `substr` in `str` after position `pos`. The given `pos` and return value are 1-based." +scalar,cloud_files,any,,,"cloud_files(path: string, format: string [, options: map])" +scalar,cloud_files_state,any,,,"cloud_files_state() - A table function that returns file metadata for all sources of an autoloader stream. It accepts a streaming table identifier or a checkpoint location, and an optional argument as the target checkpoint version, and returns the file metadata for all the sources of the related autoloader stream" +scalar,collate,same_as_input,,,"collate(expr, collationName) - Marks a given expression with the specified collation." +scalar,collation,varchar,,,collation(expr) - Returns the collation name of a given expression. +scalar,collations,any,,,collations() - Get all of the Spark SQL string collations +aggregate,collect_list,array,,,Aggregates values into an array +aggregate,collect_set,array,,,Aggregates distinct values into an array +scalar,conv,varchar,,,"conv(num, from_base, to_base) - Convert `num` from `from_base` to `to_base`." +scalar,convert_timezone,timestamp,,,"convert_timezone([sourceTz, ]targetTz, sourceTs) - Converts the timestamp without time zone `sourceTs` from the `sourceTz` time zone to `targetTz`." 
+aggregate,count_if,bigint,,,Counts rows where predicate is true +aggregate,count_min_sketch,binary,,,"count_min_sketch(col, eps, confidence, seed) - Returns a count-min sketch of a column with the given esp, confidence and seed. The result is an array of bytes, which can be deserialized to a `CountMinSketch` before usage. Count-min sketch is a probabilistic data structure used for cardinality estimation using sub-linear space." +scalar,crc32,bigint,,,crc32(expr) - Returns a cyclic redundancy check value of the `expr` as a bigint. +scalar,csc,double,,,"csc(expr) - Returns the cosecant of `expr`, as if computed by `1/java.lang.Math.sin`." +scalar,curdate,date,,,curdate() - Returns the current date at the start of query evaluation. All calls of curdate within the same query return the same value. +scalar,current_catalog,varchar,,,current_catalog() - Returns the current catalog. +scalar,current_database,varchar,,,current_database() - Returns the current database. +scalar,current_metastore,varchar,,,current_metastore(group) - Returns the current Unity Catalog metastore. +scalar,current_oauth_custom_identity_claim,varchar,,,current_oauth_custom_identity_claim() - get the current identity claim from OAuth token +scalar,current_recipient,varchar,,,"current_recipient(propertyKey) - Given a recipient property key, returns the property value associated with the specified key for the current data recipient" +scalar,current_schema,varchar,,,current_schema() - Returns the current database. +scalar,current_timezone,varchar,,,current_timezone() - Returns the current session local timezone. +scalar,current_user,varchar,,,current_user() - user name of current execution context. +scalar,current_version,any,,,"current_version() - Returns the version information. The result struct contains the DBR version (null if not in DBR), the DBSQL version (null if not in DBSQL), and other internal information. Use this function instead of the version() for version related information." 
+scalar,date,date,,,date(expr) - Casts the value `expr` to the target data type `date`. +scalar,date_add,date,,,"date_add(start_date, num_days) - Returns the date that is `num_days` after `start_date`." +scalar,date_from_unix_date,date,,,date_from_unix_date(days) - Create date from the number of days since 1970-01-01. +scalar,date_sub,date,,,"date_sub(start_date, num_days) - Returns the date that is `num_days` before `start_date`." +scalar,dateadd,date,,,"dateadd(start_date, num_days) - Returns the date that is `num_days` after `start_date`." +scalar,day,bigint,,,day(date) - Returns the day of month of the date/timestamp. +scalar,dayname,varchar,,,dayname(date) - Returns the three-letter abbreviated day name from the given date. +scalar,dayofmonth,bigint,,,dayofmonth(date) - Returns the day of month of the date/timestamp. +scalar,dayofweek,bigint,,,"dayofweek(date) - Returns the day of the week for date/timestamp (1 = Sunday, 2 = Monday, ..., 7 = Saturday)." +scalar,dayofyear,bigint,,,dayofyear(date) - Returns the day of year of the date/timestamp. +scalar,decimal,decimal,,,decimal(expr) - Casts the value `expr` to the target data type `decimal`. +scalar,div,bigint,,,expr1 div expr2 - Divide `expr1` by `expr2`. It returns NULL if an operand is NULL or `expr2` is 0. The result is casted to long. +scalar,double,double,,,double(expr) - Casts the value `expr` to the target data type `double`. +scalar,e,double,,,"e() - Returns Euler's number, e." +scalar,elt,same_as_input,,,"elt(n, input1, input2, ...) - Returns the `n`-th input, e.g., returns `input2` when `n` is 2. The function returns NULL if the index exceeds the length of the array and `spark.sql.ansi.enabled` is set to false. If `spark.sql.ansi.enabled` is set to true, it throws ArrayIndexOutOfBoundsException for invalid indices." +scalar,endswith,boolean,,,"endswith(left, right) - Returns a boolean. The value is True if left ends with right. Returns NULL if either input expression is NULL. 
Otherwise, returns False. Both left or right must be of STRING or BINARY type." +scalar,equal_null,boolean,,,"equal_null(expr1, expr2) - Returns same result as the EQUAL(=) operator for non-null operands, but returns true if both are null, false if one of the them is null." +scalar,event_log,any,,,"event_log() - A table function that returns event logs of a materialized view, a streaming table, or a DLT pipeline. It accepts a table identifier or a pipeline id, and returns the content of its event logs." +aggregate,every,boolean,,,every(expr) - Returns true if all values of `expr` are true. +scalar,exists,boolean,,,"exists(expr, pred) - Tests whether a predicate holds for one or more elements in the array." +scalar,explode,any,,,"explode(expr) - Separates the elements of array `expr` into multiple rows, or the elements of map `expr` into multiple rows and columns. Unless specified otherwise, uses the default column name `col` for elements of the array or `key` and `value` for the elements of the map." +scalar,explode_outer,any,,,"explode_outer(expr) - Separates the elements of array `expr` into multiple rows, or the elements of map `expr` into multiple rows and columns. Unless specified otherwise, uses the default column name `col` for elements of the array or `key` and `value` for the elements of the map." +scalar,expm1,double,,,expm1(expr) - Returns exp(`expr`) - 1. +scalar,extract,any,,,extract(field FROM source) - Extracts a part of the date or timestamp or time or interval source. +scalar,filter,same_as_input,,,"filter(expr, func) - Filters the input array using the given predicate." +aggregate,first,same_as_input,,,"first(expr[, isIgnoreNull]) - Returns the first value of `expr` for a group of rows. If `isIgnoreNull` is true, returns only non-null values." +scalar,float,float,,,float(expr) - Casts the value `expr` to the target data type `float`. +scalar,forall,boolean,,,"forall(expr, pred) - Tests whether a predicate holds for all elements in the array." 
+scalar,format_number,varchar,,,"format_number(expr1, expr2) - Formats the number `expr1` like '#,###,###.##', rounded to `expr2` decimal places. If `expr2` is 0, the result has no decimal point or fractional part. `expr2` also accept a user specified format. This is supposed to function like MySQL's FORMAT." +scalar,format_string,varchar,,,"format_string(strfmt, obj, ...) - Returns a formatted string from printf-style format strings." +scalar,from_avro,any,,,"from_avro(child, jsonFormatSchema, options) - Converts a binary Avro value into a Catalyst value." +scalar,from_csv,any,,,"from_csv(csvStr, schema[, options]) - Returns a struct value with the given `csvStr` and `schema`." +scalar,from_json,any,,,"from_json(jsonStr, schema[, options]) - Returns a struct value with the given `jsonStr` and `schema`." +scalar,from_protobuf,any,,,"from_protobuf(data, messageName, descFilePath, options) - Converts a binary Protobuf value into a Catalyst value." +scalar,from_utc_timestamp,timestamp,,,"from_utc_timestamp(timestamp, timezone) - Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14 03:40:00.0'." +scalar,from_xml,any,,,"from_xml(xmlStr, schema[, options]) - Returns a struct value with the given `xmlStr` and `schema`." +scalar,get,same_as_input_first_array_element,,,"get(array, index) - Returns element of array at given (0-based) index. If the index points outside of the array boundaries, then this function returns NULL." +scalar,get_json_object,varchar,,,"get_json_object(json_txt, path) - Extracts a json object from `path`." +scalar,get_warmup_tracing,any,,,get_warmup_tracing() - Queries warmup config access tracing. It accepts no arguments and returns a relation comprising multiple fields as strings. +scalar,getbit,bigint,,,"getbit(expr, pos) - Returns the value of the bit (0 or 1) at the specified position. 
The positions are numbered from right to left, starting at zero. The position argument cannot be negative." +scalar,getdate,timestamp,,,getdate() - Returns the current timestamp at the start of query evaluation. All calls of current_timestamp within the same query return the same value. +scalar,grouping_id,bigint,,,"grouping_id([col1[, col2 ..]]) - returns the level of grouping, equals to `(grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)`" +scalar,h3_boundaryasgeojson,any,,,h3_boundaryasgeojson(h3cell) - Returns the boundary of an H3 cell in GeoJSON format. +scalar,h3_boundaryaswkb,any,,,h3_boundaryaswkb(h3cell) - Returns the boundary of an H3 cell in WKB format. +scalar,h3_boundaryaswkt,any,,,h3_boundaryaswkt(h3cell) - Returns the boundary of an H3 cell in WKT format. +scalar,h3_centerasgeojson,any,,,h3_centerasgeojson(h3cell) - Returns the center of an H3 cell in GeoJSON format. +scalar,h3_centeraswkb,any,,,h3_centeraswkb(h3cell) - Returns the center of an H3 cell in WKB format. +scalar,h3_centeraswkt,any,,,h3_centeraswkt(h3cell) - Returns the center of an H3 cell in WKT format. +scalar,h3_compact,any,,,h3_compact(h3cells) - Compacts the input set of H3 cell IDs as best as possible. +scalar,h3_coverash3,any,,,"h3_coverash3(geoRep, resolution) - Returns an array of cell IDs represented as long integers, corresponding to hexagons or pentagons of the specified resolution that minimally cover the input linear or areal geography." +scalar,h3_coverash3string,any,,,"h3_coverash3string(geoRep, resolution) - Returns an array of cell IDs represented as strings, corresponding to hexagons or pentagons of the specified resolution that minimally cover the input linear or areal geography." +scalar,h3_distance,any,,,"h3_distance(h3cell1, h3cell2) - Returns the grid distance between two H3 cell IDs." +scalar,h3_getpentagoncellids,any,,,h3_getpentagoncellids(resolution) - Returns the pentagonal H3 cells (as an array of BIGINTs) at the specified resolution. 
+scalar,h3_h3tostring,any,,,h3_h3tostring(h3cellid) - Converts an H3 cell ID to a string. +scalar,h3_hexring,any,,,"h3_hexring(h3cell, k) - Returns an array of H3 cell IDs that form a hollow hexagonal ring centered at the origin H3 cell and that are at grid distance k from the origin H3 cell." +scalar,h3_ischildof,any,,,"h3_ischildof(h3child, h3parent) - Returns true if the first H3 cell ID is a child of the second H3 cell ID." +scalar,h3_ispentagon,any,,,h3_ispentagon(h3cell) - Returns true if the input H3 cell ID represents a pentagon. +scalar,h3_isvalid,any,,,h3_isvalid(expr) - Returns true if the input represents a valid H3 cell ID. +scalar,h3_kring,any,,,"h3_kring(h3cell, k) - Returns the H3 cell IDs that are within (grid) distance k of the origin cell ID." +scalar,h3_kringdistances,any,,,"h3_kringdistances(h3cell, k) - Returns all H3 cell IDs (represented as long integers or strings) within grid distance k from the origin H3 cell ID, along with their distance from the origin H3 cell ID." +scalar,h3_longlatash3,any,,,"h3_longlatash3(longitude, latitude, resolution) - Returns the H3 cell ID (as a BIGINT) corresponding to the provided longitude and latitude at the specified resolution." +scalar,h3_longlatash3string,any,,,"h3_longlatash3string(longitude, latitude, resolution) - Returns the H3 cell ID (as a STRING) corresponding to the provided longitude and latitude at the specified resolution." +scalar,h3_maxchild,any,,,"h3_maxchild(h3cell, resolution) - Returns the child of maximum value of the input H3 cell at the specified resolution." +scalar,h3_minchild,any,,,"h3_minchild(h3cell, resolution) - Returns the child of minimum value of the input H3 cell at the specified resolution." +scalar,h3_pointash3,any,,,"h3_pointash3(geoRep, resolution) - Returns the H3 cell ID (as a BIGINT) corresponding to the provided point at the specified resolution." 
+scalar,h3_pointash3string,any,,,"h3_pointash3string(geoRep, resolution) - Returns the H3 cell ID (as a STRING) corresponding to the provided point at the specified resolution." +scalar,h3_polyfillash3,any,,,"h3_polyfillash3(geoRep, resolution) - Returns an array of cell IDs represented as long integers, corresponding to hexagons or pentagons of the specified resolution that are contained by the input areal geography." +scalar,h3_polyfillash3string,any,,,"h3_polyfillash3string(geoRep, resolution) - Returns an array of cell IDs represented as strings, corresponding to hexagons or pentagons of the specified resolution that are contained by the input areal geography." +scalar,h3_resolution,any,,,h3_resolution(h3cell) - Returns the resolution of the H3 cell ID. +scalar,h3_stringtoh3,any,,,h3_stringtoh3(str) - Converts the string representation H3 cell ID to its big integer representation. +scalar,h3_tessellateaswkb,any,,,"h3_tessellateaswkb(geo, resolution) - Returns an array of structs representing the chips covering geography at the specified resolution." +scalar,h3_tochildren,any,,,"h3_tochildren(h3cell, resolution) - Returns the children H3 cell IDs of the input H3 cell ID at the specified resolution." +scalar,h3_toparent,any,,,"h3_toparent(h3cell, resolution) - Returns the parent H3 cell ID of the input H3 cell ID at the specified resolution." 
+scalar,h3_try_coverash3,any,,,"h3_try_coverash3(geoRep, resolution) - Returns an array of cell IDs represented as long integers, corresponding to hexagons or pentagons of the specified resolution that minimally cover the input linear or areal geography, or NULL if the geography input is invalid" +scalar,h3_try_coverash3string,any,,,"h3_try_coverash3string(geoRep, resolution) - Returns an array of cell IDs represented as strings, corresponding to hexagons or pentagons of the specified resolution that minimally cover the input linear or areal geography, or NULL if the geography input is invalid" +scalar,h3_try_distance,any,,,"h3_try_distance(h3cell1, h3cell2) - Returns the grid distance between two H3 cell IDs of the same resolution or NULL if the grid distance is not defined." +scalar,h3_try_polyfillash3,any,,,"h3_try_polyfillash3(geoRep, resolution) - Returns an array of cell IDs represented as long integers, corresponding to hexagons or pentagons of the specified resolution that are contained by the input areal geography." +scalar,h3_try_polyfillash3string,any,,,"h3_try_polyfillash3string(geoRep, resolution) - Returns an array of cell IDs represented as strings, corresponding to hexagons or pentagons of the specified resolution that are contained by the input areal geography." +scalar,h3_try_tessellateaswkb,any,,,"h3_try_tessellateaswkb(geo, resolution) - Returns an array of structs representing the chips covering geography at the specified resolution." +scalar,h3_try_validate,any,,,h3_try_validate(h3cell) - Returns the input value if it is a valid H3 cell or NULL otherwise. +scalar,h3_uncompact,any,,,"h3_uncompact(h3cells, resolution) - Uncompacts the input set of H3 cell IDs to the specified resolution." +scalar,h3_validate,any,,,h3_validate(h3cell) - Returns the input value if it is a valid H3 cell or emits an error otherwise. +scalar,hash,any,,,"hash(expr1, expr2, ...) - Returns a hash value of the arguments." 
+scalar,hex,varchar,,,hex(expr) - Converts `expr` to hexadecimal. +aggregate,histogram_numeric,any,,,"histogram_numeric(expr, nb) - Computes a histogram on numeric 'expr' using nb bins. The return value is an array of (x,y) pairs representing the centers of the histogram's bins. As the value of 'nb' is increased, the histogram approximation gets finer-grained, but may yield artifacts around outliers. In practice, 20-40 histogram bins appear to work well, with more bins being required for skewed or smaller datasets. Note that this function creates a histogram with non-uniform bin widths. It offers no guarantees in terms of the mean-squared-error of the histogram, but in practice is comparable to the histograms produced by the R/S-Plus statistical computing packages. Note: the output type of the 'x' field in the return value is propagated from the input value consumed in the aggregate function." +scalar,hll_cardinality_internal,any,,,hll_cardinality_internal(expr) - Returns the estimated number of unique values. +aggregate,hll_collect_internal,any,,,"hll_collect_internal(expr, lgK, useHashAgg) - Returns a binary representation of an HLL sketch. `lgK` defines the size of the sketch. The default value is 9. `useHashAgg` dictates whether we use a hash-based or a sort-based aggregation, refer to the `hasFixedSizeBuffer` definition below for more info." +aggregate,hll_sketch_agg,any,,,"hll_sketch_agg(expr, lgConfigK) - Returns the HllSketch's updatable binary representation. `lgConfigK` (optional) the log-base-2 of K, with K is the number of buckets or slots for the HllSketch." +scalar,hll_sketch_estimate,any,,,hll_sketch_estimate(expr) - Returns the estimated number of unique values given the binary representation of a Datasketches HllSketch. +scalar,hll_union,any,,,"hll_union(first, second, allowDifferentLgConfigK) - Merges two binary representations of Datasketches HllSketch objects, using a Datasketches Union object. 
Set allowDifferentLgConfigK to true to allow unions of sketches with different lgConfigK values (defaults to false)." +aggregate,hll_union_agg,any,,,"hll_union_agg(expr, allowDifferentLgConfigK) - Returns the estimated number of unique values. `allowDifferentLgConfigK` (optional) Allow sketches with different lgConfigK values to be unioned (defaults to false)." +scalar,hour,bigint,,,hour(expr) - Returns the hour component of the given expression. +scalar,http_request,any,,,"http_request(conn, method, path, json, headers, params) - Uses the conn (Connection) object to make an http request" +scalar,hypot,double,,,"hypot(expr1, expr2) - Returns sqrt(`expr1`² + `expr2`²)." +scalar,if,same_as_input,,,"if(expr1, expr2, expr3) - If `expr1` evaluates to true, then returns `expr2`; otherwise returns `expr3`." +scalar,iff,same_as_input,,,"iff(expr1, expr2, expr3) - If `expr1` evaluates to true, then returns `expr2`; otherwise returns `expr3`." +scalar,ilike,boolean,,,"str ilike pattern[ ESCAPE escape] - Returns true if str matches `pattern` with `escape` case-insensitively, null if any arguments are null, false otherwise." +scalar,in,boolean,,,"expr1 in(expr2, expr3, ...) - Returns true if `expr` equals to any valN." 
+scalar,ingestion_describe_table,any,,,"ingestion_describe_table( gatewayId: string | connectionName: string, catalogName: string, schemaName: string, tableName: string, payload: string )" +scalar,ingestion_list_catalogs,any,,,ingestion_list_catalogs(gatewayId: string | connectionName: string) +scalar,ingestion_list_schemas,any,,,"ingestion_list_schemas(gatewayId: string | connectionId: string, catalogName: string)" +scalar,ingestion_list_tables,any,,,"ingestion_list_tables( gatewayId: string | connectionId: string, catalogName: string, schemaName: string, payload: string )" +scalar,ingestion_read_data,any,,,"ingestion_read_data( gatewayId: string | connectionId: string, catalogName: string, schemaName: string, tableName: string, [optionMap] )" +scalar,inline,any,,,"inline(expr) - Explodes an array of structs into a table. Uses column names col1, col2, etc. by default unless specified otherwise." +scalar,inline_outer,any,,,"inline_outer(expr) - Explodes an array of structs into a table. Uses column names col1, col2, etc. by default unless specified otherwise." +scalar,input_file_block_length,any,,,"input_file_block_length() - Returns the length of the block being read, or -1 if not available." +scalar,input_file_block_start,any,,,"input_file_block_start() - Returns the start offset of the block being read, or -1 if not available." +scalar,input_file_name,any,,,"input_file_name() - Returns the name of the file being read, or empty string if not available." +scalar,int,any,,,int(expr) - Casts the value `expr` to the target data type `int`. +scalar,is_account_group_member,boolean,,,"is_account_group_member(group) - Given a group name, returns a boolean value indicating whether the current user is a member of that group. This function enforces that group membership is checked at account level." +scalar,is_member,boolean,,,"is_member(group) - Given a group name, returns a boolean value indicating whether the current user is a member of that group." 
+scalar,is_valid_utf8,boolean,,,"is_valid_utf8(str) - Returns true if `str` is a valid UTF-8 string, otherwise returns false." +scalar,is_variant_null,boolean,,,is_variant_null(expr) - Check if a variant value is a variant null. Returns true if and only if the input is a variant null and false otherwise (including in the case of SQL NULL). +scalar,isnotnull,boolean,,,"isnotnull(expr) - Returns true if `expr` is not null, or false otherwise." +scalar,isnull,boolean,,,"isnull(expr) - Returns true if `expr` is null, or false otherwise." +scalar,java_method,any,,,"java_method(class, method[, arg1[, arg2 ..]]) - Calls a method with reflection." +scalar,json_array_length,bigint,,,json_array_length(jsonArray) - Returns the number of elements in the outermost JSON array. +scalar,json_object_keys,array,,,json_object_keys(json_object) - Returns all the keys of the outermost JSON object as an array. +scalar,json_tuple,any,,,"json_tuple(jsonStr, p1, p2, ..., pn) - Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string." +aggregate,kurtosis,double,,,kurtosis(expr) - Returns the kurtosis value calculated from values of a group. +aggregate,last,any,,,"last(expr[, isIgnoreNull]) - Returns the last value of `expr` for a group of rows. If `isIgnoreNull` is true, returns only non-null values" +scalar,last_day,date,,,last_day(date) - Returns the last day of the month which the date belongs to. +scalar,lcase,varchar,,,lcase(str) - Returns `str` with all characters changed to lowercase. +scalar,len,bigint,,,Length in characters +scalar,like,boolean,,,"str like pattern[ ESCAPE escape] - Returns true if str matches `pattern` with `escape`, null if any arguments are null, false otherwise." +scalar,list_secrets,any,,,"list_secrets() - Queries all secrets available to the current user using the Databricks secret service. 
It accepts an optional variable list of string arguments representing the scopes to look up secrets for; if no arguments are provided, all scopes are included. The function returns a relation with two string columns including the scopes and keys." +aggregate,listagg,string,,,Concatenates strings +scalar,localtimestamp,timestamp,,,localtimestamp() - Returns the current timestamp without time zone at the start of query evaluation. All calls of localtimestamp within the same query return the same value. +scalar,locate,bigint,,,"locate(substr, str[, pos]) - Returns the position of the first occurrence of `substr` in `str` after position `pos`. The given `pos` and return value are 1-based." +scalar,log1p,double,,,log1p(expr) - Returns log(1 + `expr`). +scalar,luhn_check,boolean,,,"luhn_check(str ) - Checks that a string of digits is valid according to the Luhn algorithm. This checksum function is widely applied on credit card numbers and government identification numbers to distinguish valid numbers from mistyped, incorrect numbers." +scalar,make_dt_interval,interval,,,"make_dt_interval([days[, hours[, mins[, secs]]]]) - Make DayTimeIntervalType duration from days, hours, mins and secs." +scalar,make_interval,interval,,,"make_interval([years[, months[, weeks[, days[, hours[, mins[, secs]]]]]]]) - Make interval from years, months, weeks, days, hours, mins and secs." +scalar,make_time,time,,,"make_time(hour, minute, second) - Create time from hour, minute and second fields. For invalid inputs it will throw an error." +scalar,make_timestamp,timestamp,,,"make_timestamp(year, month, day, hour, min, sec[, timezone]) - Create the current timestamp with local time zone from year, month, day, hour, min, sec and timezone fields. If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL on invalid inputs. Otherwise, it will throw an error instead." 
+scalar,make_timestamp_ltz,timestamp,,,"make_timestamp_ltz(year, month, day, hour, min, sec[, timezone]) - Create the current timestamp with local time zone from year, month, day, hour, min, sec and timezone fields. If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL on invalid inputs. Otherwise, it will throw an error instead." +scalar,make_timestamp_ntz,timestamp,,,"make_timestamp_ntz(year, month, day, hour, min, sec) - Create local date-time from year, month, day, hour, min, sec fields. If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL on invalid inputs. Otherwise, it will throw an error instead." +scalar,make_valid_utf8,varchar,,,"make_valid_utf8(str) - Returns the original string if `str` is a valid UTF-8 string, otherwise returns a new string whose invalid UTF8 byte sequences are replaced using the UNICODE replacement character U+FFFD." +scalar,make_ym_interval,interval,,,"make_ym_interval([years[, months]]) - Make year-month interval from years, months." +scalar,map_concat,any,,,"map_concat(map, ...) - Returns the union of all the given maps" +scalar,map_contains_key,boolean,,,"map_contains_key(map, key) - Returns true if the map contains the key." +scalar,map_filter,any,,,"map_filter(expr, func) - Filters entries in a map using the function." +scalar,map_from_arrays,any,,,"map_from_arrays(keys, values) - Creates a map with a pair of the given key/value arrays. All elements in keys should not be null" +scalar,map_from_entries,any,,,map_from_entries(arrayOfEntries) - Returns a map created from the given array of entries. +scalar,map_zip_with,any,,,"map_zip_with(map1, map2, function) - Merges two given maps into a single map by applying function to the pair of values with the same key. For keys only presented in one map, NULL will be passed as the value for the missing key. If an input map contains duplicated keys, only the first entry of the duplicated key is passed into the lambda function." 
+scalar,mask,varchar,,,"mask(input[, upperChar, lowerChar, digitChar, otherChar]) - masks the given string value. The function replaces characters with 'X' or 'x', and numbers with 'n'. This can be useful for creating copies of tables with sensitive information removed." +aggregate,max_by,same_as_input,,,Value of arg with maximum by comparator +scalar,measure,any,,,measure(expr) - this function is used and can only be used to calculate a measure defined in a metric view. +scalar,metric_store,any,,,"metric_store() - A table function that returns result of a metric defined in MetricStore. It takes parameters of a metric such as metric_name, start_time, end_time, dimension_groupings, etc, and returns the result of the metric." +aggregate,min_by,same_as_input,,,Value of arg with minimum by comparator +scalar,minute,bigint,,,minute(expr) - Returns the minute component of the given expression. +scalar,mod,same_as_input,,,"expr1 % expr2, or mod(expr1, expr2) - Returns the remainder after `expr1`/`expr2`." +aggregate,mode,same_as_input,,,Most frequent value +scalar,monotonically_increasing_id,bigint,,,"monotonically_increasing_id() - Returns monotonically increasing 64-bit integers. The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. The current implementation puts the partition ID in the upper 31 bits, and the lower 33 bits represent the record number within each partition. The assumption is that the data frame has less than 1 billion partitions, and each partition has less than 8 billion records. The function is non-deterministic because its result depends on partition IDs." +scalar,month,bigint,,,month(date) - Returns the month component of the date/timestamp. +scalar,monthname,varchar,,,monthname(date) - Returns the three-letter abbreviated month name from the given date. 
+scalar,months_between,double,,,"months_between(timestamp1, timestamp2[, roundOff]) - If `timestamp1` is later than `timestamp2`, then the result is positive. If `timestamp1` and `timestamp2` are on the same day of month, or both are the last day of month, time of day will be ignored. Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits unless roundOff=false." +scalar,negative,same_as_input,,,negative(expr) - Returns the negated value of `expr`. +scalar,next_day,date,,,"next_day(start_date, day_of_week) - Returns the first date which is later than `start_date` and named as indicated. The function returns NULL if at least one of the input parameters is NULL. When both of the input parameters are not NULL and day_of_week is an invalid input, the function throws SparkIllegalArgumentException if `spark.sql.ansi.enabled` is set to true, otherwise NULL." +scalar,not,boolean,,,not expr - Logical not. +scalar,nullifzero,same_as_input,,,"nullifzero(expr) - Returns null if `expr` is equal to zero, or `expr` otherwise." +scalar,or,boolean,,,expr1 or expr2 - Logical OR. +scalar,parse_json,any,,,parse_json(jsonStr) - Parse a JSON string as a Variant value. Throw an exception when the string is not valid JSON value. +scalar,parse_url,varchar,,,"parse_url(url, partToExtract[, key]) - Extracts a part from a URL." +aggregate,percentile,same_as_input,,,Exact percentile +aggregate,percentile_approx,same_as_input,,,Approximate percentile +aggregate,percentile_cont,same_as_input,,,Continuous percentile +aggregate,percentile_disc,same_as_input,,,Discrete percentile +scalar,pmod,same_as_input,,,"pmod(expr1, expr2) - Returns the positive value of `expr1` mod `expr2`." +scalar,posexplode,any,,,"posexplode(expr) - Separates the elements of array `expr` into multiple rows with positions, or the elements of map `expr` into multiple rows and columns with positions. 
Unless specified otherwise, uses the column name `pos` for position, `col` for elements of the array or `key` and `value` for elements of the map." +scalar,posexplode_outer,any,,,"posexplode_outer(expr) - Separates the elements of array `expr` into multiple rows with positions, or the elements of map `expr` into multiple rows and columns with positions. Unless specified otherwise, uses the column name `pos` for position, `col` for elements of the array or `key` and `value` for elements of the map." +scalar,positive,same_as_input,,,positive(expr) - Returns the value of `expr`. +scalar,printf,varchar,,,"printf(strfmt, obj, ...) - Returns a formatted string from printf-style format strings." +scalar,quarter,bigint,,,"quarter(date) - Returns the quarter of the year for date, in the range 1 to 4." +scalar,quote,varchar,,,quote(str) - Returns `str` enclosed by single quotes and each instance of single quote in it is preceded by a backslash. +scalar,raise_error,any,,,raise_error( expr ) - Throws a USER_RAISED_EXCEPTION with `expr` as message. +scalar,rand,double,,,"rand([seed]) - Returns a random value with independent and identically distributed (i.i.d.) uniformly distributed values in [0, 1)." +scalar,randn,double,,,randn([seed]) - Returns a random value with independent and identically distributed (i.i.d.) values drawn from the standard normal distribution. +scalar,randstr,varchar,,,"randstr(length[, seed]) - Returns a string of the specified length whose characters are chosen uniformly at random from the following pool of characters: 0-9, a-z, A-Z. The random seed is optional. The string length must be a constant two-byte or four-byte integer (SMALLINT or INT, respectively)." 
+scalar,read_files,any,,,"read_files(path: string [, optionMap])" +scalar,read_kafka,any,,,"read_kafka(bootstrapServers => String, [optionMap])" +scalar,read_kinesis,any,,,"READ_KINESIS (streamName: string [, optionMap])" +scalar,read_pubsub,any,,,"READ_PUBSUB (subscriptionId: string, projectId: string, topicId: string [, options])" +scalar,read_pulsar,any,,,"READ_PULSAR (serviceUrl: string, topic: string, [, options])" +scalar,read_state_metadata,any,,,read_state_metadata(path: string) +scalar,read_statestore,any,,,"read_statestore(path: string [, optionMap])" +scalar,reduce,any,,,"reduce(expr, start, merge, finish) - Applies a binary operator to an initial state and all elements in the array, and reduces this to a single state. The final state is converted into the final result by applying a finish function." +scalar,reflect,any,,,"reflect(class, method[, arg1[, arg2 ..]]) - Calls a method with reflection." +scalar,regexp,boolean,,,"regexp(str, regexp) - Returns true if `str` matches `regexp`, or false otherwise." +scalar,regexp_extract,varchar,,,Regex capture extraction +scalar,regexp_extract_all,array,,,"regexp_extract_all(str, regexp[, idx]) - Extract all strings in the `str` that match the `regexp` expression and corresponding to the regex group index." +scalar,regexp_substr,varchar,,,"regexp_substr(str, regexp) - Returns the substring that matches the regular expression `regexp` within the string `str`. If the regular expression is not found, the result is null." +scalar,remote_query,any,,,"remote_query(connectionName: string, [optionMap])" +scalar,rint,same_as_input,,,rint(expr) - Returns the double value that is closest in value to the argument and is equal to a mathematical integer. +scalar,rlike,boolean,,,"rlike(str, regexp) - Returns true if `str` matches `regexp`, or false otherwise." +scalar,schema_of_avro,varchar,,,"schema_of_avro(jsonFormatSchema, options) - Returns schema in the DDL format of the avro schema in JSON string format." 
+scalar,schema_of_csv,varchar,,,"schema_of_csv(csv[, options]) - Returns schema in the DDL format of CSV string." +scalar,schema_of_json,varchar,,,"schema_of_json(json[, options]) - Returns schema in the DDL format of JSON string." +aggregate,schema_of_json_agg,varchar,,,"schema_of_json_agg(json[, options]) - Aggregate function to return the schema of a column in the DDL format of JSON string." +scalar,schema_of_variant,varchar,,,schema_of_variant(v) - Returns schema in the SQL format of a variant. +aggregate,schema_of_variant_agg,varchar,,,schema_of_variant_agg(v) - Returns the merged schema in the SQL format of a variant column. +scalar,schema_of_xml,varchar,,,"schema_of_xml(xml[, options]) - Returns schema in the DDL format of XML string." +scalar,sec,double,,,"sec(expr) - Returns the secant of `expr`, as if computed by `1/java.lang.Math.cos`." +scalar,second,bigint,,,second(expr) - Returns the second component of the given expression. +scalar,secret,varchar,,,"secret(scope, key) - Extracts a secret value with the given scope and key. This uses the Databricks secret service. The user executing the query must have authority to access the provided scope. If this user is not authorized, the expression returns a 'not authorized' error, or if the provided scope or key are not found, the expression returns a 'not found' error. Otherwise, it looks up the corresponding value using the key and returns the result as a string." +scalar,sentences,array,,,"sentences(str[, lang[, country]]) - Splits `str` into an array of array of words." +scalar,sequence,array,,,"sequence(start, stop, step) - Generates an array of elements from start to stop (inclusive), incrementing by step. The type of the returned elements is the same as the type of argument expressions." +scalar,session_user,varchar,,,session_user() - user name of current execution context. 
+window,session_window,any,,,Session windowing function +scalar,sha,varchar,,,sha(expr) - Returns a sha1 hash value as a hex string of the `expr`. +scalar,sha1,varchar,,,SHA-1 hash (hex string) +scalar,sha2,varchar,,,SHA-2 hash (hex string) +scalar,shiftleft,same_as_input,,,base shiftleft exp - Bitwise left shift. +scalar,shiftright,same_as_input,,,base shiftright expr - Bitwise (signed) right shift. +scalar,shiftrightunsigned,same_as_input,,,base shiftrightunsigned expr - Bitwise unsigned right shift. +scalar,shuffle,same_as_input,,,shuffle(array) - Returns a random permutation of the given array. +scalar,sign,same_as_input,,,"sign(expr) - Returns -1.0, 0.0 or 1.0 as `expr` is negative, 0 or positive." +scalar,size,bigint,,,"size(expr) - Returns the size of an array or a map. This function returns -1 for null input only if spark.sql.ansi.enabled is false and spark.sql.legacy.sizeOfNull is true. Otherwise, it returns null for null input. With the default settings, the function returns null for null input." +aggregate,skewness,any,,,skewness(expr) - Returns the skewness value calculated from values of a group. +scalar,slice,same_as_input,,,"slice(x, start, length) - Subsets array x starting from index start (array indices start at 1, or starting from the end if start is negative) with the specified length." +scalar,smallint,smallint,,,smallint(expr) - Casts the value `expr` to the target data type `smallint`. +aggregate,some,boolean,,,some(expr) - Returns true if at least one value of `expr` is true. +scalar,sort_array,same_as_input,,,"sort_array(array[, ascendingOrder]) - Sorts the input array in ascending or descending order according to the natural ordering of the array elements. NaN is greater than any non-NaN elements for double/float type. Null elements will be placed at the beginning of the returned array in ascending order or at the end of the returned array in descending order." +scalar,soundex,varchar,,,soundex(str) - Returns Soundex code of the string. 
+scalar,space,varchar,,,space(n) - Returns a string consisting of `n` spaces. +scalar,spark_partition_id,bigint,,,spark_partition_id() - Returns the current partition id. +scalar,split,array,,,"split(str, regex, limit) - Splits `str` around occurrences that match `regex` and returns an array with a length of at most `limit`" +scalar,sql_keywords,any,,,sql_keywords() - Get Spark SQL keywords +scalar,st_addpoint,any,,,"st_addpoint(geo1, geo2[, index]) - Adds a new point to the n-th position in the input linestring GEOGRAPHY or GEOMETRY." +scalar,st_area,any,,,st_area(geo) - Returns the area of the input geometry or geography value. +scalar,st_asbinary,any,,,"st_asbinary(geo, endianness) - Returns the geospatial value (value of type GEOGRAPHY or GEOMETRY) in WKB format using the specified endianness ('NDR' for little-endian, 'XDR' for big-endian), if provided. Defaults to little-endian encoding." +scalar,st_asewkb,any,,,"st_asewkb(geo, endianness) - Returns the GEOMETRY value in EWKB format using the specified endianness ('NDR' for little-endian, 'XDR' for big-endian), if provided. Defaults to little-endian encoding." +scalar,st_asewkt,any,,,"st_asewkt(geo[, precision]) - Returns the geospatial value (value of type GEOGRAPHY or GEOMETRY) in EWKT format using the specified precision, if provided." +scalar,st_asgeojson,any,,,"st_asgeojson(geo[, precision]) - Returns the geospatial value (value of type GEOGRAPHY or GEOMETRY) in GeoJSON format using the specified precision, if provided." +scalar,st_astext,any,,,"st_astext(geo[, precision]) - Returns the geospatial value (value of type GEOGRAPHY or GEOMETRY) in WKT format using the specified precision, if provided." +scalar,st_aswkb,any,,,"st_aswkb(geo, endianness) - Returns the geospatial value (value of type GEOGRAPHY or GEOMETRY) in WKB format using the specified endianness ('NDR' for little-endian, 'XDR' for big-endian), if provided. Defaults to little-endian encoding." 
+scalar,st_aswkt,any,,,"st_aswkt(geo[, precision]) - Returns the geospatial value (value of type GEOGRAPHY or GEOMETRY) in WKT format using the specified precision, if provided." +scalar,st_buffer,any,,,"st_buffer(geo, radius) - Returns the buffer of the input geometry using the specified radius." +scalar,st_centroid,any,,,st_centroid(geo) - Returns the centroid of the input geometry as a 2D point geometry. +scalar,st_concavehull,any,,,"st_concavehull(geo, lengthRatio[, allowHoles]) - Returns the concave hull of the input geometry as a geometry using the specified length ratio." +scalar,st_contains,any,,,"st_contains(geo1, geo2) - Returns true if the first geometry contains the second geometry." +scalar,st_convexhull,any,,,st_convexhull(geo) - Returns the convex hull of the input geometry as a geometry. +scalar,st_covers,any,,,"st_covers(geo1, geo2) - Returns true if the first geometry covers the second geometry." +scalar,st_difference,any,,,"st_difference(geo1, geo2) - Returns the point-set different of the two input geometries as a 2D geometry." +scalar,st_dimension,any,,,st_dimension(geo) - Returns the topological dimension of the 2D projection of the geometry. +scalar,st_disjoint,any,,,"st_disjoint(geo1, geo2) - Returns `true` if the two geometries are disjoint." +scalar,st_distance,any,,,"st_distance(geo1, geo2) - Returns the 2D Cartesian distance between the two input geometries." +scalar,st_distancesphere,any,,,"st_distancesphere(geo1, geo2) - Returns the spherical distance (in meters) between two point geometries, measured on a sphere whose radius is the mean radius of the WGS84 ellipsoid." +scalar,st_distancespheroid,any,,,"st_distancespheroid(geo1, geo2) - Returns the geodesic distance (in meters) between two point geometries on the WGS84 ellipsoid." +scalar,st_dwithin,any,,,"st_dwithin(geo1, geo2, d) - Returns `true` iff the 2D Cartesian distance between the two input geometries is smaller than or equal to the input distance." 
+scalar,st_endpoint,any,,,"st_endpoint(geo) - Returns the last point of the input GEOGRAPHY or GEOMETRY value, if the input geospatial value is a non-empty linestring."
+scalar,st_envelope,any,,,st_envelope(geo) - Returns a 2D Cartesian geometry representing the 2D axis-aligned minimum bounding box (envelope) of the input geometry.
+aggregate,st_envelope_agg,any,,,"st_envelope_agg(geoCol) - Returns the envelope of all the geometries in the column, or NULL if the column has zero rows, or contains only NULL values."
+scalar,st_equals,any,,,"st_equals(geo1, geo2) - Returns `true` if the two geometries are geometrically equal."
+scalar,st_exteriorring,any,,,"st_exteriorring(geo) - Returns the exterior ring (shell), as a linestring, of the input GEOGRAPHY or GEOMETRY value representing a polygon."
+scalar,st_flipcoordinates,any,,,st_flipcoordinates(geo) - Swaps X and Y coordinates of the input geometry
+scalar,st_geogfromgeojson,any,,,st_geogfromgeojson(geojson) - Parses the GeoJSON description of a geography and returns the corresponding GEOGRAPHY value.
+scalar,st_geogfromtext,any,,,st_geogfromtext(wkt) - Parses the WKT description of a geography and returns the corresponding GEOGRAPHY value.
+scalar,st_geogfromwkb,any,,,st_geogfromwkb(wkb) - Parses the WKB description of a geography and returns the corresponding GEOGRAPHY value.
+scalar,st_geogfromwkt,any,,,st_geogfromwkt(wkt) - Parses the WKT description of a geography and returns the corresponding GEOGRAPHY value.
+scalar,st_geohash,any,,,"st_geohash(geo[, precision]) - Returns the geohash of the input geometry at the given precision. If no precision is specified the expression returns the geohash value that corresponds to the geohash grid bucket of maximum precision (but not more than 12) that fully covers the bounding box of the input geometry."
+scalar,st_geometryn,any,,,"st_geometryn(geo, n) - Returns the 1-based n-th element of the input geometry as a GEOMETRY value, if the input is a multipoint, a multilinestring, a multipolygon, or a geometry collection, or returns an error if the element does not exist. Returns the input as is if the input is a non-empty point, linestring, or polygon, and the value of the index is equal to 1, otherwise returns an error."
+scalar,st_geometrytype,any,,,st_geometrytype(geo) - Returns the type of the input GEOGRAPHY or GEOMETRY value as a string.
+scalar,st_geomfromewkb,any,,,st_geomfromewkb(ewkb) - Parses the EWKB description of a geometry and returns the corresponding GEOMETRY value.
+scalar,st_geomfromgeohash,any,,,st_geomfromgeohash(geohash) - Returns the geohash grid box corresponding to the input geohash value as a 2D polygon geometry.
+scalar,st_geomfromgeojson,any,,,st_geomfromgeojson(geojson) - Parses the GeoJSON description of a geometry and returns the corresponding GEOMETRY value.
+scalar,st_geomfromtext,any,,,"st_geomfromtext(wkt[, srid]) - Parses the WKT description of a geometry and returns the corresponding GEOMETRY value."
+scalar,st_geomfromwkb,any,,,"st_geomfromwkb(wkb[, srid]) - Parses the WKB description of a geometry and returns the corresponding GEOMETRY value."
+scalar,st_geomfromwkt,any,,,"st_geomfromwkt(wkt[, srid]) - Parses the WKT description of a geometry and returns the corresponding GEOMETRY value."
+scalar,st_intersection,any,,,"st_intersection(geo1, geo2) - Returns the point-set intersection of the two input geometries as a 2D geometry."
+scalar,st_intersects,any,,,"st_intersects(geo1, geo2) - Returns `true` if the two geometries intersect."
+scalar,st_isempty,any,,,st_isempty(geo) - Returns true if the input geography or geometry value does not contain any non-empty points.
+scalar,st_isvalid,any,,,st_isvalid(geo) - Returns true if the input geometry is a valid geometry in the OGC sense.
+scalar,st_length,any,,,st_length(geo) - Returns the length of the input geometry or geography value. +scalar,st_m,any,,,"st_m(geo) - Returns the M coordinate of the input point geometry, or NULL if the point is empty or does not have an M coordinate." +scalar,st_makeline,any,,,"st_makeline(geoArray) - Returns a linestring geometry whose points are the non-empty points of the geometries in the input array of geometries, which are expected to be points, linestrings, or multipoints." +scalar,st_makepolygon,any,,,"st_makepolygon(outer[, innerArray]) - Constructs a polygon from the input outer boundary and optional array of inner boundaries, represented as closed linestrings." +scalar,st_multi,any,,,st_multi(geo) - Returns the input GEOGRAPHY or GEOMETRY value as an equivalent multi geospatial value. +scalar,st_ndims,any,,,st_ndims(geo) - Returns the coordinate dimension of the input geography or geometry value. +scalar,st_npoints,any,,,st_npoints(geo) - Returns the number of non-empty points in the input geography or geometry. +scalar,st_numgeometries,any,,,st_numgeometries(geo) - Returns the number of geometries in the input geometry. +scalar,st_perimeter,any,,,st_perimeter(geo) - Returns the perimeter of the input geometry or geography value. +scalar,st_point,any,,,"st_point(x, y[, srid]) - Returns a point GEOMETRY with the given x and y coordinates and SRID value (if provided)." +scalar,st_pointfromgeohash,any,,,st_pointfromgeohash(geohash) - Returns the center of the geohash grid box corresponding to the input geohash value as a 2D point geometry. +scalar,st_pointn,any,,,"st_pointn(geo, index) - Returns the 1-based indexed n-th point of the input GEOGRAPHY or GEOMETRY value, if the index is valid and the input geospatial value is a non-empty linestring. Otherwise, returns an error. The SRID value of the output point geography or geometry is the same as that of the input value." 
+scalar,st_removepoint,any,,,"st_removepoint(geo, index) - Removes the n-th point from the input linestring GEOGRAPHY or GEOMETRY." +scalar,st_reverse,any,,,st_reverse(geo) - Reverses the order of vertices in the input GEOGRAPHY or GEOMETRY value. +scalar,st_rotate,any,,,"st_rotate(geo, rotationAngle) - Rotates the input geometry around the Z axis by the given rotation angle (in radians)." +scalar,st_scale,any,,,"st_scale(geo, xfactor, yfactor[, zfactor]) - Scales the input geometry in the X, Y, and, if specified, Z directions using the provided scaling factors." +scalar,st_setpoint,any,,,"st_setpoint(geo1, index, geo2) - Sets the n-th point in the input linestring GEOGRAPHY or GEOMETRY." +scalar,st_setsrid,any,,,"st_setsrid(geo, srid) - Returns a new GEOMETRY value whose SRID is the specified SRID value." +scalar,st_simplify,any,,,"st_simplify(geo, tolerance) - Simplifies the input geometry using the Douglas-Peucker algorithm." +scalar,st_srid,any,,,st_srid(geo) - Returns the SRID of the input GEOGRAPHY or GEOMETRY value. +scalar,st_startpoint,any,,,"st_startpoint(geo) - Returns the first point of the input GEOGRAPHY or GEOMETRY value, if the input geospatial value is a non-empty linestring." +scalar,st_touches,any,,,"st_touches(geo1, geo2) - Returns `true` if the two geometries touch each other." +scalar,st_transform,any,,,"st_transform(geo, srid) - Transforms the X and Y coordinates of the input geometry from the current coordinate reference system to the coordinate reference system described by provided SRID value." +scalar,st_translate,any,,,"st_translate(geo, xoffset, yoffset[, zoffset]) - Translates the input geometry in the X, Y, and, if specified, Z directions using the provided offsets." +scalar,st_union,any,,,"st_union(geo1, geo2) - Returns the point-set union of the two input geometries as a 2D geometry." 
+aggregate,st_union_agg,any,,,"st_union_agg(geoCol) - Returns the point-wise union of all the geometries in the column, or NULL if the column is zero rows, or contains only NULL values."
+scalar,st_within,any,,,"st_within(geo1, geo2) - Returns true if the first geometry is within the second geometry."
+scalar,st_x,any,,,"st_x(geo) - Returns the X coordinate of the input point geometry, or NULL if the point is empty."
+scalar,st_xmax,any,,,"st_xmax(geo) - Returns the maximum X coordinate of the input geometry, or NULL if the geometry is empty."
+scalar,st_xmin,any,,,"st_xmin(geo) - Returns the minimum X coordinate of the input geometry, or NULL if the geometry is empty."
+scalar,st_y,any,,,"st_y(geo) - Returns the Y coordinate of the input point geometry, or NULL if the point is empty."
+scalar,st_ymax,any,,,"st_ymax(geo) - Returns the maximum Y coordinate of the input geometry, or NULL if the geometry is empty."
+scalar,st_ymin,any,,,"st_ymin(geo) - Returns the minimum Y coordinate of the input geometry, or NULL if the geometry is empty."
+scalar,st_z,any,,,"st_z(geo) - Returns the Z coordinate of the input point geometry, or NULL if the point is empty or does not have a Z coordinate."
+scalar,st_zmax,any,,,"st_zmax(geo) - Returns the maximum Z coordinate of the input geometry, or NULL if the geometry is empty or does not have Z coordinates."
+scalar,st_zmin,any,,,"st_zmin(geo) - Returns the minimum Z coordinate of the input geometry, or NULL if the geometry is empty or does not have Z coordinates."
+scalar,stack,any,,,"stack(n, expr1, ..., exprk) - Separates `expr1`, ..., `exprk` into `n` rows. Uses column names col0, col1, etc. by default unless specified otherwise."
+scalar,startswith,any,,,"startswith(left, right) - Returns a boolean. The value is True if left starts with right. Returns NULL if either input expression is NULL. Otherwise, returns False. Both left or right must be of STRING or BINARY type."
+aggregate,std,double,,,Standard deviation +scalar,str_to_map,any,,,"str_to_map(text[, pairDelim[, keyValueDelim]]) - Creates a map after splitting the text into key/value pairs using delimiters. Default delimiters are ',' for `pairDelim` and ':' for `keyValueDelim`. Both `pairDelim` and `keyValueDelim` are treated as regular expressions." +scalar,string,any,,,string(expr) - Casts the value `expr` to the target data type `string`. +scalar,stringdecode,any,,,"stringdecode(bin, charset) - Decodes the first argument using the second argument character set. If either argument is null, the result will also be null." +scalar,table_changes,any,,,N/A. +scalar,table_changes_by_path,any,,,N/A. +scalar,time,time,,,time(expr) - Casts the value `expr` to the target data type `time`. +scalar,time_diff,bigint,,,"time_diff(unit, start, end) - Gets the difference between the times in the specified units." +scalar,time_trunc,time,,,"time_trunc(unit, time) - Returns `time` truncated to the `unit`." +scalar,timestamp,timestamp,,,timestamp(expr) - Casts the value `expr` to the target data type `timestamp`. +scalar,timestamp_micros,any,,,timestamp_micros(microseconds) - Creates timestamp from the number of microseconds since UTC epoch. +scalar,timestamp_millis,any,,,timestamp_millis(milliseconds) - Creates timestamp from the number of milliseconds since UTC epoch. +scalar,timestamp_seconds,any,,,timestamp_seconds(seconds) - Creates timestamp from the number of seconds (can be fractional) since UTC epoch. +scalar,tinyint,any,,,tinyint(expr) - Casts the value `expr` to the target data type `tinyint`. +scalar,to_avro,any,,,"to_avro(child[, jsonFormatSchema]) - Converts a Catalyst binary input value into its corresponding Avro format result." +scalar,to_binary,any,,,"to_binary(str[, fmt]) - Converts the input `str` to a binary value based on the supplied `fmt`. `fmt` can be a case-insensitive string literal of ""hex"", ""utf-8"", ""utf8"", or ""base64"". 
By default, the binary format for conversion is ""hex"" if `fmt` is omitted. The function returns NULL if at least one of the input parameters is NULL." +scalar,to_csv,any,,,"to_csv(expr[, options]) - Returns a CSV string with a given struct value" +scalar,to_geography,any,,,to_geography(geoFormat) - Parses the input BINARY or STRING value and returns the corresponding GEOGRAPHY value. +scalar,to_geometry,any,,,to_geometry(geoFormat) - Parses the input BINARY or STRING value and returns the corresponding GEOMETRY value. +scalar,to_json,any,,,"to_json(expr[, options]) - Returns a JSON string with a given struct value" +scalar,to_number,any,,,"to_number(expr, fmt) - Convert string 'expr' to a number based on the string format 'fmt'. Throws an exception if the conversion fails. The format can consist of the following characters, case insensitive: '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the format string matches a sequence of digits in the input string. If the 0/9 sequence starts with 0 and is before the decimal point, it can only match a digit sequence of the same size. Otherwise, if the sequence starts with 9 or is after the decimal point, it can match a digit sequence that has the same or smaller size. '.' or 'D': Specifies the position of the decimal point (optional, only allowed once). ',' or 'G': Specifies the position of the grouping (thousands) separator (,). There must be a 0 or 9 to the left and right of each grouping separator. 'expr' must match the grouping separator relevant for the size of the number. '$': Specifies the location of the $ currency sign. This character may only be specified once. 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at the beginning or end of the format string). Note that 'S' allows '-' but 'MI' does not. 'PR': Only allowed at the end of the format string; specifies that 'expr' indicates a negative number with wrapping angled brackets. ('<1>')." 
+scalar,to_protobuf,any,,,"to_protobuf(child, messageName, descFilePath, options) - Converts a Catalyst binary input value into its corresponding Protobuf format result." +scalar,to_time,any,,,"to_time(str[, format]) - Parses the `str` expression with the `format` expression to a time. If `format` is malformed or its application does not result in a well formed time, the function raises an error. By default, it follows casting rules to a time if the `format` is omitted." +scalar,to_timestamp_ltz,any,,,"to_timestamp_ltz(timestamp_str[, fmt]) - Parses the `timestamp_str` expression with the `fmt` expression to a timestamp with local time zone. Returns null with invalid input. By default, it follows casting rules to a timestamp if the `fmt` is omitted." +scalar,to_timestamp_ntz,any,,,"to_timestamp_ntz(timestamp_str[, fmt]) - Parses the `timestamp_str` expression with the `fmt` expression to a timestamp without time zone. Returns null with invalid input. By default, it follows casting rules to a timestamp if the `fmt` is omitted." +scalar,to_unix_timestamp,any,,,"to_unix_timestamp(timeExp[, fmt]) - Returns the UNIX timestamp of the given time." +scalar,to_utc_timestamp,any,,,"to_utc_timestamp(timestamp, timezone) - Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield '2017-07-14 01:40:00.0'." +scalar,to_varchar,any,,,"to_varchar(expr, format) - Convert `expr` to a string based on the `format`. Throws an exception if the conversion fails. The format can consist of the following characters, case insensitive: '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the format string matches a sequence of digits in the input value, generating a result string of the same length as the corresponding sequence in the format string. 
The result string is left-padded with zeros if the 0/9 sequence comprises more digits than the matching part of the decimal value, starts with 0, and is before the decimal point. Otherwise, it is padded with spaces. '.' or 'D': Specifies the position of the decimal point (optional, only allowed once). ',' or 'G': Specifies the position of the grouping (thousands) separator (,). There must be a 0 or 9 to the left and right of each grouping separator. '$': Specifies the location of the $ currency sign. This character may only be specified once. 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at the beginning or end of the format string). Note that 'S' prints '+' for positive values but 'MI' prints a space. 'PR': Only allowed at the end of the format string; specifies that the result string will be wrapped by angle brackets if the input value is negative. ('<1>'). If `expr` is a datetime, `format` shall be a valid datetime pattern, see Datetime Patterns. If `expr` is a binary, it is converted to a string in one of the formats: 'base64': a base 64 string. 'hex': a string in the hexadecimal format. 'utf-8': the input binary is decoded to UTF-8 string." +scalar,to_variant_object,any,,,to_variant_object(expr) - Convert a nested input (array/map/struct) into a variant where maps and structs are converted to variant objects which are unordered unlike SQL structs. Input maps can only have string keys. +scalar,to_xml,any,,,"to_xml(expr[, options]) - Returns a XML string with a given struct value" +scalar,transform,any,,,"transform(expr, func) - Transforms elements in an array using the function." +scalar,transform_keys,any,,,"transform_keys(expr, func) - Transforms elements in a map using the function." +scalar,transform_values,any,,,"transform_values(expr, func) - Transforms values in the map using the function." +scalar,try_add,any,,,"try_add(expr1, expr2) - Returns the sum of `expr1`and `expr2` and the result is null on overflow. 
The acceptable input types are the same with the `+` operator." +scalar,try_aes_decrypt,any,,,"try_aes_decrypt(expr, key[, mode[, padding[, aad]]]) - This is a special version of `aes_decrypt` that performs the same operation, but returns a NULL value instead of raising an error if the decryption cannot be performed." +aggregate,try_avg,double,,,Average of values (NULL on failure) +scalar,try_divide,any,,,"try_divide(dividend, divisor) - Returns `dividend`/`divisor`. It always performs floating point division. Its result is always null if `expr2` is 0. `dividend` must be a numeric or an interval. `divisor` must be a numeric." +scalar,try_element_at,any,,,"try_element_at(array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will throw an error. If index < 0, accesses elements from the last to the first. The function always returns NULL if the index exceeds the length of the array." +scalar,try_make_interval,any,,,"try_make_interval([years[, months[, weeks[, days[, hours[, mins[, secs]]]]]]]) - This is a special version of `make_interval` that performs the same operation, but returns NULL when an overflow occurs." +scalar,try_make_timestamp,any,,,"try_make_timestamp(year, month, day, hour, min, sec[, timezone]) - Try to create a timestamp from year, month, day, hour, min, sec and timezone fields. The result data type is consistent with the value of configuration `spark.sql.timestampType`. The function returns NULL on invalid inputs." +scalar,try_make_timestamp_ltz,any,,,"try_make_timestamp_ltz(year, month, day, hour, min, sec[, timezone]) - Try to create the current timestamp with local time zone from year, month, day, hour, min, sec and timezone fields. The function returns NULL on invalid inputs." +scalar,try_make_timestamp_ntz,any,,,"try_make_timestamp_ntz(year, month, day, hour, min, sec) - Try to create local date-time from year, month, day, hour, min, sec fields. The function returns NULL on invalid inputs." 
+scalar,try_mod,any,,,"try_mod(dividend, divisor) - Returns the remainder after `expr1`/`expr2`. `dividend` must be a numeric. `divisor` must be a numeric." +scalar,try_multiply,any,,,"try_multiply(expr1, expr2) - Returns `expr1`*`expr2` and the result is null on overflow. The acceptable input types are the same with the `*` operator." +scalar,try_parse_json,any,,,try_parse_json(jsonStr) - Parse a JSON string as a Variant value. Return NULL when the string is not valid JSON value. +scalar,try_parse_url,any,,,"try_parse_url(url, partToExtract[, key]) - This is a special version of `parse_url` that performs the same operation, but returns a NULL value instead of raising an error if the parsing cannot be performed." +scalar,try_reflect,any,,,"try_reflect(class, method[, arg1[, arg2 ..]]) - This is a special version of `reflect` that performs the same operation, but returns a NULL value instead of raising an error if the invoke method thrown exception." +scalar,try_remainder,any,,,"try_remainder(dividend, divisor) - Returns the remainder after `expr1`/`expr2`. `dividend` must be a numeric. `divisor` must be a numeric." +scalar,try_secret,any,,,"try_secret(scope, key) - Extracts a secret value with the given scope and key. This uses the Databricks secret service. The user executing the query must have authority to access the provided scope. If this user is not authorized, or if the provided scope or key are not found, the expression returns NULL. Otherwise, looks up the corresponding value using the key and returns the result as a string." +scalar,try_subtract,any,,,"try_subtract(expr1, expr2) - Returns `expr1`-`expr2` and the result is null on overflow. The acceptable input types are the same with the `-` operator." 
+aggregate,try_sum,same_as_input,,,Sum of values (NULL on failure) +scalar,try_to_binary,any,,,"try_to_binary(str[, fmt]) - This is a special version of `to_binary` that performs the same operation, but returns a NULL value instead of raising an error if the conversion cannot be performed." +scalar,try_to_date,any,,,"try_to_date(date_str[, fmt]) - Parses the `date_str` expression with the `fmt` expression to a date. The function always returns null on an invalid input with/without ANSI SQL mode enabled. By default, it follows casting rules to a date if the `fmt` is omitted." +scalar,try_to_geography,any,,,try_to_geography(geoFormat) - Parses the input BINARY or STRING value and returns the corresponding GEOGRAPHY value. +scalar,try_to_geometry,any,,,try_to_geometry(geoFormat) - Parses the input BINARY or STRING value and returns the corresponding GEOMETRY value. +scalar,try_to_number,any,,,"try_to_number(expr, fmt) - Convert string 'expr' to a number based on the string format `fmt`. Returns NULL if the string 'expr' does not match the expected format. The format follows the same semantics as the to_number function." +scalar,try_to_time,any,,,"try_to_time(str[, format]) - Parses the `str` expression with the `format` expression to a time. If `format` is malformed or its application does not result in a well formed time, the function returns NULL. By default, it follows casting rules to a time if the `format` is omitted." +scalar,try_to_timestamp,any,,,"try_to_timestamp(timestamp_str[, fmt]) - Parses the `timestamp_str` expression with the `fmt` expression to a timestamp. The function always returns null on an invalid input with/without ANSI SQL mode enabled. By default, it follows casting rules to a timestamp if the `fmt` is omitted. The result data type is consistent with the value of configuration `spark.sql.timestampType`." 
+scalar,try_url_decode,any,,,"try_url_decode(str) - This is a special version of `url_decode` that performs the same operation, but returns a NULL value instead of raising an error if the decoding cannot be performed." +scalar,try_validate_utf8,any,,,"try_validate_utf8(str) - Returns the original string if `str` is a valid UTF-8 string, otherwise returns NULL." +scalar,try_variant_get,any,,,"try_variant_get(v, path[, type]) - Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to `type`. When `type` is omitted, it is default to `variant`. Returns null if the path does not exist or the cast fails." +scalar,try_zstd_decompress,any,,,"try_zstd_decompress(expr) - Returns the decompressed value of `expr` using Zstandard. Supports data compressed in both single-pass mode and streaming mode. On decompression failure, it returns NULL." +scalar,typeof,any,,,typeof(expr) - Return DDL-formatted type string for the data type of the input. +scalar,ucase,varchar,,,ucase(str) - Returns `str` with all characters changed to uppercase. +scalar,unbase64,any,,,unbase64(str) - Converts the argument from a base 64 string `str` to a binary. +scalar,unhex,any,,,unhex(expr) - Converts hexadecimal `expr` to binary. +scalar,uniform,any,,,"uniform(min, max[, seed]) - Returns a random value with independent and identically distributed (i.i.d.) values with the specified range of numbers. The random seed is optional. The provided numbers specifying the minimum and maximum values of the range must be constant. If both of these numbers are integers, then the result will also be an integer. Otherwise if one or both of these are floating-point numbers, then the result will also be a floating-point number." +scalar,unix_date,any,,,unix_date(date) - Returns the number of days since 1970-01-01. +scalar,unix_micros,any,,,unix_micros(timestamp) - Returns the number of microseconds since 1970-01-01 00:00:00 UTC. 
+scalar,unix_millis,any,,,unix_millis(timestamp) - Returns the number of milliseconds since 1970-01-01 00:00:00 UTC. Truncates higher levels of precision. +scalar,unix_seconds,any,,,unix_seconds(timestamp) - Returns the number of seconds since 1970-01-01 00:00:00 UTC. Truncates higher levels of precision. +scalar,unix_timestamp,any,,,"unix_timestamp([timeExp[, fmt]]) - Returns the UNIX timestamp of current or specified time." +scalar,url_decode,any,,,url_decode(str) - Decodes a `str` in 'application/x-www-form-urlencoded' format using a specific encoding scheme. +scalar,url_encode,any,,,url_encode(str) - Translates a string into 'application/x-www-form-urlencoded' format using a specific encoding scheme. +scalar,user,any,,,user() - user name of current execution context. +scalar,validate_utf8,any,,,"validate_utf8(str) - Returns the original string if `str` is a valid UTF-8 string, otherwise throws an exception." +aggregate,variance,double,,,variance(expr) - Returns the sample variance calculated from values of a group. +scalar,variant_explode,any,,,"variant_explode(expr) - It separates a variant object/array into multiple rows containing its fields/elements. Its result schema is `struct`. `pos` is the position of the field/element in its parent object/array, and `value` is the field/element value. `key` is the field name when exploding a variant object, or is NULL when exploding a variant array. It ignores any input that is not a variant array/object, including SQL NULL, variant null, and any other variant values." +scalar,variant_explode_outer,any,,,"variant_explode_outer(expr) - It separates a variant object/array into multiple rows containing its fields/elements. Its result schema is `struct`. `pos` is the position of the field/element in its parent object/array, and `value` is the field/element value. `key` is the field name when exploding a variant object, or is NULL when exploding a variant array. 
It ignores any input that is not a variant array/object, including SQL NULL, variant null, and any other variant values." +scalar,variant_get,any,,,"variant_get(v, path[, type]) - Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to `type`. When `type` is omitted, it is default to `variant`. Returns null if the path does not exist. Throws an exception if the cast fails." +scalar,vector_search,any,,,"vector_search() - Performs a KNN vector search against the specified vector index. Required parameters are `index`, which is the fully-qualified vector index name, and either `query_text` for the query text or `query_vector` for the query embedding. Optionally, `num_results` specifies the maximum number of output rows, which defaults to 10. Currently, only the DELTA_SYNC index type is supported. Also optionally, `query_type` specifies the type of query to perform, It supports two types of queries: `ANN` and `HYBRID`. It defaults to `ANN`. As a legacy, the `query` parameter name (equivalent to `query_text`) remains supported for backward compatibility." +scalar,weekday,any,,,"weekday(date) - Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday)." +scalar,weekofyear,any,,,weekofyear(date) - Returns the week of the year of the given date. A week is considered to start on a Monday and week 1 is the first week with >3 days. +scalar,when,any,,,"CASE WHEN expr1 THEN expr2 [WHEN expr3 THEN expr4]* [ELSE expr5] END - When `expr1` = true, returns `expr2`; else when `expr3` = true, returns `expr4`; else returns `expr5`." 
+scalar,width_bucket,any,,,"width_bucket(value, min_value, max_value, num_bucket) - Returns the bucket number to which `value` would be assigned in an equiwidth histogram with `num_bucket` buckets, in the range `min_value` to `max_value`."
+window,window,any,,,"window(time_column, window_duration[, slide_duration[, start_time]]) - Bucketize rows into one or more time windows given a timestamp specifying column. Window starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in the order of months are not supported. See 'Window Operations on Event Time' in Structured Streaming guide doc for detailed explanation and examples."
+window,window_time,timestamp,,,Current window time
+scalar,xpath,any,,,"xpath(xml, xpath) - Returns a string array of values within the nodes of xml that match the XPath expression."
+scalar,xpath_boolean,any,,,"xpath_boolean(xml, xpath) - Returns true if the XPath expression evaluates to true, or if a matching node is found."
+scalar,xpath_double,any,,,"xpath_double(xml, xpath) - Returns a double value, the value zero if no match is found, or NaN if a match is found but the value is non-numeric."
+scalar,xpath_float,any,,,"xpath_float(xml, xpath) - Returns a float value, the value zero if no match is found, or NaN if a match is found but the value is non-numeric."
+scalar,xpath_int,any,,,"xpath_int(xml, xpath) - Returns an integer value, or the value zero if no match is found, or a match is found but the value is non-numeric."
+scalar,xpath_long,any,,,"xpath_long(xml, xpath) - Returns a long integer value, or the value zero if no match is found, or a match is found but the value is non-numeric."
+scalar,xpath_number,any,,,"xpath_number(xml, xpath) - Returns a double value, the value zero if no match is found, or NaN if a match is found but the value is non-numeric."
+scalar,xpath_short,any,,,"xpath_short(xml, xpath) - Returns a short integer value, or the value zero if no match is found, or a match is found but the value is non-numeric." +scalar,xpath_string,any,,,"xpath_string(xml, xpath) - Returns the text contents of the first xml node that matches the XPath expression." +scalar,xxhash64,bigint,,,"xxhash64(expr1, expr2, ...) - Returns a 64-bit hash value of the arguments. Hash seed is 42." +scalar,year,bigint,,,year(date) - Returns the year component of the date/timestamp. +scalar,zeroifnull,any,,,"zeroifnull(expr) - Returns zero if `expr` is equal to null, or `expr` otherwise." +scalar,zip_with,any,,,"zip_with(left, right, func) - Merges the two given arrays, element-wise, into a single array using function. If one array is shorter, nulls are appended at the end to match the length of the longer array, before applying function." +scalar,zstd_compress,any,,,"zstd_compress(expr[, level[, streaming_mode]]) - Returns a compressed value of `expr` using Zstandard with the specified compression `level`. The default level is 3. Uses single-pass mode by default." +scalar,zstd_decompress,any,,,"zstd_decompress(expr) - Returns the decompressed value of `expr` using Zstandard. Supports data compressed in both single-pass mode and streaming mode. On decompression failure, it throws an exception." 
diff --git a/ibis-server/tests/routers/v3/connector/databricks/__init__.py b/ibis-server/tests/routers/v3/connector/databricks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ibis-server/tests/routers/v3/connector/databricks/conftest.py b/ibis-server/tests/routers/v3/connector/databricks/conftest.py new file mode 100644 index 000000000..8b659e96a --- /dev/null +++ b/ibis-server/tests/routers/v3/connector/databricks/conftest.py @@ -0,0 +1,60 @@ +import os +import pathlib + +import pytest +from databricks import sql + +pytestmark = pytest.mark.databricks + +base_url = "/v3/connector/databricks" + + +def pytest_collection_modifyitems(items): + current_file_dir = pathlib.Path(__file__).resolve().parent + for item in items: + if pathlib.Path(item.fspath).is_relative_to(current_file_dir): + item.add_marker(pytestmark) + + +@pytest.fixture(scope="module", autouse=True) +def init_databricks(connection_info): + # create schema `wren` and some tables for testing + try: + connection = sql.connect( + server_hostname=connection_info["serverHostname"], + http_path=connection_info["httpPath"], + access_token=connection_info["accessToken"], + ) + with connection.cursor() as cursor: + cursor.execute("CREATE SCHEMA IF NOT EXISTS wren;") + cursor.execute( + """ + CREATE OR REPLACE TABLE wren.t1 ( + id INT PRIMARY KEY COMMENT 'This is a primary key', + value STRING + ) + COMMENT 'This is a table comment' + ; + """ + ) + cursor.execute( + """ + CREATE OR REPLACE TABLE wren.t2 ( + id INT, + t1_id INT, + value STRING, + CONSTRAINT fk_t1 FOREIGN KEY (t1_id) REFERENCES wren.t1(id) + ); + """ + ) + finally: + connection.close() + + +@pytest.fixture(scope="module") +def connection_info() -> dict[str, str]: + return { + "serverHostname": os.getenv("DATABRICKS_SERVER_HOSTNAME"), + "httpPath": os.getenv("DATABRICKS_HTTP_PATH"), + "accessToken": os.getenv("DATABRICKS_TOKEN"), + } diff --git a/ibis-server/tests/routers/v3/connector/databricks/test_function.py 
b/ibis-server/tests/routers/v3/connector/databricks/test_function.py new file mode 100644 index 000000000..cac8bea3d --- /dev/null +++ b/ibis-server/tests/routers/v3/connector/databricks/test_function.py @@ -0,0 +1,108 @@ +import base64 + +import orjson +import pytest + +from app.config import get_config +from tests.conftest import DATAFUSION_FUNCTION_COUNT, file_path +from tests.routers.v3.connector.databricks.conftest import base_url + +manifest = { + "catalog": "my_catalog", + "schema": "my_schema", + "models": [ + { + "name": "orders", + "tableReference": { + "catalog": "samples", + "schema": "tpch", + "table": "orders", + }, + "columns": [ + {"name": "o_orderkey", "type": "integer"}, + ], + }, + ], +} + +function_list_path = file_path("../resources/function_list") + + +@pytest.fixture(scope="module") +def manifest_str(): + return base64.b64encode(orjson.dumps(manifest)).decode("utf-8") + + +@pytest.fixture(autouse=True) +def set_remote_function_list_path(): + config = get_config() + config.set_remote_function_list_path(function_list_path) + yield + config.set_remote_function_list_path(None) + + +async def test_function_list(client): + config = get_config() + + config.set_remote_function_list_path(None) + response = await client.get(url=f"{base_url}/functions") + assert response.status_code == 200 + result = response.json() + assert len(result) == DATAFUSION_FUNCTION_COUNT + + config.set_remote_function_list_path(function_list_path) + response = await client.get(url=f"{base_url}/functions") + assert response.status_code == 200 + result = response.json() + assert len(result) == DATAFUSION_FUNCTION_COUNT + 491 + the_func = next(filter(lambda x: x["name"] == "bit_reverse", result)) + assert the_func == { + "name": "bit_reverse", + "description": "bit_reverse(expr) - Returns the value obtained by reversing the order of the bits in the two's complement binary representation of the specified integral value.", + "function_type": "scalar", + "param_names": None, + 
"param_types": None, + "return_type": None, + } + + config.set_remote_function_list_path(None) + response = await client.get(url=f"{base_url}/functions") + assert response.status_code == 200 + result = response.json() + assert len(result) == DATAFUSION_FUNCTION_COUNT + + +async def test_scalar_function(client, manifest_str: str, connection_info): + response = await client.post( + url=f"{base_url}/query", + json={ + "connectionInfo": connection_info, + "manifestStr": manifest_str, + "sql": "SELECT ABS(-1) AS col", + }, + ) + assert response.status_code == 200 + result = response.json() + assert result == { + "columns": ["col"], + "data": [[1]], + "dtypes": {"col": "int32"}, + } + + +async def test_aggregate_function(client, manifest_str: str, connection_info): + response = await client.post( + url=f"{base_url}/query", + json={ + "connectionInfo": connection_info, + "manifestStr": manifest_str, + "sql": "SELECT COUNT(*) AS col FROM (SELECT 1) AS temp_table", + }, + ) + assert response.status_code == 200 + result = response.json() + assert result == { + "columns": ["col"], + "data": [[1]], + "dtypes": {"col": "int64"}, + } diff --git a/ibis-server/tests/routers/v3/connector/databricks/test_metadata.py b/ibis-server/tests/routers/v3/connector/databricks/test_metadata.py new file mode 100644 index 000000000..ca2ef27d7 --- /dev/null +++ b/ibis-server/tests/routers/v3/connector/databricks/test_metadata.py @@ -0,0 +1,57 @@ +v2_base_url = "/v2/connector/databricks" + + +async def test_metadata_list_tables(client, connection_info): + response = await client.post( + url=f"{v2_base_url}/metadata/tables", + json={"connectionInfo": connection_info}, + ) + assert response.status_code == 200 + + result = next(filter(lambda x: x["name"] == "workspace.wren.t1", response.json())) + assert result["name"] == "workspace.wren.t1" + assert result["primaryKey"] is not None + assert result["description"] == "This is a table comment" + assert result["properties"] == { + "catalog": 
"workspace", + "schema": "wren", + "table": "t1", + "path": None, + } + assert len(result["columns"]) == 2 + assert result["columns"][0] == { + "name": "id", + "nestedColumns": None, + "type": "INTEGER", + "notNull": True, + "description": "This is a primary key", + "properties": None, + } + + +async def test_metadata_list_constraints(client, connection_info): + response = await client.post( + url=f"{v2_base_url}/metadata/constraints", + json={"connectionInfo": connection_info}, + ) + + result = next( + filter(lambda x: x["constraintName"] == "t2_t1_id_t1_id", response.json()) + ) + assert result["constraintName"] == "t2_t1_id_t1_id" + assert result["constraintType"] == "FOREIGN KEY" + assert result["constraintTable"] == "workspace.wren.t2" + assert result["constraintColumn"] == "t1_id" + assert result["constraintedTable"] == "workspace.wren.t1" + assert result["constraintedColumn"] == "id" + + assert response.status_code == 200 + + +async def test_metadata_db_version(client, connection_info): + response = await client.post( + url=f"{v2_base_url}/metadata/version", + json={"connectionInfo": connection_info}, + ) + assert response.status_code == 200 + assert response.text is not None diff --git a/ibis-server/tests/routers/v3/connector/databricks/test_query.py b/ibis-server/tests/routers/v3/connector/databricks/test_query.py new file mode 100644 index 000000000..b3f3dd3c1 --- /dev/null +++ b/ibis-server/tests/routers/v3/connector/databricks/test_query.py @@ -0,0 +1,206 @@ +import base64 + +import orjson +import pytest + +from tests.routers.v3.connector.databricks.conftest import base_url + +manifest = { + "catalog": "wren", + "schema": "public", + "models": [ + { + "name": "orders", + "tableReference": { + "catalog": "samples", + "schema": "tpch", + "table": "orders", + }, + "columns": [ + {"name": "orderkey", "expression": "o_orderkey", "type": "integer"}, + {"name": "custkey", "expression": "o_custkey", "type": "integer"}, + { + "name": "orderstatus", + 
"expression": "o_orderstatus", + "type": "varchar", + }, + {"name": "totalprice", "expression": "o_totalprice", "type": "float"}, + {"name": "orderdate", "expression": "o_orderdate", "type": "date"}, + { + "name": "order_cust_key", + "expression": "concat(o_orderkey, '_', o_custkey)", + "type": "varchar", + }, + { + "name": "timestamp", + "expression": "cast('2024-01-01T23:59:59' as timestamp)", + "type": "timestamp", + }, + { + "name": "timestamptz", + "expression": "cast('2024-01-01T23:59:59' as timestamp with time zone)", + "type": "timestamp", + }, + { + "name": "test_null_time", + "expression": "cast(NULL as timestamp)", + "type": "timestamp", + }, + ], + }, + ], +} + + +@pytest.fixture(scope="module") +def manifest_str(): + return base64.b64encode(orjson.dumps(manifest)).decode("utf-8") + + +async def test_query(client, manifest_str, connection_info): + response = await client.post( + url=f"{base_url}/query", + json={ + "connectionInfo": connection_info, + "manifestStr": manifest_str, + "sql": "SELECT * FROM wren.public.orders ORDER BY orderkey LIMIT 1", + }, + ) + assert response.status_code == 200 + result = response.json() + assert len(result["columns"]) == len(manifest["models"][0]["columns"]) + assert len(result["data"]) == 1 + + assert result["data"][0] == [ + 1, + 184501, + "O", + "203010.51", + "1996-01-02", + "1_184501", + "2024-01-01 23:59:59.000000 +00:00", + "2024-01-01 23:59:59.000000 +00:00", + None, + ] + assert result["dtypes"] == { + "orderkey": "int64", + "custkey": "int64", + "orderstatus": "string", + "totalprice": "decimal128(38, 9)", + "orderdate": "date32[day]", + "order_cust_key": "string", + "timestamp": "timestamp[us, tz=UTC]", + "timestamptz": "timestamp[us, tz=UTC]", + "test_null_time": "timestamp[us, tz=UTC]", + } + + +async def test_query_with_limit(client, manifest_str, connection_info): + response = await client.post( + url=f"{base_url}/query", + params={"limit": 1}, + json={ + "connectionInfo": connection_info, + 
"manifestStr": manifest_str, + "sql": "SELECT * FROM wren.public.orders", + }, + ) + assert response.status_code == 200 + result = response.json() + assert len(result["data"]) == 1 + + response = await client.post( + url=f"{base_url}/query", + params={"limit": 1}, + json={ + "connectionInfo": connection_info, + "manifestStr": manifest_str, + "sql": "SELECT * FROM wren.public.orders LIMIT 10", + }, + ) + assert response.status_code == 200 + result = response.json() + assert len(result["data"]) == 1 + + +async def test_query_with_invalid_manifest_str(client, connection_info): + response = await client.post( + url=f"{base_url}/query", + json={ + "connectionInfo": connection_info, + "manifestStr": "xxx", + "sql": "SELECT * FROM wren.public.orders LIMIT 1", + }, + ) + assert response.status_code == 422 + + +async def test_query_without_manifest(client, connection_info): + response = await client.post( + url=f"{base_url}/query", + json={ + "connectionInfo": connection_info, + "sql": "SELECT * FROM wren.public.orders LIMIT 1", + }, + ) + assert response.status_code == 422 + result = response.json() + assert result["detail"][0]["type"] == "missing" + assert result["detail"][0]["loc"] == ["body", "manifestStr"] + assert result["detail"][0]["msg"] == "Field required" + + +async def test_query_without_sql(client, manifest_str, connection_info): + response = await client.post( + url=f"{base_url}/query", + json={"connectionInfo": connection_info, "manifestStr": manifest_str}, + ) + assert response.status_code == 422 + result = response.json() + assert result["detail"][0]["type"] == "missing" + assert result["detail"][0]["loc"] == ["body", "sql"] + assert result["detail"][0]["msg"] == "Field required" + + +async def test_query_without_connection_info(client, manifest_str): + response = await client.post( + url=f"{base_url}/query", + json={ + "manifestStr": manifest_str, + "sql": "SELECT * FROM wren.public.orders LIMIT 1", + }, + ) + assert response.status_code == 422 + result = 
response.json() + assert result["detail"][0]["type"] == "missing" + assert result["detail"][0]["loc"] == ["body", "connectionInfo"] + assert result["detail"][0]["msg"] == "Field required" + + +async def test_query_with_dry_run(client, manifest_str, connection_info): + response = await client.post( + url=f"{base_url}/query", + params={"dryRun": True}, + json={ + "connectionInfo": connection_info, + "manifestStr": manifest_str, + "sql": "SELECT * FROM wren.public.orders LIMIT 1", + }, + ) + assert response.status_code == 204 + + +async def test_query_with_dry_run_and_invalid_sql( + client, manifest_str, connection_info +): + response = await client.post( + url=f"{base_url}/query", + params={"dryRun": True}, + json={ + "connectionInfo": connection_info, + "manifestStr": manifest_str, + "sql": "SELECT * FROM X", + }, + ) + assert response.status_code == 422 + assert response.text is not None diff --git a/ibis-server/tools/update_databricks_functions.py b/ibis-server/tools/update_databricks_functions.py new file mode 100644 index 000000000..51dc8686b --- /dev/null +++ b/ibis-server/tools/update_databricks_functions.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +""" +Update Databricks function descriptions in CSV by querying DESCRIBE FUNCTION EXTENDED. 
+ +- Input CSV format: function_type,name,return_type,param_names,param_types,description +- Connection via Databricks SQL Warehouse using environment variables: + DATABRICKS_SERVER_HOSTNAME, DATABRICKS_HTTP_PATH, DATABRICKS_ACCESS_TOKEN (or DATABRICKS_TOKEN) + +Example: + export DATABRICKS_SERVER_HOSTNAME="adb-xxxxxxxx.xx.azuredatabricks.net" + export DATABRICKS_HTTP_PATH="/sql/1.0/warehouses/xxxxxxxxxxxxxx" + export DATABRICKS_ACCESS_TOKEN="dapixxx" # or use Databricks personal access token + poetry run python ibis-server/tools/update_databricks_functions.py \ + --csv ibis-server/resources/function_list/databricks.csv --limit 50 + +Notes: +- System functions generally work with DESCRIBE FUNCTION EXTENDED +- Some names may require backticks; script retries with backticks if needed +- If description cannot be parsed, leaves original unchanged unless --overwrite-empty +- Description extraction prefers Usage section; falls back to Function section if needed +""" +from __future__ import annotations + +import argparse +import csv +import os +import sys +import time +from typing import Iterable, List, Optional, Tuple, Any +from dotenv import load_dotenv + +load_dotenv(override=True) + +try: + # databricks-sql-connector + from databricks import sql as dbsql +except Exception as e: # pragma: no cover - import error surfaced in runtime + dbsql = None # type: ignore + + + + +def connect() -> Any: + if dbsql is None: + raise RuntimeError( + "databricks-sql-connector is not installed. Install it or ensure the ibis databricks extra is present." 
+ ) + host = os.environ.get("DATABRICKS_SERVER_HOSTNAME") + http_path = os.environ.get("DATABRICKS_HTTP_PATH") + token = os.environ.get("DATABRICKS_ACCESS_TOKEN") or os.environ.get( + "DATABRICKS_TOKEN" + ) + if not (host and http_path and token): + missing = [ + name + for name, val in [ + ("DATABRICKS_SERVER_HOSTNAME", host), + ("DATABRICKS_HTTP_PATH", http_path), + ("DATABRICKS_ACCESS_TOKEN", token), + ] + if not val + ] + raise RuntimeError( + f"Missing required environment variables: {', '.join(missing)}" + ) + return dbsql.connect(server_hostname=host, http_path=http_path, access_token=token) + + +def describe_function(cur: Any, func_name: str) -> Optional[List[str]]: + """Return DESCRIBE FUNCTION EXTENDED lines (single text column) or None on failure.""" + queries = [ + f"DESCRIBE FUNCTION EXTENDED {func_name}", + f"DESCRIBE FUNCTION EXTENDED `{func_name}`", + ] + for q in queries: + try: + cur.execute(q) + rows = cur.fetchall() + if not rows: + continue + # databricks usually returns one text column; be resilient + lines: List[str] = [] + for row in rows: + if isinstance(row, (tuple, list)) and len(row) >= 1: + lines.append(str(row[0])) + else: + lines.append(str(row)) + return lines + except Exception: + # try next form + continue + return None + + +# Known section labels (lowercase, without the trailing colon) +KNOWN_SECTION_NAMES = ( + "function", + "class", + "usage", + "extended usage", + "arguments", + "examples", + "created by", + "since", +) + + +def _normalize_lines(block_lines: List[str]) -> List[str]: + """Split any multi-line string entries into per-line strings.""" + out: List[str] = [] + for raw in block_lines: + # Ensure str and split on newline to handle multi-line cells + s = str(raw).replace("\r\n", "\n").replace("\r", "\n") + out.extend(s.split("\n")) + return out + + +def _collect_after_header(lines: List[str], header: str) -> List[str]: + """Collect content lines after a section header (case-insensitive) until next section or blank 
divider. + + - Header matching is based on the label before the first colon, e.g. "Usage" or "Function" (colon optional in input). + - Inline content on the same line after the colon is captured as the first element. + - Collection stops when another known section label is encountered or a blank line is hit (after capturing at least one line). + """ + out: List[str] = [] + started = False + header_key = header.rstrip(":").strip().lower() + for raw in lines: + line = raw.rstrip() + striped = line.strip() + + # Helper to detect if a line starts a section header and optionally return its label and remainder + def parse_header(s: str) -> Tuple[Optional[str], Optional[str]]: + if not s: + return None, None + idx = s.find(":") + if idx == -1: + return None, None + label = s[: idx].strip().lower() + remainder = s[idx + 1 :].strip() + return label, remainder + + if not started: + lbl, remainder = parse_header(striped) + if lbl == header_key: + started = True + if remainder: + out.append(remainder) + continue + else: + # If we reach another header, stop + lbl, _ = parse_header(striped) + if lbl and lbl in KNOWN_SECTION_NAMES: + break + + # Skip obvious separators + if not striped: + if out: + break + else: + continue + + out.append(striped) + return out + + +def parse_description(lines: List[str]) -> Optional[str]: + """Extract only the Usage section (single-line summary). + + Preference: + - Usage: <...> -> return only the Usage content, ignoring Extended Usage/Examples/Since + If Usage spans multiple lines immediately after the header, capture until blank or next header. + - If there is no Usage:, fall back to content after Function:. 
+ """ + if not lines: + return None + # Normalize into per-line strings so headers can be detected + norm = _normalize_lines(lines) + # Collect only the Usage section (inline part plus immediate continuation lines) + usage_lines = _collect_after_header(norm, "Usage") + if usage_lines: + # Flatten and normalize whitespace into a single line + text = " ".join(usage_lines) + text = " ".join(text.split()) + return text + # Fallback: try take a succinct line after Function: + fn = _collect_after_header(norm, "Function") + if fn: + text = " ".join(fn) + text = " ".join(text.split()) + return text + return None + + +def read_csv(path: str) -> Tuple[List[str], List[List[str]]]: + with open(path, newline="", encoding="utf-8") as f: + reader = csv.reader(f) + rows = list(reader) + if not rows: + raise ValueError("CSV is empty") + header = rows[0] + body = rows[1:] + return header, body + + +def write_csv(path: str, header: List[str], rows: List[List[str]]) -> None: + with open(path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(header) + writer.writerows(rows) + + +DEFAULT_PLACEHOLDERS = { + "Scalar function", + "Aggregate function", + "Windowing helper", +} + +def _normalize_token(s: str) -> str: + # Lowercase, collapse spaces, strip quotes/backticks and trailing periods + t = (s or "").strip().strip("`\"' ") + t = " ".join(t.split()) + return t.lower().rstrip(".") + + +def is_placeholder_desc(desc: str, func_name: str) -> bool: + """Return True if description looks like a placeholder. 
+ + Placeholder conditions: + - empty string + - known generic placeholders (Scalar function, Aggregate function, Windowing helper) + - description equals the function name (case-insensitive), optionally with empty parentheses (e.g., name or name()) + """ + if not desc: + return True + d = desc.strip() + if d in DEFAULT_PLACEHOLDERS: + return True + dn = _normalize_token(d) + fn = _normalize_token(func_name) + if dn == fn or dn == f"{fn}()": + return True + return False + + +def placeholder_reason(desc: str, func_name: str) -> str: + if not desc: + return "empty description" + d = desc.strip() + if d in DEFAULT_PLACEHOLDERS: + return f"generic placeholder: {d}" + dn = _normalize_token(d) + fn = _normalize_token(func_name) + if dn == fn: + return "equals function name" + if dn == f"{fn}()": + return "equals function name with empty parentheses" + return "non-placeholder" + + +def main() -> int: + ap = argparse.ArgumentParser( + description="Update Databricks function descriptions in CSV" + ) + ap.add_argument( + "--csv", + default="resources/function_list/databricks.csv", + help="Path to databricks.csv", + ) + ap.add_argument( + "--limit", + type=int, + default=None, + help="Max number of functions to update (for dry-runs)", + ) + ap.add_argument( + "--overwrite", + action="store_true", + help="Overwrite existing non-empty descriptions", + ) + ap.add_argument( + "--overwrite-empty", + action="store_true", + help="Overwrite only if description is empty or placeholder", + ) + ap.add_argument( + "--sleep", + type=float, + default=0.0, + help="Sleep seconds between DESCRIBE calls (rate limiting)", + ) + ap.add_argument( + "--verbose", + action="store_true", + help="Print debug information for each function", + ) + args = ap.parse_args() + + csv_path = args.csv + if args.verbose: + print(f"Using CSV: {csv_path}") + header, rows = read_csv(csv_path) + + # Locate column indices + try: + name_idx = header.index("name") + desc_idx = header.index("description") + except 
ValueError as e: + raise SystemExit(f"CSV missing required columns: {e}") + + to_process = rows + if args.limit is not None: + to_process = rows[: args.limit] + + updated = 0 + skipped = 0 + + with connect() as conn: + with conn.cursor() as cur: + for i, row in enumerate(to_process): + func = row[name_idx].strip() + current_desc = row[desc_idx].strip() if len(row) > desc_idx else "" + + should_update = False + if args.overwrite: + should_update = True + elif args.overwrite_empty: + should_update = is_placeholder_desc(current_desc, func) + else: + # default behavior: fill only placeholders/empty + should_update = is_placeholder_desc(current_desc, func) + + if not should_update: + if args.verbose: + print( + f"Skip {func}: existing description kept (use --overwrite or --overwrite-empty). " + f"Current desc=\"{current_desc}\"; reason={placeholder_reason(current_desc, func)}" + ) + skipped += 1 + continue + + lines = describe_function(cur, func) + if args.verbose: + print(f"DESCRIBE {func}: {lines}") + if not lines: + if args.verbose: + print(f"Skip {func}: DESCRIBE returned no rows or failed in both forms") + skipped += 1 + continue + desc = parse_description(lines) + if desc: + row[desc_idx] = desc + if args.verbose: + print(f"Updated {func}: {desc}") + updated += 1 + else: + if args.verbose: + print(f"Skip {func}: failed to parse description from DESCRIBE output") + skipped += 1 + + if args.sleep: + time.sleep(args.sleep) + + # Merge back updated rows (if limit was set) + if args.limit is not None: + rows[: args.limit] = to_process + else: + rows = to_process + + write_csv(csv_path, header, rows) + + print( + f"Done. Updated {updated} descriptions, skipped {skipped}. 
CSV saved to {csv_path}", + file=sys.stderr, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/wren-core-base/manifest-macro/src/lib.rs b/wren-core-base/manifest-macro/src/lib.rs index cf0700e5c..5de9f0176 100644 --- a/wren-core-base/manifest-macro/src/lib.rs +++ b/wren-core-base/manifest-macro/src/lib.rs @@ -108,6 +108,8 @@ pub fn data_source(python_binding: proc_macro::TokenStream) -> proc_macro::Token Athena, #[serde(alias = "redshift")] Redshift, + #[serde(alias = "databricks")] + Databricks, } }; proc_macro::TokenStream::from(expanded) diff --git a/wren-core-base/src/mdl/manifest.rs b/wren-core-base/src/mdl/manifest.rs index 9e377f218..36f22a6af 100644 --- a/wren-core-base/src/mdl/manifest.rs +++ b/wren-core-base/src/mdl/manifest.rs @@ -119,6 +119,7 @@ impl Display for DataSource { DataSource::Oracle => write!(f, "ORACLE"), DataSource::Athena => write!(f, "ATHENA"), DataSource::Redshift => write!(f, "REDSHIFT"), + DataSource::Databricks => write!(f, "DATABRICKS"), } } } diff --git a/wren-core/core/src/logical_plan/utils.rs b/wren-core/core/src/logical_plan/utils.rs index 9a9b1403d..a33152485 100644 --- a/wren-core/core/src/logical_plan/utils.rs +++ b/wren-core/core/src/logical_plan/utils.rs @@ -182,6 +182,7 @@ pub fn map_data_type(data_type: &str) -> Result { "uhugeint" => DataType::UInt64, // we don't have a UHUINT type, so we map it to UInt64 "bit" => DataType::Boolean, // we don't have a BIT type, so we map it to Boolean "timestamp_ns" => DataType::Timestamp(TimeUnit::Nanosecond, None), + "any" => DataType::Utf8, // we don't have an ANY type, so we map it to Utf8 _ => { debug!("try parse by arrow {lower_data_type}"); // the from_str is case sensitive, so we need to use the original string