diff --git a/.github/ISSUE_TEMPLATE/airflow_providers_bug_report.yml b/.github/ISSUE_TEMPLATE/airflow_providers_bug_report.yml index 70f1a27e7b108..67be91e8cb8b2 100644 --- a/.github/ISSUE_TEMPLATE/airflow_providers_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/airflow_providers_bug_report.yml @@ -43,6 +43,7 @@ body: - celery - cloudant - cncf-kubernetes + - core-sql - databricks - datadog - dbt-cloud diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index a906c0eec2269..4ec19f86ac4c0 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -647,7 +647,7 @@ This is the full list of those extras: airbyte, alibaba, all, all_dbs, amazon, apache.atlas, apache.beam, apache.cassandra, apache.drill, apache.druid, apache.hdfs, apache.hive, apache.kylin, apache.livy, apache.pig, apache.pinot, apache.spark, apache.sqoop, apache.webhdfs, arangodb, asana, async, atlas, aws, azure, cassandra, -celery, cgroups, cloudant, cncf.kubernetes, crypto, dask, databricks, datadog, dbt.cloud, +celery, cgroups, cloudant, cncf.kubernetes, core.sql, crypto, dask, databricks, datadog, dbt.cloud, deprecated_api, devel, devel_all, devel_ci, devel_hadoop, dingding, discord, doc, docker, druid, elasticsearch, exasol, facebook, ftp, gcp, gcp_api, github, github_enterprise, google, google_auth, grpc, hashicorp, hdfs, hive, http, imap, influxdb, jdbc, jenkins, jira, kerberos, kubernetes, ldap, diff --git a/INSTALL b/INSTALL index ba2f5500475ee..9694ba1f5bdba 100644 --- a/INSTALL +++ b/INSTALL @@ -97,7 +97,7 @@ The list of available extras: airbyte, alibaba, all, all_dbs, amazon, apache.atlas, apache.beam, apache.cassandra, apache.drill, apache.druid, apache.hdfs, apache.hive, apache.kylin, apache.livy, apache.pig, apache.pinot, apache.spark, apache.sqoop, apache.webhdfs, arangodb, asana, async, atlas, aws, azure, cassandra, -celery, cgroups, cloudant, cncf.kubernetes, crypto, dask, databricks, datadog, dbt.cloud, +celery, cgroups, cloudant, cncf.kubernetes, core.sql, crypto, dask, databricks, datadog, dbt.cloud, deprecated_api, devel, devel_all, devel_ci, devel_hadoop, dingding, discord, doc, docker, druid, elasticsearch, exasol, facebook, ftp, gcp, gcp_api, github, github_enterprise, google, google_auth, grpc, hashicorp, hdfs, hive, http, imap, influxdb, jdbc, jenkins, jira, kerberos, kubernetes, ldap, diff --git a/airflow/providers/core/__init__.py b/airflow/providers/core/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/airflow/providers/core/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/airflow/providers/core/sql/CHANGELOG.rst b/airflow/providers/core/sql/CHANGELOG.rst new file mode 100644 index 0000000000000..a5b4657ed3277 --- /dev/null +++ b/airflow/providers/core/sql/CHANGELOG.rst @@ -0,0 +1,26 @@ + .. 
Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +Changelog +--------- + +1.0.0 +..... + +Initial version of the provider. +Adds SQLColumnCheckOperator and SQLTableCheckOperator. diff --git a/airflow/providers/core/sql/__init__.py b/airflow/providers/core/sql/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/airflow/providers/core/sql/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/airflow/providers/core/sql/example_dags/__init__.py b/airflow/providers/core/sql/example_dags/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/airflow/providers/core/sql/example_dags/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/airflow/providers/core/sql/example_dags/example_sql_column_table_check.py b/airflow/providers/core/sql/example_dags/example_sql_column_table_check.py new file mode 100644 index 0000000000000..e83f0217655a3 --- /dev/null +++ b/airflow/providers/core/sql/example_dags/example_sql_column_table_check.py @@ -0,0 +1,77 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from datetime import datetime
+
+from airflow import DAG
+from airflow.providers.core.sql.operators.sql import SQLColumnCheckOperator, SQLTableCheckOperator
+
+AIRFLOW_DB_METADATA_TABLE = "ab_role"
+# Connection details for the Airflow metadata database; the operators only consume
+# conn_id, which reaches them through default_args below.
+connection_args = {
+    "conn_id": "airflow_db",
+    "conn_type": "Postgres",
+    "host": "postgres",
+    "schema": "postgres",
+    "login": "postgres",
+    "password": "postgres",
+    "port": 5432,
+}
+
+with DAG(
+    "example_sql_column_table_check",
+    description="Example DAG for SQLColumnCheckOperator and SQLTableCheckOperator.",
+    default_args=connection_args,
+    start_date=datetime(2021, 1, 1),
+    schedule_interval=None,
+    catchup=False,
+) as dag:
+    """
+    ### Example SQL Column and Table Check DAG
+
+    Runs the SQLColumnCheckOperator and SQLTableCheckOperator against the Airflow metadata DB.
+    """
+
+    # [START howto_operator_sql_column_check]
+    column_check = SQLColumnCheckOperator(
+        task_id="column_check",
+        table=AIRFLOW_DB_METADATA_TABLE,
+        column_mapping={
+            "id": {
+                "null_check": {
+                    "equal_to": 0,
+                    "tolerance": 0,
+                },
+                "distinct_check": {
+                    "equal_to": 1,
+                },
+            }
+        },
+    )
+    # [END howto_operator_sql_column_check]
+
+    # [START howto_operator_sql_table_check]
+    row_count_check = SQLTableCheckOperator(
+        task_id="row_count_check",
+        table=AIRFLOW_DB_METADATA_TABLE,
+        checks={
+            "row_count_check": {
+                "check_statement": "COUNT(*) = 1",
+            }
+        },
+    )
+    # [END howto_operator_sql_table_check]
+
+    column_check >> row_count_check
diff --git a/airflow/providers/core/sql/operators/__init__.py b/airflow/providers/core/sql/operators/__init__.py
new file mode 100644
index 0000000000000..13a83393a9124
--- /dev/null
+++ b/airflow/providers/core/sql/operators/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/airflow/providers/core/sql/operators/sql.py b/airflow/providers/core/sql/operators/sql.py
new file mode 100644
index 0000000000000..1d9945f403ec3
--- /dev/null
+++ b/airflow/providers/core/sql/operators/sql.py
@@ -0,0 +1,315 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import Any, Dict, Optional
+
+from airflow.exceptions import AirflowException
+from airflow.operators.sql import BaseSQLOperator
+
+
+def parse_boolean(val: str) -> bool:
+    """Try to parse a string into a boolean.
+
+    Raises ValueError if the input is not a valid true- or false-like string value.
+    """
+    val = val.lower()
+    if val in ('y', 'yes', 't', 'true', 'on', '1'):
+        return True
+    if val in ('n', 'no', 'f', 'false', 'off', '0'):
+        return False
+    raise ValueError(f"{val!r} is not a boolean-like string value")
+
+
+def _get_failed_tests(checks):
+    return [
+        f"\tCheck: {check},\n\tCheck Values: {check_values}\n"
+        for check, check_values in checks.items()
+        if not check_values["success"]
+    ]
+
+
+class SQLColumnCheckOperator(BaseSQLOperator):
+    """
+    Performs one or more of the templated checks in the column_checks dictionary.
+    Checks are performed on a per-column basis specified by the column_mapping.
+    Each check can take one or more of the following options:
+    - equal_to: an exact value the result should equal; cannot be used with other comparison options
+    - greater_than: value the result should be strictly greater than
+    - less_than: value the result should be strictly less than
+    - geq_to: value the result should be greater than or equal to
+    - leq_to: value the result should be less than or equal to
+    - tolerance: the percentage that the result may be off from the expected value
+
+    :param table: the table to run checks on
+    :param column_mapping: the dictionary of columns and their associated checks, e.g.
+
+    .. code-block:: python
+
+        {
+            "col_name": {
+                "null_check": {
+                    "equal_to": 0,
+                },
+                "min": {
+                    "greater_than": 5,
+                    "leq_to": 10,
+                    "tolerance": 0.2,
+                },
+                "max": {"less_than": 1000, "geq_to": 10, "tolerance": 0.01},
+            }
+        }
+
+    :param conn_id: the connection ID used to connect to the database
+    :param database: name of the database, which overrides the one defined in the connection
+
+    .. 
seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:SQLColumnCheckOperator` + """ + + column_checks = { + "null_check": "SUM(CASE WHEN column IS NULL THEN 1 ELSE 0 END) AS column_null_check", + "distinct_check": "COUNT(DISTINCT(column)) AS column_distinct_check", + "unique_check": "COUNT(column) - COUNT(DISTINCT(column)) AS column_unique_check", + "min": "MIN(column) AS column_min", + "max": "MAX(column) AS column_max", + } + + def __init__( + self, + *, + table: str, + column_mapping: Dict[str, Dict[str, Any]], + conn_id: Optional[str] = None, + database: Optional[str] = None, + **kwargs, + ): + super().__init__(conn_id=conn_id, database=database, **kwargs) + for checks in column_mapping.values(): + for check, check_values in checks.items(): + self._column_mapping_validation(check, check_values) + + self.table = table + self.column_mapping = column_mapping + # OpenLineage needs a valid SQL query with the input/output table(s) to parse + self.sql = f"SELECT * FROM {self.table};" + + def execute(self, context=None): + hook = self.get_db_hook() + failed_tests = [] + for column in self.column_mapping: + checks = [*self.column_mapping[column]] + checks_sql = ",".join([self.column_checks[check].replace("column", column) for check in checks]) + + self.sql = f"SELECT {checks_sql} FROM {self.table};" + records = hook.get_first(self.sql) + + if not records: + raise AirflowException(f"The following query returned zero rows: {self.sql}") + + self.log.info(f"Record: {records}") + + for idx, result in enumerate(records): + tolerance = self.column_mapping[column][checks[idx]].get("tolerance") + + self.column_mapping[column][checks[idx]]["result"] = result + self.column_mapping[column][checks[idx]]["success"] = self._get_match( + self.column_mapping[column][checks[idx]], result, tolerance + ) + + failed_tests.extend(_get_failed_tests(self.column_mapping[column])) + if failed_tests: + raise AirflowException( + f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}\n" + "The following tests have failed:" + f"\n{''.join(failed_tests)}" + ) + + self.log.info("All tests have passed") + + def _get_match(self, check_values, record, tolerance=None) -> bool: + match_boolean = True + if "geq_to" in check_values: + if tolerance is not None: + match_boolean = record >= check_values["geq_to"] * (1 - tolerance) + else: + match_boolean = record >= check_values["geq_to"] + elif "greater_than" in check_values: + if tolerance is not None: + match_boolean = record > check_values["greater_than"] * (1 - tolerance) + else: + match_boolean = record > check_values["greater_than"] + if "leq_to" in check_values: + if tolerance is not None: + match_boolean = record <= check_values["leq_to"] * (1 + tolerance) and match_boolean + else: + match_boolean = record <= check_values["leq_to"] and match_boolean + elif "less_than" in check_values: + if tolerance is not None: + match_boolean = record < check_values["less_than"] * (1 + tolerance) and match_boolean + else: + match_boolean = record < check_values["less_than"] and match_boolean + if "equal_to" in check_values: + if tolerance is not None: + match_boolean = ( + check_values["equal_to"] * (1 - tolerance) + <= record + <= check_values["equal_to"] * (1 + tolerance) + ) and match_boolean + else: + match_boolean = record == check_values["equal_to"] and match_boolean + return match_boolean + + def _column_mapping_validation(self, check, check_values): + if check not in self.column_checks: + raise AirflowException(f"Invalid 
column check: {check}.") + if ( + "greater_than" not in check_values + and "geq_to" not in check_values + and "less_than" not in check_values + and "leq_to" not in check_values + and "equal_to" not in check_values + ): + raise ValueError( + "Please provide one or more of: less_than, leq_to, " + "greater_than, geq_to, or equal_to in the check's dict." + ) + + if "greater_than" in check_values and "less_than" in check_values: + if check_values["greater_than"] >= check_values["less_than"]: + raise ValueError( + "greater_than should be strictly less than " + "less_than. Use geq_to or leq_to for " + "overlapping equality." + ) + + if "greater_than" in check_values and "leq_to" in check_values: + if check_values["greater_than"] >= check_values["leq_to"]: + raise ValueError( + "greater_than must be strictly less than leq_to. " + "Use geq_to with leq_to for overlapping equality." + ) + + if "geq_to" in check_values and "less_than" in check_values: + if check_values["geq_to"] >= check_values["less_than"]: + raise ValueError( + "geq_to should be strictly less than less_than. " + "Use leq_to with geq_to for overlapping equality." + ) + + if "geq_to" in check_values and "leq_to" in check_values: + if check_values["geq_to"] > check_values["leq_to"]: + raise ValueError("geq_to should be less than or equal to leq_to.") + + if "greater_than" in check_values and "geq_to" in check_values: + raise ValueError("Only supply one of greater_than or geq_to.") + + if "less_than" in check_values and "leq_to" in check_values: + raise ValueError("Only supply one of less_than or leq_to.") + + if ( + "greater_than" in check_values + or "geq_to" in check_values + or "less_than" in check_values + or "leq_to" in check_values + ) and "equal_to" in check_values: + raise ValueError( + "equal_to cannot be passed with a greater or less than " + "function. To specify 'greater than or equal to' or " + "'less than or equal to', use geq_to or leq_to." + ) + + +class SQLTableCheckOperator(BaseSQLOperator): + """ + Performs one or more of the checks provided in the checks dictionary. + Checks should be written to return a boolean result. + + :param table: the table to run checks on + :param checks: the dictionary of checks, e.g.: + + .. code-block:: python + + { + "row_count_check": {"check_statement": "COUNT(*) = 1000"}, + "column_sum_check": {"check_statement": "col_a + col_b < col_c"}, + } + + :param conn_id: the connection ID used to connect to the database + :param database: name of database which overwrite the defined one in connection + + .. 
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:SQLTableCheckOperator`
+    """
+
+    sql_check_template = "CASE WHEN check_statement THEN 1 ELSE 0 END AS check_name"
+    sql_min_template = "MIN(check_name)"
+
+    def __init__(
+        self,
+        *,
+        table: str,
+        checks: Dict[str, Dict[str, Any]],
+        conn_id: Optional[str] = None,
+        database: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(conn_id=conn_id, database=database, **kwargs)
+
+        self.table = table
+        self.checks = checks
+        # OpenLineage needs a valid SQL query with the input/output table(s) to parse
+        self.sql = f"SELECT * FROM {self.table};"
+
+    def execute(self, context=None):
+        hook = self.get_db_hook()
+
+        check_names = [*self.checks]
+        check_mins_sql = ",".join(
+            self.sql_min_template.replace("check_name", check_name) for check_name in check_names
+        )
+        checks_sql = ",".join(
+            [
+                self.sql_check_template.replace("check_statement", value["check_statement"]).replace(
+                    "check_name", check_name
+                )
+                for check_name, value in self.checks.items()
+            ]
+        )
+
+        # The derived table needs an alias; some databases (e.g. PostgreSQL, MySQL)
+        # reject subqueries in FROM without one.
+        self.sql = f"SELECT {check_mins_sql} FROM (SELECT {checks_sql} FROM {self.table}) AS check_table;"
+        records = hook.get_first(self.sql)
+
+        if not records:
+            raise AirflowException(f"The following query returned zero rows: {self.sql}")
+
+        self.log.info(f"Record: {records}")
+
+        # Each MIN(...) column in the returned row corresponds positionally to a check name.
+        for check, result in zip(check_names, records):
+            self.checks[check]["success"] = parse_boolean(str(result))
+
+        failed_tests = _get_failed_tests(self.checks)
+        if failed_tests:
+            raise AirflowException(
+                f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}\n"
+                "The following tests have failed:"
+                f"\n{''.join(failed_tests)}"
+            )
+
+        self.log.info("All tests have passed")
diff --git a/airflow/providers/core/sql/provider.yaml b/airflow/providers/core/sql/provider.yaml
new file mode 100644
index 0000000000000..64f6d9746ce5a
--- /dev/null
+++ b/airflow/providers/core/sql/provider.yaml
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+---
+package-name: apache-airflow-providers-core-sql
+name: Core SQL
+description: |
+    `Core SQL Provider `__
+
+versions:
+  - 1.0.0
+
+additional-dependencies:
+  - apache-airflow>=2.2.0
+
+integrations:
+  - integration-name: Core SQL
+    external-doc-url: https://en.wikipedia.org/wiki/SQL
+    how-to-guide:
+      - /docs/apache-airflow-providers-core-sql/operators.rst
+    logo: /integration-logos/core/sql/sql.png
+    tags: [software]
+
+operators:
+  - integration-name: Core SQL
+    python-modules:
+      - airflow.providers.core.sql.operators.sql
diff --git a/docs/apache-airflow-providers-core-sql/commits.rst b/docs/apache-airflow-providers-core-sql/commits.rst
new file mode 100644
index 0000000000000..8292a11950bf7
--- /dev/null
+++ b/docs/apache-airflow-providers-core-sql/commits.rst
@@ -0,0 +1,25 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements. See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership. The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License. You may obtain a copy of the License at
+
+ .. http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Package apache-airflow-providers-core-sql
+------------------------------------------
+
+`Core SQL Provider `__
+
+
+This is a detailed commit list of changes for the ``core.sql`` provider package.
+For the high-level changelog, see :doc:`package information including changelog `.
diff --git a/docs/apache-airflow-providers-core-sql/connections.rst b/docs/apache-airflow-providers-core-sql/connections.rst
new file mode 100644
index 0000000000000..ad06ed4b8ee98
--- /dev/null
+++ b/docs/apache-airflow-providers-core-sql/connections.rst
@@ -0,0 +1,32 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements. See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership. The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License. You may obtain a copy of the License at
+
+ .. http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. _howto/connection:sql:
+
+Connecting to a SQL DB
+=======================
+
+The SQL Provider package operators allow access to various SQL-like databases. For
+databases that can be reached with a ``DbApiHook`` directly, passing the
+connection ID to these operators is sufficient. For other connections with more
+complicated hooks, the additional parameters can be passed as keyword arguments to the
+operators.
+
+Default Connection ID
+~~~~~~~~~~~~~~~~~~~~~
+
+SQL Operators under this provider do not default to any connection ID, so ``conn_id``
+must always be supplied explicitly, as in the sketch below.
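+
+For example, a minimal sketch (the ``postgres_default`` connection ID and the ``orders``
+table below are illustrative placeholders, not defaults shipped with the provider):
+
+.. code-block:: python
+
+    from airflow.providers.core.sql.operators.sql import SQLColumnCheckOperator
+
+    id_checks = SQLColumnCheckOperator(
+        task_id="id_checks",
+        conn_id="postgres_default",  # placeholder connection ID; always set explicitly
+        table="orders",  # placeholder table name
+        column_mapping={"id": {"null_check": {"equal_to": 0}}},
+    )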
diff --git a/docs/apache-airflow-providers-core-sql/index.rst b/docs/apache-airflow-providers-core-sql/index.rst new file mode 100644 index 0000000000000..abc6186b68557 --- /dev/null +++ b/docs/apache-airflow-providers-core-sql/index.rst @@ -0,0 +1,54 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +``apache-airflow-providers-core-sql`` +============================================ + +Content +------- + +.. toctree:: + :maxdepth: 1 + :caption: Guides + + Connecting to SQL Databases + Operators + +.. toctree:: + :maxdepth: 1 + :caption: References + + Python API <_api/airflow/providers/core/sql/index> + +.. toctree:: + :maxdepth: 1 + :caption: Resources + + Example DAGs + PyPI Repository + Installing from sources + +.. THE REMAINDER OF THE FILE IS AUTOMATICALLY GENERATED. IT WILL BE OVERWRITTEN AT RELEASE TIME! +.. toctree:: + :maxdepth: 1 + :caption: Commits + + Detailed list of commits + + +Package apache-airflow-providers-core-sql +------------------------------------------------------ diff --git a/docs/apache-airflow-providers-core-sql/installing-providers-from-sources.rst b/docs/apache-airflow-providers-core-sql/installing-providers-from-sources.rst new file mode 100644 index 0000000000000..1c90205d15b3a --- /dev/null +++ b/docs/apache-airflow-providers-core-sql/installing-providers-from-sources.rst @@ -0,0 +1,18 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. include:: ../installing-providers-from-sources.rst diff --git a/docs/apache-airflow-providers-core-sql/operators.rst b/docs/apache-airflow-providers-core-sql/operators.rst new file mode 100644 index 0000000000000..c2b04767d4857 --- /dev/null +++ b/docs/apache-airflow-providers-core-sql/operators.rst @@ -0,0 +1,112 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License. You may obtain a copy of the License at
+
+ .. http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+SQL Operators
+===================
+
+These operators perform various queries against a SQL database, including
+column- and table-level data quality checks.
+
+.. _howto/operator:SQLColumnCheckOperator:
+
+Check SQL Table Columns
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the :class:`~airflow.providers.core.sql.operators.sql.SQLColumnCheckOperator` to run data quality
+checks against columns of a given table. In addition to a connection ID and table name, a ``column_mapping``
+describing the checks to run for each column must be supplied. An example column
+mapping is a set of three nested dictionaries and looks like:
+
+.. code-block:: python
+
+    column_mapping = {
+        "col_name": {
+            "null_check": {
+                "equal_to": 0,
+            },
+            "min": {
+                "greater_than": 5,
+                "leq_to": 10,
+                "tolerance": 0.2,
+            },
+            "max": {"less_than": 1000, "geq_to": 10, "tolerance": 0.01},
+        }
+    }
+
+Here, ``col_name`` is the name of the column to run checks on, and each entry in its dictionary is a check.
+The valid checks are:
+
+- null_check: checks the number of NULL values in the column
+- distinct_check: checks the COUNT of values in the column that are distinct
+- unique_check: checks the number of distinct values in a column against the number of rows
+- min: checks the minimum value in the column
+- max: checks the maximum value in the column
+
+Each entry in the check's dictionary is either a condition for success of the check or the tolerance. The
+conditions for success are:
+
+- greater_than
+- geq_to
+- less_than
+- leq_to
+- equal_to
+
+When specifying conditions, equal_to is not compatible with other conditions. Both a lower- and an
+upper-bound condition may be specified in the same check. The tolerance is the percentage that the
+result may be out of bounds but still considered successful; for example, ``"equal_to": 100`` with
+``"tolerance": 0.1`` accepts any result between 90 and 110.
+
+The below example demonstrates how to instantiate the SQLColumnCheckOperator task.
+
+.. exampleinclude:: /../../airflow/providers/core/sql/example_dags/example_sql_column_table_check.py
+    :language: python
+    :dedent: 4
+    :start-after: [START howto_operator_sql_column_check]
+    :end-before: [END howto_operator_sql_column_check]
+
+.. _howto/operator:SQLTableCheckOperator:
+
+Check SQL Table Values
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the :class:`~airflow.providers.core.sql.operators.sql.SQLTableCheckOperator` to run data quality
+checks against a given table. In addition to a connection ID and table name, a ``checks`` dictionary
+describing the checks to run against the table must be supplied. An example
+checks argument is a set of two nested dictionaries and looks like:
+
+.. code-block:: python
+
+    checks = {
+        "row_count_check": {
+            "check_statement": "COUNT(*) = 1000",
+        },
+        "column_sum_check": {"check_statement": "col_a + col_b < col_c"},
+    }
+
+The first set of keys are the check names, which are referenced in the templated query the operator
+builds, as shown in the sketch below.
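+
+As a rough sketch of that templated query (using the ``row_count_check`` above against a
+hypothetical table named ``my_table``), the operator would run something like:
+
+.. code-block:: sql
+
+    SELECT MIN(row_count_check) FROM (
+        SELECT CASE WHEN COUNT(*) = 1000 THEN 1 ELSE 0 END AS row_count_check
+        FROM my_table
+    ) AS check_table;
+
+Each check becomes a 1/0 flag in the inner query, and a check succeeds only if the minimum of
+its flag over the table is 1.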
+The dictionary key under each check name must be ``check_statement``, whose value is a SQL
+expression that resolves to a boolean (or to any string or int that the provider's
+``parse_boolean`` helper in ``airflow.providers.core.sql.operators.sql`` treats as boolean-like).
+
+The below example demonstrates how to instantiate the SQLTableCheckOperator task.
+
+.. exampleinclude:: /../../airflow/providers/core/sql/example_dags/example_sql_column_table_check.py
+    :language: python
+    :dedent: 4
+    :start-after: [START howto_operator_sql_table_check]
+    :end-before: [END howto_operator_sql_table_check]
diff --git a/docs/apache-airflow/extra-packages-ref.rst b/docs/apache-airflow/extra-packages-ref.rst
index cefa4a67bfddf..188b9a02f9944 100644
--- a/docs/apache-airflow/extra-packages-ref.rst
+++ b/docs/apache-airflow/extra-packages-ref.rst
@@ -33,7 +33,7 @@ pre-installed when Airflow is installed.
 Core Airflow extras
 -------------------
 
-Those are core airflow extras that extend capabilities of core Airflow. They usually do not install provider
+These are core airflow extras that extend capabilities of core Airflow. They usually do not install provider
 packages (with the exception of ``celery`` and ``cncf.kubernetes`` extras), they just install necessary
 python dependencies for the provided package.
 
@@ -79,7 +79,7 @@ python dependencies for the provided package.
 Providers extras
 ----------------
 
-Those providers extras are simply convenience extras to install provider packages so that you can install the providers with simple command - including
+These providers extras are simply convenience extras to install provider packages so that you can install the providers with a simple command - including
 provider package and necessary dependencies in single command, which allows PIP to resolve any conflicting dependencies. This is extremely useful
 for first time installation where you want to repeatably install version of dependencies which are 'valid' for both airflow and providers installed.
 
@@ -105,7 +105,7 @@ upgrade those providers manually if you want to use latest versions of the provi
 Apache Software extras
 ======================
 
-Those are extras that add dependencies needed for integration with other Apache projects (note that ``apache.atlas`` and
+These are extras that add dependencies needed for integration with other Apache projects (note that ``apache.atlas`` and
 ``apache.webhdfs`` do not have their own providers - they only install additional libraries that can be used in
 custom bash/python providers).
 
@@ -145,7 +145,7 @@ custom bash/python providers).
 External Services extras
 ========================
 
-Those are extras that add dependencies needed for integration with external services - either cloud based or on-premises.
+These are extras that add dependencies needed for integration with external services - either cloud based or on-premises.
 
 +---------------------+-----------------------------------------------------+-----------------------------------------------------+
 | extra               | install command                                     | enables                                             |
 +---------------------+-----------------------------------------------------+-----------------------------------------------------+
@@ -213,7 +213,7 @@ Those are extras that add dependencies needed for integration with external serv
 Locally installed software extras
 =================================
 
-Those are extras that add dependencies needed for integration with other software packages installed usually as part of the deployment of Airflow.
+These are extras that add dependencies needed for integration with other software packages installed usually as part of the deployment of Airflow.
 +---------------------+-----------------------------------------------------+-------------------------------------------+
 | extra               | install command                                     | enables                                   |
 +---------------------+-----------------------------------------------------+-------------------------------------------+
@@ -263,11 +263,13 @@ Those are extras that add dependencies needed for integration with other softwar
 Other extras
 ============
 
-Those are extras that provide support for integration with external systems via some - usually - standard protocols.
+These are extras that provide support for integration with external systems via some - usually - standard protocols.
 
 +---------------------+-----------------------------------------------------+--------------------------------------+--------------+
 | extra               | install command                                     | enables                              | Preinstalled |
 +=====================+=====================================================+======================================+==============+
+| core.sql            | ``pip install 'apache-airflow[core.sql]'``          | Core SQL Operators                   |              |
++---------------------+-----------------------------------------------------+--------------------------------------+--------------+
 | ftp                 | ``pip install 'apache-airflow[ftp]'``               | FTP hooks and operators              | *            |
 +---------------------+-----------------------------------------------------+--------------------------------------+--------------+
 | grpc                | ``pip install 'apache-airflow[grpc]'``              | Grpc hooks and operators             |              |
@@ -294,7 +296,7 @@ Those are extras that provide support for integration with external systems via
 Bundle extras
 -------------
 
-Those are extras that install one ore more extras as a bundle. Note that those extras should only be used for "development" version
+These are extras that install one or more extras as a bundle. Note that these extras should only be used for "development" version
 of Airflow - i.e. when Airflow is installed from sources. Because of the way bundle extras are constructed they might not work when
 airflow is installed from 'PyPI'.
@@ -332,7 +334,7 @@ This is the extra that is needed to generated documentation for Airflow. This is
 Deprecated 1.10 extras
 ----------------------
 
-Those are the extras that have been deprecated in 2.0 and will be removed in Airflow 3.0.0. They were
+These are the extras that have been deprecated in 2.0 and will be removed in Airflow 3.0.0. They were
 all replaced by new extras, which have naming consistent with the names of provider packages.
The ``crypto`` extra is not needed any more, because all crypto dependencies are part of airflow package, diff --git a/docs/integration-logos/core/sql/sql.png b/docs/integration-logos/core/sql/sql.png new file mode 100644 index 0000000000000..29a2685d8bd1b Binary files /dev/null and b/docs/integration-logos/core/sql/sql.png differ diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index d91c742ab189d..44362b9b933d9 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -859,6 +859,7 @@ gcpcloudsql gcs gdbm generateUploadUrl +geq getattr getfqdn getframe @@ -1016,6 +1017,7 @@ latencies latin ldap ldaps +leq leveldb libs libz diff --git a/images/breeze/output-build-docs.svg b/images/breeze/output-build-docs.svg index 928f8f898ba5d..069d64d07e921 100644 --- a/images/breeze/output-build-docs.svg +++ b/images/breeze/output-build-docs.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + - Command: build-docs + Command: build-docs - + - - -Usage: breeze build-docs [OPTIONS] - -Build documentation in the container. - -╭─ Doc flags ──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---docs-only-dOnly build documentation. ---spellcheck-only-sOnly run spell checking. ---for-production-pBuilds documentation for official release i.e. all links point to stable version. ---package-filter-pList of packages to consider.                                                                 -(apache-airflow | apache-airflow-providers | apache-airflow-providers-airbyte |               -apache-airflow-providers-alibaba | apache-airflow-providers-amazon |                          -apache-airflow-providers-apache-beam | apache-airflow-providers-apache-cassandra |            -apache-airflow-providers-apache-drill | apache-airflow-providers-apache-druid |               -apache-airflow-providers-apache-hdfs | apache-airflow-providers-apache-hive |                 -apache-airflow-providers-apache-kylin | apache-airflow-providers-apache-livy |                -apache-airflow-providers-apache-pig | apache-airflow-providers-apache-pinot |                 -apache-airflow-providers-apache-spark | apache-airflow-providers-apache-sqoop |               -apache-airflow-providers-arangodb | apache-airflow-providers-asana |                          -apache-airflow-providers-celery | apache-airflow-providers-cloudant |                         -apache-airflow-providers-cncf-kubernetes | apache-airflow-providers-databricks |              -apache-airflow-providers-datadog | apache-airflow-providers-dbt-cloud |                       -apache-airflow-providers-dingding | apache-airflow-providers-discord |                        -apache-airflow-providers-docker | apache-airflow-providers-elasticsearch |                    -apache-airflow-providers-exasol | apache-airflow-providers-facebook |                         -apache-airflow-providers-ftp | apache-airflow-providers-github |                              -apache-airflow-providers-google | apache-airflow-providers-grpc |                             -apache-airflow-providers-hashicorp | apache-airflow-providers-http |                          -apache-airflow-providers-imap | apache-airflow-providers-influxdb |                           -apache-airflow-providers-jdbc | 
apache-airflow-providers-jenkins |                            -apache-airflow-providers-jira | apache-airflow-providers-microsoft-azure |                    -apache-airflow-providers-microsoft-mssql | apache-airflow-providers-microsoft-psrp |          -apache-airflow-providers-microsoft-winrm | apache-airflow-providers-mongo |                   -apache-airflow-providers-mysql | apache-airflow-providers-neo4j |                             -apache-airflow-providers-odbc | apache-airflow-providers-openfaas |                           -apache-airflow-providers-opsgenie | apache-airflow-providers-oracle |                         -apache-airflow-providers-pagerduty | apache-airflow-providers-papermill |                     -apache-airflow-providers-plexus | apache-airflow-providers-postgres |                         -apache-airflow-providers-presto | apache-airflow-providers-qubole |                           -apache-airflow-providers-redis | apache-airflow-providers-salesforce |                        -apache-airflow-providers-samba | apache-airflow-providers-segment |                           -apache-airflow-providers-sendgrid | apache-airflow-providers-sftp |                           -apache-airflow-providers-singularity | apache-airflow-providers-slack |                       -apache-airflow-providers-snowflake | apache-airflow-providers-sqlite |                        -apache-airflow-providers-ssh | apache-airflow-providers-tableau |                             -apache-airflow-providers-telegram | apache-airflow-providers-trino |                          -apache-airflow-providers-vertica | apache-airflow-providers-yandex |                          -apache-airflow-providers-zendesk | docker-stack | helm-chart)                                 -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---verbose-vPrint verbose information about performed steps. ---dry-run-DIf dry-run is set, commands are only printed, not executed. ---github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] ---help-hShow this message and exit. -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + +Usage: breeze build-docs [OPTIONS] + +Build documentation in the container. + +╭─ Doc flags ──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +--docs-only-dOnly build documentation. +--spellcheck-only-sOnly run spell checking. +--for-production-pBuilds documentation for official release i.e. all links point to stable version. +--package-filter-pList of packages to consider.                                                                 
+(apache-airflow | apache-airflow-providers | apache-airflow-providers-airbyte |               +apache-airflow-providers-alibaba | apache-airflow-providers-amazon |                          +apache-airflow-providers-apache-beam | apache-airflow-providers-apache-cassandra |            +apache-airflow-providers-apache-drill | apache-airflow-providers-apache-druid |               +apache-airflow-providers-apache-hdfs | apache-airflow-providers-apache-hive |                 +apache-airflow-providers-apache-kylin | apache-airflow-providers-apache-livy |                +apache-airflow-providers-apache-pig | apache-airflow-providers-apache-pinot |                 +apache-airflow-providers-apache-spark | apache-airflow-providers-apache-sqoop |               +apache-airflow-providers-arangodb | apache-airflow-providers-asana |                          +apache-airflow-providers-celery | apache-airflow-providers-cloudant |                         +apache-airflow-providers-cncf-kubernetes | apache-airflow-providers-core-sql |                +apache-airflow-providers-databricks | apache-airflow-providers-datadog |                      +apache-airflow-providers-dbt-cloud | apache-airflow-providers-dingding |                      +apache-airflow-providers-discord | apache-airflow-providers-docker |                          +apache-airflow-providers-elasticsearch | apache-airflow-providers-exasol |                    +apache-airflow-providers-facebook | apache-airflow-providers-ftp |                            +apache-airflow-providers-github | apache-airflow-providers-google |                           +apache-airflow-providers-grpc | apache-airflow-providers-hashicorp |                          +apache-airflow-providers-http | apache-airflow-providers-imap |                               +apache-airflow-providers-influxdb | apache-airflow-providers-jdbc |                           +apache-airflow-providers-jenkins | apache-airflow-providers-jira |                            +apache-airflow-providers-microsoft-azure | apache-airflow-providers-microsoft-mssql |         +apache-airflow-providers-microsoft-psrp | apache-airflow-providers-microsoft-winrm |          +apache-airflow-providers-mongo | apache-airflow-providers-mysql |                             +apache-airflow-providers-neo4j | apache-airflow-providers-odbc |                              +apache-airflow-providers-openfaas | apache-airflow-providers-opsgenie |                       +apache-airflow-providers-oracle | apache-airflow-providers-pagerduty |                        +apache-airflow-providers-papermill | apache-airflow-providers-plexus |                        +apache-airflow-providers-postgres | apache-airflow-providers-presto |                         +apache-airflow-providers-qubole | apache-airflow-providers-redis |                            +apache-airflow-providers-salesforce | apache-airflow-providers-samba |                        +apache-airflow-providers-segment | apache-airflow-providers-sendgrid |                        +apache-airflow-providers-sftp | apache-airflow-providers-singularity |                        +apache-airflow-providers-slack | apache-airflow-providers-snowflake |                         +apache-airflow-providers-sqlite | apache-airflow-providers-ssh |                              +apache-airflow-providers-tableau | apache-airflow-providers-telegram |                        +apache-airflow-providers-trino | apache-airflow-providers-vertica |                           +apache-airflow-providers-yandex | 
apache-airflow-providers-zendesk | docker-stack |           +helm-chart)                                                                                   +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +--verbose-vPrint verbose information about performed steps. +--dry-run-DIf dry-run is set, commands are only printed, not executed. +--github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] +--help-hShow this message and exit. +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/images/breeze/output-commands-hash.txt b/images/breeze/output-commands-hash.txt index e79557f189468..43538474e18ec 100644 --- a/images/breeze/output-commands-hash.txt +++ b/images/breeze/output-commands-hash.txt @@ -2,4 +2,4 @@ # This file is automatically generated by pre-commit. If you have a conflict with this file # Please do not solve it but run `breeze regenerate-command-images`. # This command should fix the conflict and regenerate help images that you have conflict with. -eb0a4aef0e360063ff1bef04107752c7 +906b523aaaaed54525b239a97f3303c1 diff --git a/images/breeze/output-prepare-provider-documentation.svg b/images/breeze/output-prepare-provider-documentation.svg index 49e1dd54b6781..de6e1e492523f 100644 --- a/images/breeze/output-prepare-provider-documentation.svg +++ b/images/breeze/output-prepare-provider-documentation.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + - Command: prepare-provider-documentation + Command: prepare-provider-documentation - + - - -Usage: breeze prepare-provider-documentation [OPTIONS] [airbyte | alibaba | amazon | apache.beam | apache.cassandra | -                                             apache.drill | apache.druid | apache.hdfs | apache.hive | apache.kylin | -                                             apache.livy | apache.pig | apache.pinot | apache.spark | apache.sqoop | -                                             arangodb | asana | celery | cloudant | cncf.kubernetes | databricks | -                                             datadog | dbt.cloud | dingding | discord | docker | elasticsearch | -                                             exasol | facebook | ftp | github | google | grpc | hashicorp | http | -                                             imap | influxdb | jdbc | jenkins | jira | microsoft.azure | -                                             microsoft.mssql | microsoft.psrp | microsoft.winrm | mongo | mysql | -                                             neo4j | odbc | openfaas | opsgenie | oracle | pagerduty | papermill | -                                             plexus | postgres | presto | qubole | redis | salesforce | samba | -                                             segment | sendgrid | sftp | singularity | slack | snowflake | sqlite | -                                             ssh | tableau | telegram | trino | vertica | yandex | zendesk]... - -Prepare CHANGELOG, README and COMMITS information for providers. - -╭─ Provider documentation preparation flags ───────────────────────────────────────────────────────────────────────────╮ ---debugDrop user in shell instead of running the command. Useful for debugging. 
-╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---verbose-vPrint verbose information about performed steps. ---dry-run-DIf dry-run is set, commands are only printed, not executed. ---github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] ---answer-aForce answer to questions.(y | n | q | yes | no | quit) ---help-hShow this message and exit. -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + +Usage: breeze prepare-provider-documentation [OPTIONS] [airbyte | alibaba | amazon | apache.beam | apache.cassandra | +                                             apache.drill | apache.druid | apache.hdfs | apache.hive | apache.kylin | +                                             apache.livy | apache.pig | apache.pinot | apache.spark | apache.sqoop | +                                             arangodb | asana | celery | cloudant | cncf.kubernetes | core.sql | +                                             databricks | datadog | dbt.cloud | dingding | discord | docker | +                                             elasticsearch | exasol | facebook | ftp | github | google | grpc | +                                             hashicorp | http | imap | influxdb | jdbc | jenkins | jira | +                                             microsoft.azure | microsoft.mssql | microsoft.psrp | microsoft.winrm | +                                             mongo | mysql | neo4j | odbc | openfaas | opsgenie | oracle | pagerduty | +                                             papermill | plexus | postgres | presto | qubole | redis | salesforce | +                                             samba | segment | sendgrid | sftp | singularity | slack | snowflake | +                                             sqlite | ssh | tableau | telegram | trino | vertica | yandex | +                                             zendesk]... + +Prepare CHANGELOG, README and COMMITS information for providers. + +╭─ Provider documentation preparation flags ───────────────────────────────────────────────────────────────────────────╮ +--debugDrop user in shell instead of running the command. Useful for debugging. +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +--verbose-vPrint verbose information about performed steps. +--dry-run-DIf dry-run is set, commands are only printed, not executed. +--github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] +--answer-aForce answer to questions.(y | n | q | yes | no | quit) +--help-hShow this message and exit. 
diff --git a/images/breeze/output-prepare-provider-packages.svg b/images/breeze/output-prepare-provider-packages.svg
index 1da53c7d566be..18a9bda713077 100644
--- a/images/breeze/output-prepare-provider-packages.svg
+++ b/images/breeze/output-prepare-provider-packages.svg
[... regenerated `breeze prepare-provider-packages` help screenshot (SVG): the positional provider list now includes `core.sql` between `cncf.kubernetes` and `databricks`; the package flags (--package-format, --version-suffix-for-pypi, --package-list-file, --debug) and common options are otherwise unchanged ...]
diff --git a/setup.py b/setup.py
index b4baa640b4aee..996c66c736e7e 100644
--- a/setup.py
+++ b/setup.py
@@ -683,6 +683,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version
         'celery': celery,
         'cloudant': cloudant,
         'cncf.kubernetes': kubernetes,
+        'core.sql': [],
         'databricks': databricks,
         'datadog': datadog,
         'dbt.cloud': http_provider,
diff --git a/tests/providers/core/__init__.py b/tests/providers/core/__init__.py
new file mode 100644
index 0000000000000..13a83393a9124
--- /dev/null
+++ b/tests/providers/core/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/tests/providers/core/sql/__init__.py b/tests/providers/core/sql/__init__.py
new file mode 100644
index 0000000000000..13a83393a9124
--- /dev/null
+++ b/tests/providers/core/sql/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/tests/providers/core/sql/operators/__init__.py b/tests/providers/core/sql/operators/__init__.py
new file mode 100644
index 0000000000000..13a83393a9124
--- /dev/null
+++ b/tests/providers/core/sql/operators/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
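For readers checking what the new `'core.sql': []` extra actually buys, here is a small sketch, not part of the patch, that inspects the installed distribution's metadata via the standard-library `importlib.metadata`. It assumes the patched `apache-airflow` is installed; note that newer packaging tools may normalize the dot in the extra name to a dash.

```python
# Hedged sketch (not from the patch): verify the 'core.sql' extra is declared
# on the installed apache-airflow distribution and pulls in no requirements.
from importlib.metadata import metadata

md = metadata("apache-airflow")

# setup.py's EXTRAS_REQUIRE keys become Provides-Extra entries in the metadata.
extras = md.get_all("Provides-Extra") or []
print("core.sql" in extras)  # expected: True once this patch is installed

# Because the extra maps to an empty list, no Requires-Dist entry is
# conditioned on it: `pip install 'apache-airflow[core.sql]'` adds nothing
# beyond core Airflow.
requires = md.get_all("Requires-Dist") or []
print(any('extra == "core.sql"' in r for r in requires))  # expected: False
```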
diff --git a/tests/providers/core/sql/operators/test_sql.py b/tests/providers/core/sql/operators/test_sql.py
new file mode 100644
index 0000000000000..d39e2d9faeccf
--- /dev/null
+++ b/tests/providers/core/sql/operators/test_sql.py
@@ -0,0 +1,114 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+from airflow.exceptions import AirflowException
+from airflow.providers.core.sql.operators.sql import SQLColumnCheckOperator, SQLTableCheckOperator
+
+
+class MockHook:
+    def get_first(self):
+        return
+
+
+def _get_mock_db_hook():
+    return MockHook()
+
+
+class TestColumnCheckOperator:
+
+    valid_column_mapping = {
+        "X": {
+            "null_check": {"equal_to": 0},
+            "distinct_check": {"equal_to": 10, "tolerance": 0.1},
+            "unique_check": {"geq_to": 10},
+            "min": {"leq_to": 1},
+            "max": {"less_than": 20, "greater_than": 10},
+        }
+    }
+
+    invalid_column_mapping = {"Y": {"invalid_check_name": {"expectation": 5}}}
+
+    def _construct_operator(self, monkeypatch, column_mapping, return_vals):
+        def get_first_return(*args):
+            return return_vals
+
+        operator = SQLColumnCheckOperator(
+            task_id="test_task", table="test_table", column_mapping=column_mapping
+        )
+        monkeypatch.setattr(operator, "get_db_hook", _get_mock_db_hook)
+        monkeypatch.setattr(MockHook, "get_first", get_first_return)
+        return operator
+
+    def test_check_not_in_column_checks(self, monkeypatch):
+        with pytest.raises(AirflowException, match="Invalid column check: invalid_check_name."):
+            self._construct_operator(monkeypatch, self.invalid_column_mapping, ())
+
+    def test_pass_all_checks_exact_check(self, monkeypatch):
+        operator = self._construct_operator(monkeypatch, self.valid_column_mapping, (0, 10, 10, 1, 19))
+        operator.execute()
+
+    def test_max_less_than_fails_check(self, monkeypatch):
+        with pytest.raises(AirflowException):
+            operator = self._construct_operator(monkeypatch, self.valid_column_mapping, (0, 10, 10, 1, 21))
+            operator.execute()
+            assert operator.column_mapping["X"]["max"]["success"] is False
+
+    def test_max_greater_than_fails_check(self, monkeypatch):
+        with pytest.raises(AirflowException):
+            operator = self._construct_operator(monkeypatch, self.valid_column_mapping, (0, 10, 10, 1, 9))
+            operator.execute()
+            assert operator.column_mapping["X"]["max"]["success"] is False
+
+    def test_pass_all_checks_inexact_check(self, monkeypatch):
+        operator = self._construct_operator(monkeypatch, self.valid_column_mapping, (0, 9, 12, 0, 15))
+        operator.execute()
+
+    def test_fail_all_checks_check(self, monkeypatch):
+        operator = self._construct_operator(
+            monkeypatch, self.valid_column_mapping, (1, 12, 11, -1, 20)
+        )
+        with pytest.raises(AirflowException):
+            operator.execute()
+
+
+class TestTableCheckOperator:
+
+    checks = {
+        "row_count_check": {"check_statement": "COUNT(*) == 1000"},
+        "column_sum_check": {"check_statement": "col_a + col_b < col_c"},
+    }
+
+    def _construct_operator(self, monkeypatch, checks, return_vals):
+        def get_first_return(*args):
+            return return_vals
+
+        operator = SQLTableCheckOperator(task_id="test_task", table="test_table", checks=checks)
+        monkeypatch.setattr(operator, "get_db_hook", _get_mock_db_hook)
+        monkeypatch.setattr(MockHook, "get_first", get_first_return)
+        return operator
+
+    def test_pass_all_checks_check(self, monkeypatch):
+        operator = self._construct_operator(monkeypatch, self.checks, ('1', 'y', 'true'))
+        operator.execute()
+
+    def test_fail_all_checks_check(self, monkeypatch):
+        operator = self._construct_operator(monkeypatch, self.checks, ('0', 'n', 'false'))
+        with pytest.raises(AirflowException):
+            operator.execute()
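The tests above mock `get_db_hook` and never touch a real database, so for orientation here is a minimal, hedged sketch of how the two operators might be wired into a DAG. The `conn_id` parameter and the connection name `my_sql_connection` are assumptions (they do not appear in this patch's tests); the check-dictionary shapes are taken directly from the test fixtures above.

```python
# Hedged sketch (not from the patch): a DAG exercising both new operators.
from datetime import datetime

from airflow import DAG
from airflow.providers.core.sql.operators.sql import (
    SQLColumnCheckOperator,
    SQLTableCheckOperator,
)

with DAG(
    dag_id="example_sql_checks_sketch",
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    # Column-level checks: for each column, map check names to the threshold
    # the computed value is compared against (shapes mirror the tests above).
    column_checks = SQLColumnCheckOperator(
        task_id="column_checks",
        conn_id="my_sql_connection",  # hypothetical connection id
        table="test_table",
        column_mapping={
            "X": {
                "null_check": {"equal_to": 0},
                "unique_check": {"geq_to": 10},
                "min": {"leq_to": 1},
                "max": {"less_than": 20, "greater_than": 10},
            }
        },
    )

    # Table-level checks: named SQL boolean statements evaluated per table.
    table_checks = SQLTableCheckOperator(
        task_id="table_checks",
        conn_id="my_sql_connection",  # hypothetical connection id
        table="test_table",
        checks={
            "row_count_check": {"check_statement": "COUNT(*) == 1000"},
            "column_sum_check": {"check_statement": "col_a + col_b < col_c"},
        },
    )

    column_checks >> table_checks
```

Either operator raises `AirflowException` when a check fails, so a failed data-quality assertion fails the task and, by default, halts downstream tasks.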