diff --git a/docs/conf.py b/docs/conf.py index 1f20af6190..943b16f86e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,3 +53,56 @@ } generate_mapping_docs() + +# -- Begin docs redirect section +# -- To test redirects in a local build, paste the redirect source, and append .html to the end. +# -- For example, "airflow3_compatibility/index" redirect must be tested using "airflow3_compatibility/index.html" +# -- https://documatt.com/sphinx-reredirects/usage/ +redirects = { + "configuration/caching": "../guides/optimize_performance/caching.html", + "configuration/cosmos-conf": "../reference/configs/cosmos-conf.html", + "configuration/execution-config": "../reference/configs/execution-config.html", + "configuration/memory_optimization": "../guides/optimize_performance/memory_optimization.html", + "configuration/partial-parsing": "../guides/optimize_performance/partial-parsing.html", + "configuration/profile-config": "../reference/configs/profile-config.html", + "configuration/project-config": "../reference/configs/project-config.html", + "configuration/selecting-excluding": "../guides/optimize_performance/selecting-excluding.html", + "getting_started/async-execution-mode": "../guides/run_dbt/airflow-worker/async-execution-mode.html", + "getting_started/aws-container-run-job": "../guides/run_dbt/container/aws-container-run-job.html", + "getting_started/azure-container-instance": "../guides/run_dbt/container/azure-container-instance.html", + "getting_started/custom-airflow-properties": "../run_dbt/airflow-worker/custom-airflow-properties.html", + "getting_started/docker": "../guides/run_dbt/container/docker.html", + "getting_started/execution-modes-local-conflicts": "../guides/run_dbt/airflow-worker/execution-modes-local-conflicts.html", + "getting_started/execution-modes": "../guides/run_dbt/execution-modes.html", + "getting_started/gcp-cloud-run-job": "../guides/run_dbt/container/gcp-cloud-run-job.html", + "getting_started/kubernetes": "../guides/run_dbt/container/kubernetes.html", + 
"getting_started/operators": "../guides/run_dbt/operators/operators.html", + "getting_started/watcher-execution-mode": "../guides/run_dbt/airflow-worker/watcher-execution-mode.html", + "getting_started/watcher-kubernetes-execution-mode": "../guides/run_dbt/container/watcher-kubernetes-execution-mode.html", + "profiles/AthenaAccessKey": "../reference/profiles/AthenaAccessKey.html", + "profiles/ClickhouseUserPassword": "../reference/profiles/ClickhouseUserPassword.html", + "profiles/DatabricksOauth": "../reference/profiles/DatabricksOauth.html", + "profiles/DatabricksToken": "../reference/profiles/DatabricksToken.html", + "profiles/DuckDBUserPassword": "../reference/profiles/DuckDBUserPassword.html", + "profiles/ExasolUserPassword": "../reference/profiles/ExasolUserPassword.html", + "profiles/GoogleCloudOauth": "../reference/profiles/GoogleCloudOauth.html", + "profiles/GoogleCloudServiceAccountDict": "../reference/profiles/GoogleCloudServiceAccountDict.html", + "profiles/GoogleCloudServiceAccountFile": "../reference/profiles/GoogleCloudServiceAccountFile.html", + "profiles/index": "../reference/profiles/index.html", + "profiles/MysqlUserPassword": "../reference/profiles/MysqlUserPassword.html", + "profiles/OracleUserPassword": "../reference/profiles/OracleUserPassword.html", + "profiles/PostgresUserPassword": "../reference/profiles/PostgresUserPassword.html", + "profiles/RedshiftUserPassword": "../reference/profiles/RedshiftUserPassword.html", + "profiles/SnowflakeEncryptedPrivateKeyFilePem": "../reference/profiles/SnowflakeEncryptedPrivateKeyFilePem.html", + "profiles/SnowflakeEncryptedPrivateKeyPem": "../reference/profiles/SnowflakeEncryptedPrivateKeyPem.html", + "profiles/SnowflakePrivateKeyPem": "../reference/profiles/SnowflakePrivateKeyPem.html", + "profiles/SnowflakeUserPassword": "../reference/profiles/SnowflakeUserPassword.html", + "profiles/SparkThrift": "../reference/profiles/SparkThrift.html", + "profiles/StandardSQLServerAuth": 
"../reference/profiles/StandardSQLServerAuth.html", + "profiles/StarrocksUserPassword": "../reference/profiles/StarrocksUserPassword.html", + "profiles/TeradataUserPassword": "../reference/profiles/TeradataUserPassword.html", + "profiles/TrinoCertificate": "../reference/profiles/TrinoCertificate.html", + "profiles/TrinoJWT": "../reference/profiles/TrinoJWT.html", + "profiles/TrinoLDAP": "../reference/profiles/TrinoLDAP.html", + "profiles/VerticaUserPassword": "../reference/profiles/VerticaUserPassword.html", +} diff --git a/docs/configuration/index.rst b/docs/configuration/index.rst deleted file mode 100644 index a6042327b0..0000000000 --- a/docs/configuration/index.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. _configuration: - -Configuration -============= - -Cosmos offers a number of configuration options to customize its behavior. For more info, check out the links on the left or the table of contents below. - -.. toctree:: - :caption: Contents: - - dbt Fusion - Multi-Project Setups - - Project Config - Profile Config - Execution Config - Render Config - - Parsing Methods - Configuring in Airflow - Configuring Lineage - Generating Docs - Hosting Docs - Scheduling - Testing Behavior - Selecting & Excluding - Partial Parsing - Source Nodes Rendering - Post-rendering DAG customization - Operator Args - Compiled SQL - Logging - Caching - Task display name - Callbacks - Memory Optimization diff --git a/docs/contributing.rst b/docs/contributing.rst index 006149faac..d50c120398 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -155,17 +155,19 @@ To run the checks manually, run: Writing Docs ____________ -`Hatch `_ is a unified command-line tool for managing dependencies and environment isolation for Python developers. In Cosmos, we use a Hatchto declare the dependencies required for the project itself, as well as for tests and documentation builds. 
+`Hatch `_ is a unified command-line tool for managing dependencies and environment isolation for Python developers. In Cosmos, we use Hatch to declare the dependencies required for the project itself, as well as for tests and documentation builds. If you don’t already have Hatch installed, please `install it `_ before proceeding. As an example, on macOS, you can do so with: .. code-block:: bash + brew install hatch You can run the docs locally by running the following: .. code-block:: bash + hatch run docs:serve diff --git a/docs/generate_mappings.py b/docs/generate_mappings.py index 52a7b1a787..bc825a896e 100644 --- a/docs/generate_mappings.py +++ b/docs/generate_mappings.py @@ -42,8 +42,8 @@ def get_fields_from_mapping(mapping: type[BaseProfileMapping]) -> list[Field]: def generate_mapping_docs( - templates_dir: str = "./templates", - output_dir: str = "./profiles", + templates_dir: str = "./reference/templates", + output_dir: str = "./reference/profiles", ) -> None: """ Generate a dedicated docs page per profile mapping. diff --git a/docs/getting_started/astro.rst b/docs/getting_started/astro.rst index b590575f2e..56e9fa0d53 100644 --- a/docs/getting_started/astro.rst +++ b/docs/getting_started/astro.rst @@ -1,7 +1,7 @@ .. _astro: -Getting Started on Astro -======================== +Getting Started with Cosmos on Astro +==================================== While it is possible to use Cosmos on Astro with all :ref:`Execution Modes `, we recommend using the ``local`` execution mode. It's the simplest to set up and use. diff --git a/docs/getting_started/dbt-airflow-concepts.rst b/docs/getting_started/dbt-airflow-concepts.rst index 70c4feae8d..ee55abe694 100644 --- a/docs/getting_started/dbt-airflow-concepts.rst +++ b/docs/getting_started/dbt-airflow-concepts.rst @@ -1,7 +1,7 @@ .. 
_dbt-airflow-concepts: -Similar dbt & Airflow concepts -============================== +Similar dbt and Airflow concepts +================================ While dbt is an open source tool for data transformations and analysis, using SQL, Airflow focuses on being a platform for the development, scheduling and monitoring of batch-oriented workflows, using Python. Although both tools have many diff --git a/docs/getting_started/index.rst b/docs/getting_started/index.rst index eb71d10221..59e07698a5 100644 --- a/docs/getting_started/index.rst +++ b/docs/getting_started/index.rst @@ -1,27 +1,28 @@ .. _getting-started: .. toctree:: + :maxdepth: 1 :hidden: - :caption: Contents: + :caption: Cosmos Fundamentals + + Similar dbt and Airflow concepts + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Quickstart Astro CLI quickstart - Astro - MWAA - GCC - Open-Source - Execution Modes - Docker Execution Mode - Kubernetes Execution Mode - Azure Container Instance Execution Mode - AWS Container Run Job Execution Mode - GCP Cloud Run Job Execution Mode - Airflow Async Execution Mode - Watcher Execution Mode - Watcher Kubernetes Execution Mode - dbt and Airflow Similar Concepts - Operators - Custom Airflow Properties +.. 
toctree:: + :maxdepth: 1 + :hidden: + :caption: Get started with Cosmos + + Open-source Airflow + Astro + Google Cloud Composer (GCC) + Amazon Managed Workflows for Apache Airflow (MWAA) Getting Started =============== @@ -46,11 +47,11 @@ For more customization, check out the different execution modes that Cosmos supp For specific guides, see the following: -- `Executing dbt DAGs with Docker Operators `__ -- `Executing dbt DAGs with KubernetesPodOperators `__ -- `Executing dbt DAGs with Watcher Kubernetes Mode `__ -- `Executing dbt DAGs with AzureContainerInstancesOperators `__ -- `Executing dbt DAGs with GcpCloudRunExecuteJobOperators `__ +- `Executing dbt DAGs with DockerOperators <../guides/run_dbt/container/docker.html>`__ +- `Executing dbt DAGs with KubernetesPodOperators <../guides/run_dbt/container/kubernetes.html>`__ +- `Executing dbt DAGs with Watcher Kubernetes Mode <../guides/run_dbt/container/watcher-kubernetes-execution-mode.html>`__ +- `Executing dbt DAGs with AzureContainerInstancesOperators <../guides/run_dbt/container/azure-container-instance.html>`__ +- `Executing dbt DAGs with GcpCloudRunExecuteJobOperators <../guides/run_dbt/container/gcp-cloud-run-job.html>`__ Concepts Overview diff --git a/docs/getting_started/mwaa.rst b/docs/getting_started/mwaa.rst index 5b7c41bde5..5b1da23439 100644 --- a/docs/getting_started/mwaa.rst +++ b/docs/getting_started/mwaa.rst @@ -1,7 +1,7 @@ .. _mwaa: -Getting Started on MWAA -======================= +Getting Started with Cosmos on Amazon Managed Workflows +======================================================= Users can face Python dependency issues when trying to use the Cosmos `Local Execution Mode `_ in Amazon Managed Workflows for `Apache Airflow® `_ (MWAA). diff --git a/docs/getting_started/open-source.rst b/docs/getting_started/open-source.rst index ba9bbdb15c..f5d1db832b 100644 --- a/docs/getting_started/open-source.rst +++ b/docs/getting_started/open-source.rst @@ -1,7 +1,7 @@ .. 
_open-source: -Getting Started on Open Source Airflow -====================================== +Getting Started with Cosmos on Open-source Airflow +================================================== When running open-source Airflow, your setup may vary. This guide assumes you have access to edit the underlying image. diff --git a/docs/configuration/compiled-sql.rst b/docs/guides/cosmos_devex/compiled-sql.rst similarity index 100% rename from docs/configuration/compiled-sql.rst rename to docs/guides/cosmos_devex/compiled-sql.rst diff --git a/docs/guides/cosmos_devex/index.rst b/docs/guides/cosmos_devex/index.rst new file mode 100644 index 0000000000..2ad3dff71b --- /dev/null +++ b/docs/guides/cosmos_devex/index.rst @@ -0,0 +1,14 @@ +.. _cosmos_devex: + + +Cosmos DevEx +============ + +.. toctree:: + :maxdepth: 1 + :caption: Cosmos DevEx + + lineage + compiled-sql + logging + task-display-name diff --git a/docs/configuration/lineage.rst b/docs/guides/cosmos_devex/lineage.rst similarity index 100% rename from docs/configuration/lineage.rst rename to docs/guides/cosmos_devex/lineage.rst diff --git a/docs/configuration/logging.rst b/docs/guides/cosmos_devex/logging.rst similarity index 100% rename from docs/configuration/logging.rst rename to docs/guides/cosmos_devex/logging.rst diff --git a/docs/configuration/task-display-name.rst b/docs/guides/cosmos_devex/task-display-name.rst similarity index 100% rename from docs/configuration/task-display-name.rst rename to docs/guides/cosmos_devex/task-display-name.rst diff --git a/docs/configuration/generating-docs.rst b/docs/guides/dbt_docs/generating-docs.rst similarity index 100% rename from docs/configuration/generating-docs.rst rename to docs/guides/dbt_docs/generating-docs.rst diff --git a/docs/configuration/hosting-docs.rst b/docs/guides/dbt_docs/hosting-docs.rst similarity index 100% rename from docs/configuration/hosting-docs.rst rename to docs/guides/dbt_docs/hosting-docs.rst diff --git 
a/docs/configuration/dbt-fusion.rst b/docs/guides/dbt_setup/dbt-fusion.rst similarity index 100% rename from docs/configuration/dbt-fusion.rst rename to docs/guides/dbt_setup/dbt-fusion.rst diff --git a/docs/guides/index.rst b/docs/guides/index.rst new file mode 100644 index 0000000000..f3e22486ed --- /dev/null +++ b/docs/guides/index.rst @@ -0,0 +1,61 @@ +.. _guides: + +Guides +====== + +Cosmos offers a number of configuration options to customize its behavior. For more info, check out the links on the left or the table of contents below. + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Set up dbt with Airflow + + dbt_setup/dbt-fusion + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Translating dbt into Airflow + + translate_dbt_to_airflow/index + +.. toctree:: + :maxdepth: 3 + :hidden: + :caption: How Cosmos runs dbt + + run_dbt/execution-modes + run_dbt/airflow-worker/index + run_dbt/container/index + run_dbt/callbacks/callbacks + run_dbt/operators/operators + run_dbt/customization/index + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Multi-project Setups + + multi_project/multi-project + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Documentation + + dbt_docs/generating-docs + dbt_docs/hosting-docs + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Cosmos DevEx + + cosmos_devex/index + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Optimizing Performance + + optimize_performance/index diff --git a/docs/configuration/multi-project.rst b/docs/guides/multi_project/multi-project.rst similarity index 99% rename from docs/configuration/multi-project.rst rename to docs/guides/multi_project/multi-project.rst index 70283bc410..5dfd79eea0 100644 --- a/docs/configuration/multi-project.rst +++ b/docs/guides/multi_project/multi-project.rst @@ -169,7 +169,7 @@ You can use either separate DAGs or a combined DAG with task groups. **Option 1: Combined DAG with Task Groups using dbt ls Load Mode (Recommended)** -.. 
literalinclude:: ../../dev/dags/cross_project_dbt_ls_dag.py +.. literalinclude:: ../../../dev/dags/cross_project_dbt_ls_dag.py :language: python :start-after: [START cross_project_dbt_ls_dag] :end-before: [END cross_project_dbt_ls_dag] @@ -178,7 +178,7 @@ You can use either separate DAGs or a combined DAG with task groups. This option uses pre-generated ``manifest.json`` files for faster DAG parsing (no ``dbt ls`` execution required). -.. literalinclude:: ../../dev/dags/cross_project_manifest_dag.py +.. literalinclude:: ../../../dev/dags/cross_project_manifest_dag.py :language: python :start-after: [START cross_project_manifest_dag] :end-before: [END cross_project_manifest_dag] diff --git a/docs/guides/optimize_performance/caching.rst b/docs/guides/optimize_performance/caching.rst new file mode 100644 index 0000000000..5bf8a6406c --- /dev/null +++ b/docs/guides/optimize_performance/caching.rst @@ -0,0 +1,230 @@ +.. _caching: + +Caching +======= + +This page explains the caching strategies used by Astronomer Cosmos (``astronomer-cosmos``). + +All Cosmos caching mechanisms can be enabled or turned off in the ``airflow.cfg`` file or using environment variables. + +.. note:: + For more information, see `configuring a Cosmos project <../../reference/configs/project-config.html>`_. + +Depending on its version, Cosmos creates a cache for three types of data: + +- The ``dbt ls`` output +- The dbt ``partial_parse.msgpack`` file +- The parsed manifest selectors + +It is possible to turn off all caching in Cosmos by exporting the environment variable ``AIRFLOW__COSMOS__ENABLE_CACHE=0``. +Disabling individual types of cache in Cosmos is also possible, as explained below. + +Caching the dbt ls output +~~~~~~~~~~~~~~~~~~~~~~~~~ + +(Introduced in Cosmos 1.5) + +While parsing a dbt project using `LoadMode.DBT_LS <./parsing-methods.html#dbt-ls>`_, Cosmos uses a subprocess to run ``dbt ls``. 
+This operation can be very costly; it can increase the DAG parsing times and affect not only the scheduler DAG processing but +also the tasks queueing time. + +Cosmos 1.5 introduced a feature to mitigate the performance issue associated with ``LoadMode.DBT_LS`` by caching the output +of this command as an `Airflow Variable `_. +Based on an initial `analysis `_, enabling this setting reduced some DAGs task queueing from 30s to 0s. Additionally, some users `reported improvements of 84% `_ in the DAG run time. + +This feature is on by default. To turn it off, export the following environment variable: ``AIRFLOW__COSMOS__ENABLE_CACHE_DBT_LS=0``. + +(Introduced in Cosmos 1.6 - Experimental feature) + +Starting with Cosmos 1.6.0, users can also set a remote directory path to store this cache instead of using Airflow +Variables. To do so, you need to configure a remote cache directory. See :ref:`remote_cache_dir` and +:ref:`remote_cache_dir_conn_id` for more information. This is an experimental feature introduced in 1.6.0 to gather +user feedback. The ``remote_cache_dir`` will eventually be merged into the :ref:`cache_dir` setting in upcoming +releases. + +**How the cache is refreshed** + +If using the default Variables cache approach, users can purge or delete the cache via Airflow UI by identifying and +deleting the cache key. In case you're using the alternative approach by setting the ``remote_cache_dir`` introduced +in Cosmos 1.6.0, you can delete the cache by removing the specific files by identifying them using your configured path +in the remote store. + +Cosmos will refresh the cache in a few circumstances: + +* if any files of the dbt project change +* if one of the arguments that affect the dbt ls command execution changes + +To evaluate whether the dbt project changed, Cosmos uses the MD5 hashes of all the files in the directory. 
+ +Additionally, if any of the following DAG configurations are changed, we'll automatically purge the cache of the DAGs that use that specific configuration: + +* ``ProjectConfig.dbt_vars`` +* ``ProjectConfig.env_vars`` +* ``ProjectConfig.partial_parse`` +* ``RenderConfig.env_vars`` +* ``RenderConfig.exclude`` +* ``RenderConfig.select`` +* ``RenderConfig.selector`` + +Finally, if users would like to define specific Airflow variables that, if changed, will cause the recreation of the cache, they can specify those by using: + +* ``RenderConfig.airflow_vars_to_purge_dbt_ls_cache`` + +Example: + +.. code-block:: python + + RenderConfig(airflow_vars_to_purge_dbt_ls_cache=["refresh_cache"]) + +**Cleaning up stale cache** + +Not rarely, Cosmos DbtDags and DbtTaskGroups may be renamed or deleted. In those cases, to clean up the Airflow metadata database, it is possible to use the method ``delete_unused_dbt_ls_cache``. + +The method deletes the Cosmos cache stored in Airflow Variables based on the last execution of their associated DAGs. + +As an example, the following clean-up DAG will delete any cache associated with Cosmos that has not been used for the last five days: + +.. literalinclude:: ../../../dev/dags/example_cosmos_cleanup_dag.py + :language: python + :start-after: [START cache_example] + :end-before: [END cache_example] + +**Cache key** + +The Airflow variables that represent the dbt ls cache are prefixed by ``cosmos_cache``. +When using ``DbtDag``, the keys use the DAG name. When using ``DbtTaskGroup``, they contain the ``TaskGroup`` and parent task groups and DAG. + +Examples: + +* The ``DbtDag`` "cosmos_dag" will have the cache represented by "cosmos_cache__basic_cosmos_dag". +* The ``DbtTaskGroup`` "customers" declared inside the DAG "basic_cosmos_task_group" will have the cache key "cosmos_cache__basic_cosmos_task_group__customers". 
+ +**Cache value** + +The cache values contain a few properties: + +* ``last_modified`` timestamp, represented using the ISO 8601 format. +* ``version`` is a hash that represents the version of the dbt project and arguments used to run dbt ls by the time Cosmos created the cache +* ``dbt_ls_compressed`` represents the dbt ls output compressed using zlib and encoded to base64 so Cosmos can record the value as a compressed string in the Airflow metadata database. +* ``dag_id`` is the DAG associated to this cache +* ``task_group_id`` is the TaskGroup associated to this cache +* ``cosmos_type`` is either ``DbtDag`` or ``DbtTaskGroup`` + +Caching the YAML selectors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +(Introduced in Cosmos 1.13) + +While parsing a dbt project using `LoadMode.DBT_MANIFEST <./parsing-methods.html#dbt-manifest>`_, if a ``selector`` argument is provided to the `RenderConfig <./render-config.html>`_ instance passed to the ``DbtDag`` or ``DbtTaskGroup``, +Cosmos will parse the preprocessed YAML selectors found in the manifest. The YAML selectors will be parsed into selection criteria that Cosmos will use to filter the dbt nodes to include in the Airflow DAG. The parsed selectors will be cached to improve performance during DAG parsing. + +This feature is on by default. To turn it off, export the following environment variable: ``AIRFLOW__COSMOS__ENABLE_CACHE_DBT_YAML_SELECTORS=0``. + +Similar to the caching of ``dbt ls`` output, users can also set a remote directory path to store this cache instead of using Airflow +Variables. To do so, you need to configure a remote cache directory. See :ref:`remote_cache_dir` and +:ref:`remote_cache_dir_conn_id` for more information. This is an experimental feature introduced in 1.6.0 to gather +user feedback. The ``remote_cache_dir`` will eventually be merged into the :ref:`cache_dir` setting in upcoming +releases. 
+ +**How the cache is refreshed** + +If using the default Variables cache approach, users can purge or delete the cache via Airflow UI by identifying and +deleting the cache key. In case you're using the alternative approach by setting the ``remote_cache_dir`` introduced +in Cosmos 1.6.0, you can delete the cache by removing the specific files by identifying them using your configured path +in the remote store. + +Cosmos will refresh the cache in a few circumstances: + +* if any files of the dbt project change +* if the YAML selectors in the manifest file change +* if the implementation of the YAML selector parsing logic changes + + * For new definitions of the dbt YAML selector specification. + +To evaluate whether the dbt project changed, Cosmos uses the MD5 hashes of all the files in the directory. + +Finally, if users would like to define specific Airflow variables that, if changed, will cause the recreation of the cache, they can specify those by using: + +* ``RenderConfig.airflow_vars_to_purge_dbt_yaml_selectors_cache`` + +Example: + +.. code-block:: python + + RenderConfig(airflow_vars_to_purge_dbt_yaml_selectors_cache=["refresh_cache"]) + +**Cleaning up stale cache** + +It is not uncommon for Cosmos DbtDags and DbtTaskGroups to be renamed or deleted. In those cases, to clean up the Airflow metadata database, it is possible to use the method ``delete_unused_dbt_yaml_selectors_cache``. + +The method deletes the Cosmos cache stored in Airflow Variables based on the last execution of their associated DAGs. + +As an example, the following clean-up DAG will delete any cache associated with Cosmos that has not been used for the last five days: + +.. literalinclude:: ../../../dev/dags/example_cosmos_cleanup_dag.py + :language: python + :start-after: [START cache_example] + :end-before: [END cache_example] + +.. 
warning:: + Because the backing Airflow Variable is shared between the dbt ls cache and the YAML selectors cache, delete methods for the non-remote cache delete the same Airflow Variable. + In other words, if you call ``delete_unused_dbt_ls_cache``, it will also delete the YAML selectors cache for the same DAG or TaskGroup, and vice versa, and calling ``delete_unused_dbt_yaml_selectors_cache`` will delete the corresponding dbt ls cache. + +**Cache key** + +The Airflow variables that represent the yaml selectors cache are prefixed by ``cosmos_cache``. +When using ``DbtDag``, the keys use the DAG name. When using ``DbtTaskGroup``, they contain the ``TaskGroup`` and parent task groups and DAG. + +Examples: + +* The ``DbtDag`` "cosmos_dag" will have the cache represented by "cosmos_cache__basic_cosmos_dag". +* The ``DbtTaskGroup`` "customers" declared inside the DAG "basic_cosmos_task_group" will have the cache key "cosmos_cache__basic_cosmos_task_group__customers". + +**Cache value** + +The cache values contain a few properties: + +* ``last_modified`` timestamp, represented using the ISO 8601 format. +* ``version`` is a hash that represents the version of the dbt project, the raw YAML selectors, and a hash of the YAML selector parser implementation version combined with the keys specified by ``airflow_vars_to_purge_dbt_yaml_selectors_cache`` +* ``raw_selectors_compressed`` represents the raw YAML selector definitions compressed using zlib and encoded to base64 +* ``parsed_selectors_compressed`` represents the parsed YAML selector definitions compressed using zlib and encoded to base64 +* ``dag_id`` is the DAG associated to this cache +* ``task_group_id`` is the TaskGroup associated to this cache +* ``cosmos_type`` is either ``DbtDag`` or ``DbtTaskGroup`` + +**Shared Cache Behavior** + +When using Airflow variables as the backend to store cached cosmos artifacts, both the dbt ls output and the YAML selectors cache will use the same variable. 
It should not be possible +to have both artifacts occupy the cache at the same time due to their distinct `RenderConfig.load_mode <./render-config.html>`_ and switching from using one cache to the other will invalidate the cache on the next version check. + +Caching the partial parse file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +(Introduced in Cosmos 1.4) + +After parsing the dbt project, dbt stores an internal project manifest in a file called ``partial_parse.msgpack`` (`official docs `_). +This file contributes significantly to the performance of running dbt commands when the dbt project did not change. + +Cosmos 1.4 introduced `support to partial parse files `_ both +provided by the user, and also by storing in the disk temporary folder in the Airflow scheduler and worker node the file +generated after running dbt commands. + +Users can customize where to store the cache using the setting ``AIRFLOW__COSMOS__CACHE_DIR``. + +It is possible to switch off this feature by exporting the environment variable ``AIRFLOW__COSMOS__ENABLE_CACHE_PARTIAL_PARSE=0``. + +For more information, read the `Cosmos partial parsing documentation <./partial-parsing.html>`_ + + +Caching the profiles +~~~~~~~~~~~~~~~~~~~~~~~~ + +(Introduced in Cosmos 1.5) + +Cosmos 1.5 introduced `support to profile caching `_, +enabling caching for the profile mapping in the path specified by env ``AIRFLOW__COSMOS__CACHE_DIR`` and ``AIRFLOW__COSMOS__PROFILE_CACHE_DIR_NAME``. +This feature facilitates the reuse of Airflow connections and ``profiles.yml``. + +Users have the flexibility to customize the cache storage location using the settings ``AIRFLOW__COSMOS__CACHE_DIR`` and ``AIRFLOW__COSMOS__PROFILE_CACHE_DIR_NAME``. 
+ +To disable this feature, users can set the environment variable ``AIRFLOW__COSMOS__ENABLE_CACHE_PROFILE=False`` diff --git a/docs/guides/optimize_performance/index.rst b/docs/guides/optimize_performance/index.rst new file mode 100644 index 0000000000..0ed84470d0 --- /dev/null +++ b/docs/guides/optimize_performance/index.rst @@ -0,0 +1,13 @@ +.. _optimize-performance: + +Optimize your Cosmos Performance +================================ + +.. toctree:: + :maxdepth: 1 + :caption: Optimize Performance + + partial-parsing + memory_optimization + selecting-excluding + caching diff --git a/docs/configuration/memory_optimization.rst b/docs/guides/optimize_performance/memory_optimization.rst similarity index 100% rename from docs/configuration/memory_optimization.rst rename to docs/guides/optimize_performance/memory_optimization.rst diff --git a/docs/configuration/partial-parsing.rst b/docs/guides/optimize_performance/partial-parsing.rst similarity index 100% rename from docs/configuration/partial-parsing.rst rename to docs/guides/optimize_performance/partial-parsing.rst diff --git a/docs/configuration/selecting-excluding.rst b/docs/guides/optimize_performance/selecting-excluding.rst similarity index 100% rename from docs/configuration/selecting-excluding.rst rename to docs/guides/optimize_performance/selecting-excluding.rst diff --git a/docs/getting_started/async-execution-mode.rst b/docs/guides/run_dbt/airflow-worker/async-execution-mode.rst similarity index 99% rename from docs/getting_started/async-execution-mode.rst rename to docs/guides/run_dbt/airflow-worker/async-execution-mode.rst index 6d61bcf22b..55d6778abc 100644 --- a/docs/getting_started/async-execution-mode.rst +++ b/docs/guides/run_dbt/airflow-worker/async-execution-mode.rst @@ -1,7 +1,5 @@ .. _async-execution-mode: -.. 
title:: Getting Started with Deferrable Operator - Airflow Async Execution Mode ============================ diff --git a/docs/getting_started/execution-modes-local-conflicts.rst b/docs/guides/run_dbt/airflow-worker/execution-modes-local-conflicts.rst similarity index 97% rename from docs/getting_started/execution-modes-local-conflicts.rst rename to docs/guides/run_dbt/airflow-worker/execution-modes-local-conflicts.rst index 9fec173751..0f9120127c 100644 --- a/docs/getting_started/execution-modes-local-conflicts.rst +++ b/docs/guides/run_dbt/airflow-worker/execution-modes-local-conflicts.rst @@ -10,8 +10,8 @@ When using the `Local Execution Mode `__, users may If you find errors, we recommend users isolating the installation of dbt from the Airflow installation. With the `Local Execution Mode `__, this can be accomplished by installing dbt in a separate -Python virtualenv and setting the `ExecutionConfig.dbt_executable_path <../configuration/execution-config.html>`_ and -`RenderConfig.dbt_executable_path <../configuration/render-config.html>`_ parameters. +Python virtualenv and setting the `ExecutionConfig.dbt_executable_path <../../../reference/configs/execution-config.html>`_ and +`RenderConfig.dbt_executable_path <../guides/render-config.html>`_ parameters. The page `execution modes `__ describes many other methods that support isolating dbt from Airflow. diff --git a/docs/guides/run_dbt/airflow-worker/index.rst b/docs/guides/run_dbt/airflow-worker/index.rst new file mode 100644 index 0000000000..eaa89c2d9f --- /dev/null +++ b/docs/guides/run_dbt/airflow-worker/index.rst @@ -0,0 +1,9 @@ +Run dbt in an Airflow worker +============================ + +.. 
toctree:: + :maxdepth: 1 + :caption: Run dbt in an Airflow worker + + async-execution-mode + watcher-execution-mode diff --git a/docs/getting_started/watcher-execution-mode.rst b/docs/guides/run_dbt/airflow-worker/watcher-execution-mode.rst similarity index 98% rename from docs/getting_started/watcher-execution-mode.rst rename to docs/guides/run_dbt/airflow-worker/watcher-execution-mode.rst index af7589650c..05bb21c7f7 100644 --- a/docs/getting_started/watcher-execution-mode.rst +++ b/docs/guides/run_dbt/airflow-worker/watcher-execution-mode.rst @@ -144,7 +144,7 @@ Example 1 — Using ``DbtDag`` with ``ExecutionMode.WATCHER`` You can enable WATCHER mode directly in your ``DbtDag`` configuration. This approach is best when your Airflow DAG is fully dedicated to a dbt project. -.. literalinclude:: ../../dev/dags/example_watcher.py +.. literalinclude:: ../../../../dev/dags/example_watcher.py :language: python :start-after: [START example_watcher] :end-before: [END example_watcher] @@ -370,7 +370,7 @@ Source freshness nodes Since Cosmos 1.6, it `supports the rendering of source nodes `_. -We noticed some Cosmos users use this feature alongside `overriding Cosmos source nodes `_ as sensors or another operator that allows them to skip the following branch of the DAG if the source is not fresh. +We noticed some Cosmos users use this feature alongside `overriding Cosmos source nodes `_ as sensors or another operator that allows them to skip the following branch of the DAG if the source is not fresh. This use case is not currently supported by the ``ExecutionMode.WATCHER``, since the ``dbt build`` command does not run `source freshness checks `_. @@ -451,7 +451,7 @@ Asynchronous sensor execution To disable asynchronous execution, set the ``deferrable`` flag to ``False`` in the ``operator_args``. -.. literalinclude:: ../../dev/dags/example_watcher.py +.. 
literalinclude:: ../../../../dev/dags/example_watcher.py :language: python :start-after: [START example_watcher_synchronous] :end-before: [END example_watcher_synchronous] diff --git a/docs/configuration/callbacks.rst b/docs/guides/run_dbt/callbacks/callbacks.rst similarity index 98% rename from docs/configuration/callbacks.rst rename to docs/guides/run_dbt/callbacks/callbacks.rst index c754245525..4b602ece3f 100644 --- a/docs/configuration/callbacks.rst +++ b/docs/guides/run_dbt/callbacks/callbacks.rst @@ -34,7 +34,7 @@ Example: Using Callbacks with a Single Operator To demonstrate how to specify a callback function for uploading files from the target directory, here’s an example using a single operator in an Airflow DAG: -.. literalinclude:: ../../dev/dags/example_operators.py +.. literalinclude:: ../../../../dev/dags/example_operators.py :language: python :start-after: [START single_operator_callback] :end-before: [END single_operator_callback] @@ -46,7 +46,7 @@ You can leverage the :ref:`remote_target_path` configuration to upload files from the target directory to a remote storage. Below is an example of how to define a callback helper function in your ``DbtDag`` that utilizes this configuration: -.. literalinclude:: ../../dev/dags/cosmos_callback_dag.py +.. literalinclude:: ../../../../dev/dags/cosmos_callback_dag.py :language: python :start-after: [START cosmos_callback_example] :end-before: [END cosmos_callback_example] diff --git a/docs/getting_started/aws-container-run-job.rst b/docs/guides/run_dbt/container/aws-container-run-job.rst similarity index 99% rename from docs/getting_started/aws-container-run-job.rst rename to docs/guides/run_dbt/container/aws-container-run-job.rst index db00fc8c3c..4321c8f346 100644 --- a/docs/getting_started/aws-container-run-job.rst +++ b/docs/guides/run_dbt/container/aws-container-run-job.rst @@ -1,7 +1,5 @@ .. _aws-container-run-job: -.. 
title:: Getting Started with Astronomer Cosmos on AWS ECS - Getting Started with Astronomer Cosmos on AWS ECS ================================================== diff --git a/docs/getting_started/azure-container-instance.rst b/docs/guides/run_dbt/container/azure-container-instance.rst similarity index 100% rename from docs/getting_started/azure-container-instance.rst rename to docs/guides/run_dbt/container/azure-container-instance.rst diff --git a/docs/getting_started/docker.rst b/docs/guides/run_dbt/container/docker.rst similarity index 100% rename from docs/getting_started/docker.rst rename to docs/guides/run_dbt/container/docker.rst diff --git a/docs/getting_started/gcp-cloud-run-job.rst b/docs/guides/run_dbt/container/gcp-cloud-run-job.rst similarity index 100% rename from docs/getting_started/gcp-cloud-run-job.rst rename to docs/guides/run_dbt/container/gcp-cloud-run-job.rst diff --git a/docs/guides/run_dbt/container/index.rst b/docs/guides/run_dbt/container/index.rst new file mode 100644 index 0000000000..9cccdbb29a --- /dev/null +++ b/docs/guides/run_dbt/container/index.rst @@ -0,0 +1,13 @@ +Run dbt in a container +====================== + +.. toctree:: + :maxdepth: 1 + :caption: Run dbt in a container + + aws-container-run-job + azure-container-instance + docker + gcp-cloud-run-job + kubernetes + watcher-kubernetes-execution-mode diff --git a/docs/getting_started/kubernetes.rst b/docs/guides/run_dbt/container/kubernetes.rst similarity index 92% rename from docs/getting_started/kubernetes.rst rename to docs/guides/run_dbt/container/kubernetes.rst index 607ba07bd7..d200589429 100644 --- a/docs/getting_started/kubernetes.rst +++ b/docs/guides/run_dbt/container/kubernetes.rst @@ -28,7 +28,7 @@ Additional KubernetesPodOperator parameters can be added to the ``operator_args` For instance, -.. literalinclude:: ../../dev/dags/jaffle_shop_kubernetes.py +.. 
literalinclude:: ../../../../dev/dags/jaffle_shop_kubernetes.py :language: python :start-after: [START kubernetes_tg_example] :end-before: [END kubernetes_tg_example] @@ -161,7 +161,7 @@ The Kubernetes execution mode has the following limitations: - Does not emit Airflow datasets, assets, and dataset aliases (there is an `open ticket #2329 `__ to address this) - Does not handle installing dbt deps for users (there is an `open ticket #679 `__ to address this) - Does not support `ProfileMapping `_ (there is an `open ticket #749 `__ to address this) -- Does not support `Callbacks `_ (there is an `open ticket #1575 `__ to address this) -- Does not expose Compiled SQL as a `templated field `_ -- Does not benefit from `Cosmos caching mechanisms `_ -- Does not support `generating dbt docs & uploading to an object store `_ (there is a `PR `_ to solve this for S3) +- Does not support `Callbacks `_ (there is an `open ticket #1575 `__ to address this) +- Does not expose Compiled SQL as a `templated field `_ +- Does not benefit from `Cosmos caching mechanisms `_ +- Does not support `generating dbt docs & uploading to an object store `_ (there is a `PR `_ to solve this for S3) diff --git a/docs/getting_started/watcher-kubernetes-execution-mode.rst b/docs/guides/run_dbt/container/watcher-kubernetes-execution-mode.rst similarity index 99% rename from docs/getting_started/watcher-kubernetes-execution-mode.rst rename to docs/guides/run_dbt/container/watcher-kubernetes-execution-mode.rst index 16dbbffd0a..d3f8a80a49 100644 --- a/docs/getting_started/watcher-kubernetes-execution-mode.rst +++ b/docs/guides/run_dbt/container/watcher-kubernetes-execution-mode.rst @@ -183,7 +183,7 @@ Example DAG Below is a complete example of a DAG using ``ExecutionMode.WATCHER_KUBERNETES``: -.. literalinclude:: ../../dev/dags/jaffle_shop_watcher_kubernetes.py +.. 
literalinclude:: ../../../../dev/dags/jaffle_shop_watcher_kubernetes.py :language: python ------------------------------------------------------------------------------- diff --git a/docs/guides/run_dbt/customization/index.rst b/docs/guides/run_dbt/customization/index.rst new file mode 100644 index 0000000000..44021154dc --- /dev/null +++ b/docs/guides/run_dbt/customization/index.rst @@ -0,0 +1,9 @@ +Additional Customization +======================== + +.. toctree:: + :maxdepth: 1 + :caption: Additional Customization + + operator-args + scheduling diff --git a/docs/configuration/operator-args.rst b/docs/guides/run_dbt/customization/operator-args.rst similarity index 100% rename from docs/configuration/operator-args.rst rename to docs/guides/run_dbt/customization/operator-args.rst diff --git a/docs/configuration/scheduling.rst b/docs/guides/run_dbt/customization/scheduling.rst similarity index 99% rename from docs/configuration/scheduling.rst rename to docs/guides/run_dbt/customization/scheduling.rst index 2d4e729c5b..0040135d37 100644 --- a/docs/configuration/scheduling.rst +++ b/docs/guides/run_dbt/customization/scheduling.rst @@ -77,7 +77,7 @@ This example DAG: .. The following renders in Sphinx but not Github: -.. literalinclude:: ../../dev/dags/basic_cosmos_dag.py +.. literalinclude:: ../../../../dev/dags/basic_cosmos_dag.py :language: python :start-after: [START local_example] :end-before: [END local_example] diff --git a/docs/getting_started/execution-modes.rst b/docs/guides/run_dbt/execution-modes.rst similarity index 98% rename from docs/getting_started/execution-modes.rst rename to docs/guides/run_dbt/execution-modes.rst index ea6a03f283..71d581c25b 100644 --- a/docs/getting_started/execution-modes.rst +++ b/docs/guides/run_dbt/execution-modes.rst @@ -1,7 +1,7 @@ .. 
_execution-modes: Execution Modes -=============== +=================== Cosmos can run ``dbt`` commands using several different approaches, called ``execution modes``: @@ -96,7 +96,7 @@ When using the ``local`` execution mode, Cosmos converts Airflow Connections int Example of how to use, for instance, when ``dbt`` was installed together with Cosmos: -.. literalinclude:: ../../dev/dags/basic_cosmos_dag.py +.. literalinclude:: ../../../dev/dags/basic_cosmos_dag.py :language: python :start-after: [START local_example] :end-before: [END local_example] @@ -122,7 +122,7 @@ Some drawbacks of this approach: Example of how to use: -.. literalinclude:: ../../dev/dags/example_virtualenv.py +.. literalinclude:: ../../../dev/dags/example_virtualenv.py :language: python :start-after: [START virtualenv_example] :end-before: [END virtualenv_example] @@ -170,7 +170,7 @@ Check the step-by-step guide on using the ``kubernetes`` execution mode at :ref: Example DAG: -.. literalinclude:: ../../dev/dags/jaffle_shop_kubernetes.py +.. literalinclude:: ../../../dev/dags/jaffle_shop_kubernetes.py :language: python :start-after: [START kubernetes_seed_example] :end-before: [END kubernetes_seed_example] @@ -314,7 +314,7 @@ as more dbt nodes will be run in parallel since they won't be blocking Airflow's Example DAG: -.. literalinclude:: ../../dev/dags/simple_dag_async.py +.. literalinclude:: ../../../dev/dags/simple_dag_async.py :language: python :start-after: [START airflow_async_execution_mode_example] :end-before: [END airflow_async_execution_mode_example] diff --git a/docs/guides/run_dbt/index.rst b/docs/guides/run_dbt/index.rst new file mode 100644 index 0000000000..a8c96d1b93 --- /dev/null +++ b/docs/guides/run_dbt/index.rst @@ -0,0 +1,405 @@ +.. _execution-modes: + +How Cosmos runs dbt +=================== + +.. toctree:: + :maxdepth: 3 + :caption: Run dbt in the Airflow worker + + airflow-worker/index + +.. 
toctree:: + :maxdepth: 3 + :caption: Run dbt in a container + + container/index + +.. toctree:: + :maxdepth: 3 + :caption: Callbacks + + callbacks/callbacks + +.. toctree:: + :maxdepth: 3 + :caption: Operators + + operators/operators + +.. toctree:: + :maxdepth: 3 + :caption: Customize Airflow + + customization/index + + +Cosmos can run ``dbt`` commands using several different approaches, called ``execution modes``: + +1. **local**: Run ``dbt`` commands using a local ``dbt`` installation (default) +2. **virtualenv**: Run ``dbt`` commands from Python virtual environments managed by Cosmos +3. **docker**: Run ``dbt`` commands from Docker containers managed by Cosmos (requires a pre-existing Docker image) +4. **kubernetes**: Run ``dbt`` commands from Kubernetes Pods managed by Cosmos (requires a pre-existing Docker image) +5. **aws_eks**: Run ``dbt`` commands from AWS EKS Pods managed by Cosmos (requires a pre-existing Docker image) +6. **azure_container_instance**: Run ``dbt`` commands from Azure Container Instances managed by Cosmos (requires a pre-existing Docker image) +7. **gcp_cloud_run_job**: Run ``dbt`` commands from GCP Cloud Run Job instances managed by Cosmos (requires a pre-existing Docker image) +8. **aws_ecs**: Run ``dbt`` commands from AWS ECS instances managed by Cosmos (requires a pre-existing Docker image) +9. **airflow_async**: (stable since Cosmos 1.9.0) Run the dbt resources from your dbt project asynchronously, by submitting the corresponding compiled SQLs to Apache Airflow's `Deferrable operators `__ +10. **watcher**: (experimental since Cosmos 1.11.0) Run a single ``dbt build`` command from a producer task and have sensor tasks to watch the progress of the producer, with improved DAG run time while maintaining the tasks lineage in the Airflow UI, and ability to retry failed tasks. Check the :ref:`watcher-execution-mode` for more details. +11. 
**watcher_kubernetes**: (experimental since Cosmos 1.13.0) Combines the speed of the watcher execution mode with the isolation of Kubernetes. Check the :ref:`watcher-kubernetes-execution-mode` for more details. + +The choice of the ``execution mode`` can vary based on each user's needs and concerns. For more details, check each execution mode described below. + +.. _execution-modes-comparison: + +.. list-table:: Execution Modes Comparison + :widths: 25 25 25 25 + :header-rows: 1 + + * - Execution Mode + - Task Duration + - Environment Isolation + - Cosmos Profile Management + * - Local + - Fast + - None + - Yes + * - Virtualenv + - Medium + - Lightweight + - Yes + * - Docker + - Slow + - Medium + - No + * - Kubernetes + - Slow + - High + - No + * - AWS_EKS + - Slow + - High + - No + * - Azure Container Instance + - Slow + - High + - No + * - GCP Cloud Run Job Instance + - Slow + - High + - No + * - AWS ECS + - Slow + - High + - No + * - Airflow Async + - Very Fast + - Medium + - Yes + * - Watcher + - Very Fast + - None + - Yes + * - Watcher Kubernetes + - Fast + - High + - No + +Local +----- + +By default, Cosmos uses the ``local`` execution mode. + +The ``local`` execution mode is the fastest way to run Cosmos operators since they don't install ``dbt`` nor build docker containers. However, it may not be an option for users using managed Airflow services such as +Google Cloud Composer, since Airflow and ``dbt`` dependencies can conflict (:ref:`execution-modes-local-conflicts`), the user may not be able to install ``dbt`` in a custom path. + +The ``local`` execution mode assumes a ``dbt`` binary is reachable within the Airflow worker node. + +If ``dbt`` was not installed as part of the Cosmos packages, +users can define a custom path to ``dbt`` by declaring the argument ``dbt_executable_path``. + +.. note:: + Starting in the 1.4 version, Cosmos tries to leverage the dbt partial parsing (``partial_parse.msgpack``) to speed up task execution. 
+ This feature is bound to `dbt partial parsing limitations `_. + Learn more: :ref:`partial-parsing`. + +When using the ``local`` execution mode, Cosmos converts Airflow Connections into a native ``dbt`` profiles file (``profiles.yml``). + +Example of how to use, for instance, when ``dbt`` was installed together with Cosmos: + +.. literalinclude:: ../../../dev/dags/basic_cosmos_dag.py + :language: python + :start-after: [START local_example] + :end-before: [END local_example] + + +Virtualenv +---------- + +If you're using managed Airflow on GCP (Cloud Composer), for instance, we recommend you use the ``virtualenv`` execution mode. + +The ``virtualenv`` mode isolates the Airflow worker dependencies from ``dbt`` by managing a Python virtual environment created during task execution and deleted afterwards. + +In this case, users are responsible for declaring which version of ``dbt`` they want to use by giving the argument ``py_requirements``. This argument can be set directly in operator instances or when instantiating ``DbtDag`` and ``DbtTaskGroup`` as part of ``operator_args``. + +Similar to the ``local`` execution mode, Cosmos converts Airflow Connections into a way ``dbt`` understands them by creating a ``dbt`` profile file (``profiles.yml``). +Also similar to the ``local`` execution mode, Cosmos will by default attempt to use a ``partial_parse.msgpack`` if one exists to speed up parsing. + +Some drawbacks of this approach: + +- It is slower than ``local`` because it creates a new Python virtual environment for each Cosmos dbt task run. +- If dbt is unavailable in the Airflow scheduler, the default ``LoadMode.DBT_LS`` will not work. In this scenario, users must use a :ref:`parsing-methods` that does not rely on dbt, such as ``LoadMode.DBT_MANIFEST``. +- Only ``InvocationMode.SUBPROCESS`` is currently supported; attempting to use ``InvocationMode.DBT_RUNNER`` will raise an error. + +Example of how to use: + +.. 
literalinclude:: ../../../dev/dags/example_virtualenv.py + :language: python + :start-after: [START virtualenv_example] + :end-before: [END virtualenv_example] + +Docker +------ + +The ``docker`` approach assumes users have a previously created Docker image, which should contain all the ``dbt`` pipelines and a ``profiles.yml``, managed by the user. + +The user has better environment isolation than when using ``local`` or ``virtualenv`` modes, but also more responsibility (ensuring the Docker container used has up-to-date files and managing secrets potentially in multiple places). + +The other challenge with the ``docker`` approach is if the Airflow worker is already running in Docker, which sometimes can lead to challenges running `Docker in Docker `__. + +This approach can be significantly slower than ``virtualenv`` since it may have to build the ``Docker`` container, which is slower than creating a Virtualenv with ``dbt-core``. +If dbt is unavailable in the Airflow scheduler, the default ``LoadMode.DBT_LS`` will not work. In this scenario, users must use a :ref:`parsing-methods` that does not rely on dbt, such as ``LoadMode.DBT_MANIFEST``. + +Check the step-by-step guide on using the ``docker`` execution mode at :ref:`docker`. + +Example DAG: + +.. code-block:: python + + docker_cosmos_dag = DbtDag( + # ... + execution_config=ExecutionConfig( + execution_mode=ExecutionMode.DOCKER, + ), + operator_args={ + "image": "dbt-jaffle-shop:1.0.0", + "network_mode": "bridge", + }, + ) + + +Kubernetes +---------- + +The ``kubernetes`` approach is a very isolated way of running ``dbt`` since the ``dbt`` commands run from within a Kubernetes Pod, usually in a separate host. + +It assumes the user has a Kubernetes cluster. It also expects the user to ensure the Docker container has up-to-date ``dbt`` pipelines and profiles, potentially leading the user to declare secrets in two places (Airflow and Docker container). 
+ +The ``Kubernetes`` deployment may be slower than ``Docker`` and ``Virtualenv`` assuming that the container image is built (which is slower than creating a Python ``virtualenv`` and installing ``dbt-core``) and the Airflow task needs to spin up a new ``Pod`` in Kubernetes. + +Check the step-by-step guide on using the ``kubernetes`` execution mode at :ref:`kubernetes`. + +Example DAG: + +.. literalinclude:: ../../../dev/dags/jaffle_shop_kubernetes.py + :language: python + :start-after: [START kubernetes_seed_example] + :end-before: [END kubernetes_seed_example] + +AWS_EKS +---------- + +The ``aws_eks`` approach is very similar to the ``kubernetes`` approach, but it is specifically designed to run on AWS EKS clusters. +It uses the `EKSPodOperator `_ +to run the dbt commands. You need to provide the ``cluster_name`` in your operator_args to connect to the AWS EKS cluster. + + +Example DAG: + +.. code-block:: python + + postgres_password_secret = Secret( + deploy_type="env", + deploy_target="POSTGRES_PASSWORD", + secret="postgres-secrets", + key="password", + ) + + docker_cosmos_dag = DbtDag( + # ... + execution_config=ExecutionConfig( + execution_mode=ExecutionMode.AWS_EKS, + ), + operator_args={ + "image": "dbt-jaffle-shop:1.0.0", + "cluster_name": CLUSTER_NAME, + "get_logs": True, + "is_delete_operator_pod": False, + "secrets": [postgres_password_secret], + }, + ) + +Azure Container Instance +------------------------ +.. versionadded:: 1.4 + +Similar to the ``kubernetes`` approach, using ``Azure Container Instances`` as the execution mode gives a very isolated way of running ``dbt``, since the ``dbt`` run itself is run within a container running in an Azure Container Instance. + +This execution mode requires the user to have an Azure environment that can be used to run Azure Container Groups (see :ref:`azure-container-instance` for more details on the exact requirements). 
Similarly to the ``Docker`` and ``Kubernetes`` execution modes, a Docker container should be available, containing the up-to-date ``dbt`` pipelines and profiles. + +Each task will create a new container on Azure, giving full isolation. This, however, comes at the cost of speed, as this separation of tasks introduces some overhead. Please check out the step-by-step guide for using Azure Container Instance as the execution mode. + + +.. code-block:: python + + docker_cosmos_dag = DbtDag( + # ... + execution_config=ExecutionConfig( + execution_mode=ExecutionMode.AZURE_CONTAINER_INSTANCE + ), + operator_args={ + "ci_conn_id": "aci", + "registry_conn_id": "acr", + "resource_group": "my-rg", + "name": "my-aci-{{ ti.task_id.replace('.','-').replace('_','-') }}", + "region": "West Europe", + "image": "dbt-jaffle-shop:1.0.0", + }, + ) + +GCP Cloud Run Job +------------------------ +.. versionadded:: 1.7 + +The ``gcp_cloud_run_job`` execution mode is particularly useful for users who prefer to run their ``dbt`` commands on Google Cloud infrastructure, taking advantage of Cloud Run's scalability, isolation, and managed service capabilities. + +For the ``gcp_cloud_run_job`` execution mode to work, a Cloud Run Job instance must first be created using a previously built Docker container. This container should include the latest ``dbt`` pipelines and profiles. You can find more details in the `Cloud Run Job creation guide `__ . + +This execution mode allows users to run ``dbt`` core CLI commands in a Google Cloud Run Job instance. This mode leverages the ``CloudRunExecuteJobOperator`` from the Google Cloud Airflow provider to execute commands within a Cloud Run Job instance, where ``dbt`` is already installed. Similarly to the ``Docker`` and ``Kubernetes`` execution modes, a Docker container should be available, containing the up-to-date ``dbt`` pipelines and profiles. + +Each task will create a new Cloud Run Job execution, giving full isolation. 
The separation of tasks adds extra overhead; however, that can be mitigated by using the ``concurrency`` parameter in ``DbtDag``, which will result in parallelized execution of ``dbt`` models. + + +.. code-block:: python + + gcp_cloud_run_job_cosmos_dag = DbtDag( + # ... + execution_config=ExecutionConfig(execution_mode=ExecutionMode.GCP_CLOUD_RUN_JOB), + operator_args={ + "project_id": "my-gcp-project-id", + "region": "europe-west1", + "job_name": "my-crj-{{ ti.task_id.replace('.','-').replace('_','-') }}", + }, + ) + + +AWS ECS +--------- +.. versionadded:: 1.9.0 + +Using ``AWS Elastic Container Service (ECS)`` as the execution mode provides an isolated and scalable way to run ``dbt`` tasks within an AWS ECS service. This execution mode ensures that each ``dbt`` run is performed inside a dedicated container running in an ECS task. + +This execution mode requires the user to have an AWS environment configured to run ECS tasks (see :ref:`aws-container-run-job` for more details on the exact requirements). Similar to the ``Docker`` and ``Kubernetes`` execution modes, a Docker container should be available, containing the up-to-date ``dbt`` pipelines and profiles. + +Each task will create a new ECS task execution, providing full isolation. However, this separation introduces some overhead in execution time due to container startup and provisioning. For users who require faster execution times, configuring appropriate ECS task definitions and cluster optimizations can help mitigate these delays. + +Please refer to the step-by-step guide for using AWS ECS as the execution mode. + +.. code-block:: python + + aws_ecs_cosmos_dag = DbtDag( + # ... 
+ execution_config=ExecutionConfig(execution_mode=ExecutionMode.AWS_ECS), + operator_args={ + "aws_conn_id": "aws_default", + "cluster": "my-ecs-cluster", + "task_definition": "my-dbt-task", + "container_name": "dbt-container", + "launch_type": "FARGATE", + "deferrable": True, + "network_configuration": { + "awsvpcConfiguration": { + "subnets": ["<<>>"], + "assignPublicIp": "ENABLED", + }, + }, + "environment_variables": {"DBT_PROFILE_NAME": "default"}, + }, + ) + +.. _airflow-async-execution-mode: + +Airflow Async +------------- + +.. versionadded:: 1.9.0 + +Although this execution mode was introduced in Cosmos 1.9, we strongly encourage users to use Cosmos 1.11, which has significant performance improvements. +In comparison to the ``local``, the ``airflow_async`` execution mode can reduce the execution time of a dbt project by up to 36%. + +The ``airflow_async`` execution mode is a way to run the dbt resources from your dbt project using Apache Airflow's +`Deferrable operators `__. +This execution mode could be preferred when you've long running resources and you want to run them asynchronously by +leveraging Airflow's deferrable operators. With that, you would be able to potentially observe higher throughput of tasks +as more dbt nodes will be run in parallel since they won't be blocking Airflow's worker slots. + +Example DAG: + +.. literalinclude:: ../../../dev/dags/simple_dag_async.py + :language: python + :start-after: [START airflow_async_execution_mode_example] + :end-before: [END airflow_async_execution_mode_example] + +For a full step-by-step guide and limitations, check the :ref:`async-execution-mode` page. + + +Watcher Execution Mode (Experimental) +------------------------------------- + +.. versionadded:: 1.11.0 + +The ``watcher`` execution mode is an experimental execution mode that runs a single ``dbt build`` command from a producer task and has sensor tasks to watch the progress of the producer. 
+It is designed to improve DAG run time while maintaining the tasks lineage in the Airflow UI, and ability to retry failed tasks. + +Check the :ref:`watcher-execution-mode` for more details. + + +Watcher Kubernetes Execution Mode (Experimental) +------------------------------------------------ + +.. versionadded:: 1.13.0 + +The ``watcher_kubernetes`` execution mode combines the speed of the ``watcher`` execution mode with the isolation of the ``kubernetes`` execution mode. It runs a single ``dbt build`` command from a producer task inside a Kubernetes pod and has sensor tasks to watch the progress of the producer. + +Check the :ref:`watcher-kubernetes-execution-mode` for more details. + + +.. _invocation_modes: + +Invocation Modes +================ +.. versionadded:: 1.4 + +For ``ExecutionMode.LOCAL`` execution mode, Cosmos supports two invocation modes for running dbt: + +1. ``InvocationMode.SUBPROCESS``: In this mode, Cosmos runs dbt cli commands using the Python ``subprocess`` module and parses the output to capture logs and to raise exceptions. + +2. ``InvocationMode.DBT_RUNNER``: In this mode, Cosmos uses the ``dbtRunner`` available for `dbt programmatic invocations `__ to run dbt commands. \ + In order to use this mode, dbt must be installed in the same local environment. This mode does not have the overhead of spawning new subprocesses or parsing the output of dbt commands and is faster than ``InvocationMode.SUBPROCESS``. \ + This mode requires dbt version 1.5.0 or higher. It is up to the user to resolve :ref:`execution-modes-local-conflicts` when using this mode. + +The invocation mode can be set in the ``ExecutionConfig`` as shown below: + +.. code-block:: python + + from cosmos.constants import InvocationMode + + dag = DbtDag( + # ... 
+ execution_config=ExecutionConfig( + execution_mode=ExecutionMode.LOCAL, + invocation_mode=InvocationMode.DBT_RUNNER, + ), + ) + +If the invocation mode is not set, Cosmos will attempt to use ``InvocationMode.DBT_RUNNER`` if dbt is installed in the same environment as the worker, otherwise it will fall back to ``InvocationMode.SUBPROCESS``. diff --git a/docs/getting_started/operators.rst b/docs/guides/run_dbt/operators/operators.rst similarity index 88% rename from docs/getting_started/operators.rst rename to docs/guides/run_dbt/operators/operators.rst index 9f6658b6b1..448e037e77 100644 --- a/docs/getting_started/operators.rst +++ b/docs/guides/run_dbt/operators/operators.rst @@ -18,7 +18,7 @@ The ``DbtCloneLocalOperator`` implement `dbt clone = 1.5 and cosmos >= 1.6.0. @@ -70,7 +70,7 @@ The ``on_warning_callback`` is a callback parameter available on the ``DbtSource Example: -.. literalinclude:: ../../dev/dags/example_source_rendering.py/ +.. literalinclude:: ../../../../dev/dags/example_source_rendering.py :language: python :start-after: [START cosmos_source_node_example] :end-before: [END cosmos_source_node_example] diff --git a/docs/configuration/parsing-methods.rst b/docs/guides/translate_dbt_to_airflow/parsing-methods.rst similarity index 96% rename from docs/configuration/parsing-methods.rst rename to docs/guides/translate_dbt_to_airflow/parsing-methods.rst index 9eb654d04f..567fc4c137 100644 --- a/docs/configuration/parsing-methods.rst +++ b/docs/guides/translate_dbt_to_airflow/parsing-methods.rst @@ -56,7 +56,7 @@ Examples of how to supply ``manifest.json`` using ``manifest_path`` argument: - Local path: -.. literalinclude:: ../../dev/dags/cosmos_manifest_example.py +.. 
literalinclude:: ../../../dev/dags/cosmos_manifest_example.py :language: python :start-after: [START local_example] :end-before: [END local_example] @@ -66,7 +66,7 @@ Examples of how to supply ``manifest.json`` using ``manifest_path`` argument: Ensure that you have the required dependencies installed to use the S3 URL. You can install the required dependencies using the following command: ``pip install "astronomer-cosmos[amazon]"`` -.. literalinclude:: ../../dev/dags/cosmos_manifest_example.py +.. literalinclude:: ../../../dev/dags/cosmos_manifest_example.py :language: python :start-after: [START aws_s3_example] :end-before: [END aws_s3_example] @@ -76,7 +76,7 @@ using the following command: ``pip install "astronomer-cosmos[amazon]"`` Ensure that you have the required dependencies installed to use the GCS URL. You can install the required dependencies using the following command: ``pip install "astronomer-cosmos[google]"`` -.. literalinclude:: ../../dev/dags/cosmos_manifest_example.py +.. literalinclude:: ../../../dev/dags/cosmos_manifest_example.py :language: python :start-after: [START gcp_gs_example] :end-before: [END gcp_gs_example] @@ -86,7 +86,7 @@ using the following command: ``pip install "astronomer-cosmos[google]"`` Ensure that you have the required dependencies installed to use the Azure blob URL. You can install the required dependencies using the following command: ``pip install "astronomer-cosmos[microsoft]"`` -.. literalinclude:: ../../dev/dags/cosmos_manifest_example.py +.. 
literalinclude:: ../../../dev/dags/cosmos_manifest_example.py :language: python :start-after: [START azure_abfs_example] :end-before: [END azure_abfs_example] diff --git a/docs/configuration/render-config.rst b/docs/guides/translate_dbt_to_airflow/render-config.rst similarity index 99% rename from docs/configuration/render-config.rst rename to docs/guides/translate_dbt_to_airflow/render-config.rst index f153d3c3d1..425e106124 100644 --- a/docs/configuration/render-config.rst +++ b/docs/guides/translate_dbt_to_airflow/render-config.rst @@ -63,7 +63,7 @@ Your pipeline may even have specific node types not part of the standard dbt def The following example illustrates how it is possible to tell Cosmos how to convert two different types of nodes (``source`` and ``exposure``) into Airflow: -.. literalinclude:: ../../dev/dags/example_cosmos_sources.py +.. literalinclude:: ../../../dev/dags/example_cosmos_sources.py :language: python :start-after: [START custom_dbt_nodes] :end-before: [END custom_dbt_nodes] diff --git a/docs/index.rst b/docs/index.rst index beee4f40bb..e27883979e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,13 +2,14 @@ .. toctree:: :hidden: - :maxdepth: 2 + :maxdepth: 0 :caption: Contents: Home Getting Started - Configuration - Profiles + Guides + Optimize Performance + Reference Contributing Airflow 3 compatibility Compatibility Policy @@ -110,8 +111,7 @@ for managing and scaling your data workflows. Getting Started with Airflow Async Execution Mode ------------------------------------------------- -See our :doc:`Getting Started with Airflow Async Execution Mode ` for details. - +See our :doc:`Getting Started with Airflow Async Execution Mode ` for details. 
Airflow 3 compatibility ----------------------- diff --git a/docs/configuration/caching.rst b/docs/optimize_performance/caching.rst similarity index 100% rename from docs/configuration/caching.rst rename to docs/optimize_performance/caching.rst diff --git a/docs/optimize_performance/index.rst b/docs/optimize_performance/index.rst new file mode 100644 index 0000000000..0ed84470d0 --- /dev/null +++ b/docs/optimize_performance/index.rst @@ -0,0 +1,13 @@ +.. _optimize-performance: + +Optimize your Cosmos Performance +================================ + +.. toctree:: + :maxdepth: 1 + :caption: Optimize Performance + + partial-parsing + memory_optimization + selecting-excluding + caching diff --git a/docs/optimize_performance/memory_optimization.rst b/docs/optimize_performance/memory_optimization.rst new file mode 100644 index 0000000000..c5ad81ae04 --- /dev/null +++ b/docs/optimize_performance/memory_optimization.rst @@ -0,0 +1,259 @@ +.. _memory-optimization: + +Memory Optimization Options for Astronomer Cosmos +================================================== + +When running dbt pipelines with Astronomer Cosmos, the framework executes dbt commands that can consume significant memory resources. In high-memory scenarios, tasks may reach a zombie state or workers may be killed due to Out of Memory (OOM) errors, leading to pipeline failures and reduced reliability. + +Cosmos provides various configuration options and execution modes to optimize memory usage, reduce worker resource consumption, and prevent OOM issues. This document outlines these memory optimization strategies, from simple configuration changes to advanced execution modes that can dramatically reduce memory footprint while maintaining or improving pipeline performance. + +1. Enable Memory-Optimized Imports +------------------------------------- + +**Impact**: High - Reduces memory footprint both at the DAG Processor and at Worker nodes. + +**Configuration**: + +.. 
code-block:: cfg + + # In airflow.cfg + [cosmos] + enable_memory_optimised_imports = True + +.. code-block:: bash + + # Or via environment variable + export AIRFLOW__COSMOS__ENABLE_MEMORY_OPTIMISED_IMPORTS=True + +**What it does**: Disables eager imports in ``cosmos/__init__.py``, preventing unused modules and classes from being loaded into memory. + +**Note**: When enabled, you must use full module paths for importing classes, functions and objects from Cosmos: + +.. code-block:: python + + # Instead of: + from cosmos import DbtDag, ProjectConfig, RenderConfig + + # Use: + from cosmos.airflow.dag import DbtDag + from cosmos.config import ProjectConfig, RenderConfig + +**Default**: ``False`` (will become default in Cosmos 2.0.0) + +----------------------------------------------------------------- + +2. Use DBT_MANIFEST Load Mode +------------------------------ + +**Impact**: High - Avoids running ``dbt ls`` subprocess which can consume significant CPU and memory. This reduces memory consumption when a cache miss occurs in the DBT LS method. It may not significantly reduce the memory footprint if there is a cache hit. + +**Configuration**: + +.. code-block:: python + + from cosmos.airflow.dag import DbtDag + from cosmos.config import ProjectConfig, RenderConfig + from cosmos.constants import LoadMode + + DbtDag( + project_config=ProjectConfig(dbt_project_path="/path/to/dbt/project"), + render_config=RenderConfig( + load_method=LoadMode.DBT_MANIFEST, # Use manifest instead of DBT_LS + ), + # ... + ) + +**What it does**: Uses pre-compiled ``manifest.json`` file instead of running ``dbt ls`` command, avoiding subprocess overhead and memory usage. + +**Requirements**: You need a ``manifest.json`` file (can be generated with ``dbt compile`` or ``dbt run``). + +--------------------------------- + +3. 
Use DBT_RUNNER Invocation Mode +----------------------------------- + +* (default for ``ExecutionMode.LOCAL`` since 1.4.0, default for ``LoadMode.DBT_LS`` since Cosmos 1.9.0) + +**Impact**: Medium-High. Depends on the execution and load modes used. Can reduce subprocess overhead and memory usage compared to subprocess mode. + +**Configuration**: + +.. code-block:: python + + from cosmos.airflow.dag import DbtDag + from cosmos.config import ProjectConfig, RenderConfig + from cosmos.constants import LoadMode, InvocationMode + + DbtDag( + project_config=ProjectConfig(dbt_project_path="/path/to/dbt/project"), + render_config=RenderConfig( + load_method=LoadMode.DBT_LS, + invocation_mode=InvocationMode.DBT_RUNNER, # Default since Cosmos 1.9 + ), + # ... + ) + +**What it does**: Uses ``dbtRunner`` (dbt programmatic API) instead of Python subprocess, reducing memory and CPU overhead. + +**Requirements**: dbt version 1.5.0+ and dbt installed in the same Python environment as Airflow. + +**Default**: default behaviour for ``ExecutionMode.LOCAL`` since 1.4.0, default behaviour for ``LoadMode.DBT_LS`` since Cosmos 1.9.0 + +------------------------------------------------------------------------------- + +4. Use Partial Parse (Keep Enabled) +------------------------------------ + +**Impact**: Low - Actually reduces memory by avoiding full project parsing. + +**Configuration**: + +.. code-block:: cfg + + # In airflow.cfg (should be enabled, not disabled) + [cosmos] + enable_cache_partial_parse = True + +.. code-block:: python + + # Also ensure mock profiles are disabled for partial parse to work + # In your DbtDag: + render_config = RenderConfig( + enable_mock_profile=False, # Required for partial parse + ) + +**What it does**: Uses dbt's ``partial_parse.msgpack`` to avoid re-parsing unchanged parts of the project, reducing memory and CPU usage.
+ +**Default**: ``True`` since Cosmos 1.4.0 + +------------------------------------------------------------------------------- + +5. Use ExecutionMode.WATCHER +----------------------------- + +**Impact**: Very High - Dramatically reduces Airflow worker slot usage and memory consumption. + +**Configuration** + +- `Getting Started with ExecutionMode.WATCHER `_ +- `Configure a Custom Queue for Producer and Watcher Tasks in ExecutionMode.WATCHER `_ + +------------------------------------------------------------------------------- + +6. Control DAG-Level Concurrency with ``concurrency`` Parameter +---------------------------------------------------------------- + +**Impact**: High - Limits concurrent task execution per DAG based on available resources. + +**Configuration**: + +.. code-block:: python + + from cosmos.airflow.dag import DbtDag + from cosmos.config import ProjectConfig, RenderConfig, ExecutionConfig + from cosmos.constants import ExecutionMode + + DbtDag( + project_config=ProjectConfig(dbt_project_path="/path/to/dbt/project"), + execution_config=ExecutionConfig( + execution_mode=ExecutionMode.LOCAL, # Or WATCHER + ), + # DAG-level concurrency control + concurrency=10, # Maximum concurrent tasks across all active DAG runs + max_active_runs=3, # Maximum concurrent DAG runs (optional) + # ... + ) + +**What it does**: + +- **``concurrency``**: The maximum number of task instances allowed to run concurrently across all active DAG runs for a given DAG +- Allows different DAGs to have different concurrency limits (e.g., one DAG runs 32 tasks at once, another runs 16) +- If not defined, defaults to the environment-level setting ``max_active_tasks_per_dag`` (default: 16) +- Works in combination with ``max_active_runs`` to control both task and DAG run concurrency + +**Example: Different Concurrency for Different DAGs**: + +.. 
code-block:: python + + # High-resource DAG - allow more concurrent tasks + high_resource_dag = DbtDag( + dag_id="high_resource_dbt_dag", + concurrency=32, # Allow 32 concurrent tasks + max_active_runs=2, + # ... + ) + + # Low-resource DAG - limit concurrent tasks + low_resource_dag = DbtDag( + dag_id="low_resource_dbt_dag", + concurrency=8, # Only 8 concurrent tasks + max_active_runs=1, + # ... + ) + +**Benefits**: + +- **Per-DAG Control**: Set different concurrency limits for different DAGs based on their resource needs +- **Resource Protection**: Prevent resource-intensive DAGs from overwhelming workers +- **Flexible Configuration**: Adjust concurrency without changing environment-level settings +- **Works with Pools**: Can be combined with task pools for even more granular control + +**Best Practices**: + +1. Set ``concurrency`` lower than your total worker capacity to leave room for other DAGs +2. Use lower ``concurrency`` values for resource-intensive DAGs (e.g., large dbt models) +3. Combine with ``max_active_runs`` to control both task and DAG run parallelism +4. Monitor task queuing - if tasks are queued for long periods, consider increasing ``concurrency`` + +**Reference**: `Airflow Scaling Workers Documentation `_ + +------------------------------------------------------------------------------- + +7. Enable Task Profiling with Debug Mode +----------------------------------------- + +**Impact**: Low - Provides visibility into memory usage patterns to help identify optimization opportunities and prevent OOM issues. + +**Configuration**: + +.. code-block:: bash + + # In airflow.cfg + [cosmos] + enable_debug_mode = True + + # Or via environment variable + export AIRFLOW__COSMOS__ENABLE_DEBUG_MODE=True + +**What it does**: When enabled, Cosmos tracks memory utilization for its tasks during execution and pushes the peak memory usage (in MB) to XCom under the key ``cosmos_debug_max_memory_mb``. 
This enables you to: + +- **Profile Memory Usage**: Identify which tasks consume the most memory +- **Optimize Resource Allocation**: Set appropriate memory limits and worker queue assignments based on actual usage +- **Track Memory Trends**: Monitor memory usage over time to detect regressions or improvements + +**How to Access Memory Data**: + +The peak memory usage is stored in XCom and can be accessed via the Airflow UI. + +**Requirements**: + +- ``psutil`` package must be installed in your Airflow environment +- Debug mode adds minimal overhead (memory polling occurs at configurable intervals) + +**Configuration for Poll Interval**: + +You can adjust the memory polling frequency to balance accuracy and overhead: + +.. code-block:: bash + + # In airflow.cfg + [cosmos] + enable_debug_mode = True + debug_memory_poll_interval_seconds = 0.5 # Default: 0.5 seconds + + # Or via environment variable + export AIRFLOW__COSMOS__DEBUG_MEMORY_POLL_INTERVAL_SECONDS=0.5 + +Lower values provide more accurate peak memory measurements but may add slight overhead. Higher values reduce overhead but may miss short memory spikes. + +**Default**: ``False`` diff --git a/docs/optimize_performance/partial-parsing.rst b/docs/optimize_performance/partial-parsing.rst new file mode 100644 index 0000000000..b0e85fe088 --- /dev/null +++ b/docs/optimize_performance/partial-parsing.rst @@ -0,0 +1,71 @@ +.. _partial-parsing: + +Partial parsing +=============== + +Starting in the 1.4 version, Cosmos tries to leverage dbt's partial parsing (``partial_parse.msgpack``) to speed up both the task execution and the DAG parsing (if using ``LoadMode.DBT_LS``). + +This feature is bound to `dbt partial parsing limitations `_. +As an example, ``dbt`` requires the same ``--vars``, ``--target``, ``--profile``, and ``profiles.yml`` environment variables (as called by the ``env_var()`` macro) while running dbt commands, otherwise it will reparse the project from scratch.
+ +Profile configuration +--------------------- + +To respect the dbt requirement of having the same profile to benefit from partial parsing, Cosmos users should either: + +* If using Cosmos profile mapping (``ProfileConfig(profile_mapping=...)``), disable using mocked profile mappings by setting ``render_config=RenderConfig(enable_mock_profile=False)`` +* Declare their own ``profiles.yml`` file, via ``ProfileConfig(profiles_yml_filepath=...)`` + +If users don't follow these guidelines, Cosmos will use different profiles to parse the dbt project and to run tasks, and the user won't leverage dbt partial parsing. +Their logs will contain multiple ``INFO`` messages similar to the following, meaning that Cosmos is not using partial parsing: + +.. code-block:: + + 13:33:16 Unable to do partial parsing because profile has changed + 13:33:16 Unable to do partial parsing because env vars used in profiles.yml have changed + +dbt vars +-------- + +If the Airflow scheduler and worker processes run on the same node, users must ensure the dbt ``--vars`` flag is the same in the ``RenderConfig`` and ``ExecutionConfig``. + +Otherwise, users may see messages similar to the following in their logs: + +.. code-block:: + + [2024-03-14, 17:04:57 GMT] {{subprocess.py:94}} INFO - Unable to do partial parsing because config vars, config profile, or config target have changed + + +Caching +------- + +If the dbt project ``target`` directory has a ``partial_parse.msgpack``, Cosmos will attempt to use it. + +There is a chance, however, that the file is stale or was generated in a way that is different to how Cosmos runs the dbt commands. + +Therefore, Cosmos also caches the most up-to-date ``partial_parse.msgpack`` file after running a dbt command in the `system temporary directory `_. +With this, unless there are code changes, each Airflow node should only run the dbt command with a full dbt project parse once, and benefit from partial parsing from then onwards.
+ + +Caching is enabled by default. +It is possible to disable caching or override the directory that Cosmos uses for caching with the Airflow configuration: + +.. code-block:: cfg + + [cosmos] + cache_dir = path/to/docs/here # to override default caching directory (by default, uses the system temporary directory) + enable_cache_partial_parse = False # to disable caching (enabled by default) + +Or environment variable: + +.. code-block:: cfg + + AIRFLOW__COSMOS__CACHE_DIR="path/to/docs/here" # to override default caching directory (by default, uses the system temporary directory) + AIRFLOW__COSMOS__ENABLE_CACHE_PARTIAL_PARSE="False" # to disable caching (enabled by default) + +Learn more about `caching <./caching.html>`_ and `Cosmos Airflow configurations <../reference/configs/cosmos-conf.html>`_. + +Disabling +--------- + +To switch off partial parsing in Cosmos, use the argument ``partial_parse=False`` in the ``ProjectConfig``. diff --git a/docs/optimize_performance/selecting-excluding.rst b/docs/optimize_performance/selecting-excluding.rst new file mode 100644 index 0000000000..e587895ac8 --- /dev/null +++ b/docs/optimize_performance/selecting-excluding.rst @@ -0,0 +1,234 @@ +.. _selecting-excluding: + +Selecting & Excluding +======================= + +Cosmos allows you to filter to a subset of your dbt project in each ``DbtDag`` / ``DbtTaskGroup`` using the ``select`` and ``exclude`` parameters in the ``RenderConfig`` class. + + Since Cosmos 1.3, the ``selector`` parameter is available in ``RenderConfig`` when using the ``LoadMode.DBT_LS`` to parse the dbt project into Airflow. + + Since Cosmos 1.13, the ``selector`` parameter is available in ``RenderConfig`` when using the ``LoadMode.DBT_MANIFEST`` to parse the dbt project into Airflow.
+ + +Using ``select`` and ``exclude`` +-------------------------------- + +The ``select`` and ``exclude`` parameters are lists, with values like the following: + +- ``tag:my_tag``: include/exclude models with the tag ``my_tag`` +- ``config.meta.some_key:some_value``: include/exclude models with ``config.meta.some_key: some_value`` +- ``config.materialized:table``: include/exclude models with the config ``materialized: table`` +- ``path:analytics/tables``: include/exclude models in the ``analytics/tables`` directory. In this example, ``analytics/tables`` is a relative path, but absolute paths are also supported. +- ``+node_name+1`` (graph operators): include/exclude the node with name ``node_name``, all its parents, and its first generation of children (`dbt graph selector docs `_) +- ``+/path/to/model_g+`` (graph operators): include/exclude all the nodes in the absolute path ``/path/to/model_g``, their parents and children. Relative paths are also supported. +- ``+tag:nightly`` (graph operators): include/exclude all nodes that have tag ``nightly`` and their parents. +- ``+config.materialized:view`` (graph operators): include/exclude all the nodes that have the materialization ``view`` and their parents +- ``@node_name`` (@ operator): include/exclude the node with name ``node_name``, all its descendants, and all ancestors of those descendants. This is useful in CI environments where you want to build a model and all its descendants, but you need the ancestors of those descendants to exist first. +- ``tag:my_tag,+node_name`` (intersection): include/exclude ``node_name`` and its parents if they have the tag ``my_tag`` (`dbt set operator docs `_) +- ``['tag:first_tag', 'tag:second_tag']`` (union): include/exclude nodes that have either ``tag:first_tag`` or ``tag:second_tag`` +- ``resource_type:``: include nodes with the resource type ``seed, snapshot, model, test, source``.
For example, ``resource_type:source`` returns only nodes where resource_type == SOURCE +- ``exclude_resource_type:``: exclude nodes with the resource type ``analysis, exposure, metric, model, saved_query, seed, semantic_model, snapshot, source, test, unit_test``. For example, ``exclude_resource_type:source`` returns only nodes where resource_type != SOURCE +- ``source:my_source``: include/exclude nodes that have the source ``my_source`` and are of resource_type ``source`` +- ``source:my_source+``: include/exclude nodes that have the source ``my_source`` and their children +- ``source:my_source.my_table``: include/exclude nodes that have the source ``my_source`` and the table ``my_table`` +- ``exposure:my_exposure``: include/exclude nodes that have the exposure ``my_exposure`` and are of resource_type ``exposure`` +- ``exposure:+my_exposure``: include/exclude nodes that have the exposure ``my_exposure`` and their parents +- ``fqn:some_model``: include/exclude nodes based on their fully qualified names (FQN), which consist of the project name, folder path, and model name. For example, ``fqn:my_dbt_project.analytics.tables.my_model`` selects the model ``my_model`` in the ``analytics/tables`` folder of the ``my_dbt_project`` project. +- ``package:package_name``: include/exclude all nodes that belong to the given package (e.g. ``package:dbt_artifacts``). The package name must be non-empty (use ``package:dbt_artifacts``, not ``package:``). +- ``package:package_name+``: include/exclude all nodes in the package and their descendants (children). +- ``+package:package_name``: include/exclude all nodes in the package and their ancestors (parents). +- A bare name without a method prefix (e.g. ``dbt_artifacts`` or ``child``) is resolved like dbt: it matches nodes by package name, node name, or path segment (folder name). 
So ``select=['folder_a']`` or ``exclude=['folder_a']`` includes or excludes all models under a folder named ``folder_a``, including when using ``LoadMode.DBT_MANIFEST``. + +.. note:: + + If you're using the ``dbt_ls`` parsing method, these arguments are passed directly to the dbt CLI command. + + If you're using the ``dbt_manifest`` parsing method, Cosmos will filter the models in the manifest before creating the DAG. This does not directly use dbt's CLI command, but should include all metadata that dbt would include. + + If you're using the ``custom`` parsing method, Cosmos does not currently read the ``dbt_project.yml`` file. You can still select/exclude models if you're selecting on metadata defined in the model code or ``.yml`` files in the models directory. + +Examples: + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + select=["tag:my_tag"], + ) + ) + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + select=["config.schema:prod"], + ) + ) + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + select=["path:analytics/tables"], + ) + ) + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + select=["tag:include_tag1", "tag:include_tag2"], # union + ) + ) + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + select=["tag:include_tag1,tag:include_tag2"], # intersection + ) + ) + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + exclude=["node_name+"], # node_name and its children + ) + ) + +..
code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + exclude=[ + "package:dbt_artifacts" + ], # exclude all nodes from dbt_artifacts package (e.g. when using manifest load mode) + ) + ) + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + select=["@my_model"], # selects my_model, all its descendants, + # and all ancestors needed to build those descendants + ) + ) + +.. code-block:: python + + from cosmos.airflow.dag import DbtDag + from cosmos.config import RenderConfig + + jaffle_shop = DbtDag( + render_config=RenderConfig( + select=[ + "fqn:jaffle_shop.analytics.tables.my_model" + ], # selects models by fully qualified name + ) + ) + +Using ``selector`` +-------------------------------- +.. note:: + Only currently supported using the ``LoadMode.DBT_LS`` (since Cosmos 1.3) or ``LoadMode.DBT_MANIFEST`` (since Cosmos 1.13). + If ``select`` and/or ``exclude`` are used with ``selector``, dbt will ignore the ``select`` and ``exclude`` parameters. + +The ``selector`` parameter is a string that references a `dbt YAML selector `_ already defined in a dbt project. + +Examples: + +.. code-block:: python + + from cosmos import DbtDag, RenderConfig, LoadMode + + jaffle_shop = DbtDag( + render_config=RenderConfig( + selector="my_selector", # this selector must be defined in your dbt project + load_method=LoadMode.DBT_LS, + ) + ) + +.. 
code-block:: python + + from cosmos import DbtDag, ProjectConfig, RenderConfig, LoadMode + + jaffle_shop = DbtDag( + project_config=ProjectConfig( + manifest_path=DBT_ROOT_PATH / "jaffle_shop" / "target" / "manifest.json", + project_name="jaffle_shop", + ), + render_config=RenderConfig( + selector="nightly_models", # this selector must be defined in your dbt project + load_method=LoadMode.DBT_MANIFEST, + ), + ) + jaffle_shop_remote = DbtDag( + project_config=ProjectConfig( + manifest_path="s3://cosmos-manifest-test/manifest.json", + manifest_conn_id="aws_s3_conn", + project_name="jaffle_shop", + ), + render_config=RenderConfig( + selector="nightly_models", # this selector must be defined in your dbt project + load_method=LoadMode.DBT_MANIFEST, + ), + ) + +Using ``selector`` with ``LoadMode.DBT_MANIFEST`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since Cosmos 1.13, the ``selector`` parameter is also supported when using the ``LoadMode.DBT_MANIFEST`` parsing method. + +When using this combination, Cosmos will read the preprocessed YAML selectors from the manifest file and use them to filter the dbt nodes to include in the Airflow DAG or Task Group. + +The YAML selection parser expects the selectors to be defined in the dbt project and will parse the preprocessed ``selectors`` found in the manifest file. Modifying the selector definitions in the manifest file in any way may lead to undefined behavior. +The parser may or may not catch invalid selector definitions if the selectors in the manifest are altered. + +The YAML selection parsing logic is based on the spec defined in the `dbt documentation `_. +All `graph operators `_ and `set operators `_ are supported. +Parsing of the ``default`` and ``indirect_selection`` keywords is not currently supported. + +In the event the dbt YAML selector specification changes, Cosmos will attempt to keep up to date with the changes, but there may be a lag between dbt releases and Cosmos releases.
+Once a new Cosmos version is released with the updated selector parsing logic, users should update their Cosmos version to ensure compatibility with the latest dbt selector specification. +For subsequent updates to the YAML selector parser, existing YAML selector caches will be invalidated the next time the DAG is parsed. + +**Error Handling** + +Cosmos distinguishes between two types of errors when parsing YAML selectors: + +- **Structural YAML Errors** - These cause immediate failure during manifest parsing: + + - Selector definition is not a dictionary + - Missing required ``name`` key + - Missing required ``definition`` key + + These errors indicate malformed YAML structure and will raise a ``CosmosValueError`` immediately when calling ``YamlSelectors.parse()``. + +- **Selector Definition Errors** - These are isolated and surfaced when accessing the selector: + + - Unsupported selector methods (e.g., ``method: "state"``, ``method: "package"``) + - Invalid graph operator configurations (e.g., non-integer depth values) + - Invalid selector logic (e.g., multiple root keys in a definition) + + These errors are collected during parsing but only raised when you attempt to retrieve the selector using ``get_parsed(selector_name)``. + This allows the manifest to be loaded successfully even if some selectors have definition errors, enabling you to work with valid selectors while debugging invalid ones. + +If a selector has multiple definition errors, they will all be reported together in a formatted error message when accessing the selector. 
diff --git a/docs/configuration/cosmos-conf.rst b/docs/reference/configs/cosmos-conf.rst similarity index 98% rename from docs/configuration/cosmos-conf.rst rename to docs/reference/configs/cosmos-conf.rst index cc68c3b71f..a8928c3840 100644 --- a/docs/configuration/cosmos-conf.rst +++ b/docs/reference/configs/cosmos-conf.rst @@ -253,14 +253,14 @@ This page lists all available Airflow configurations that affect ``astronomer-co As an example, when this option is enabled, the following is an example of specifying the imports with full module paths: - .. literalinclude:: ../../dev/dags/basic_cosmos_dag_full_module_path_imports.py + .. literalinclude:: ../../../dev/dags/basic_cosmos_dag_full_module_path_imports.py :language: python :start-after: [START cosmos_explicit_imports] :end-before: [END cosmos_explicit_imports] as opposed to the following approach you might have when this option is disabled (default): - .. literalinclude:: ../../dev/dags/basic_cosmos_dag.py + .. literalinclude:: ../../../dev/dags/basic_cosmos_dag.py :language: python :start-after: [START cosmos_init_imports] :end-before: [END cosmos_init_imports] diff --git a/docs/configuration/execution-config.rst b/docs/reference/configs/execution-config.rst similarity index 100% rename from docs/configuration/execution-config.rst rename to docs/reference/configs/execution-config.rst diff --git a/docs/reference/configs/index.rst b/docs/reference/configs/index.rst new file mode 100644 index 0000000000..8d01434380 --- /dev/null +++ b/docs/reference/configs/index.rst @@ -0,0 +1,14 @@ + +Configuration References +======================== + + +.. 
toctree:: + :maxdepth: 1 + :hidden: + :caption: Configurations + + Project Config + Profile Config + Execution Config + Cosmos Config diff --git a/docs/configuration/profile-config.rst b/docs/reference/configs/profile-config.rst similarity index 100% rename from docs/configuration/profile-config.rst rename to docs/reference/configs/profile-config.rst diff --git a/docs/configuration/project-config.rst b/docs/reference/configs/project-config.rst similarity index 100% rename from docs/configuration/project-config.rst rename to docs/reference/configs/project-config.rst diff --git a/docs/reference/index.rst b/docs/reference/index.rst new file mode 100644 index 0000000000..5d4b755faa --- /dev/null +++ b/docs/reference/index.rst @@ -0,0 +1,19 @@ + +Reference +========= + + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Configurations + + configs/index + + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Profiles + + profiles/index diff --git a/docs/reference/profiles/AthenaAccessKey.rst b/docs/reference/profiles/AthenaAccessKey.rst new file mode 100644 index 0000000000..9c1262444c --- /dev/null +++ b/docs/reference/profiles/AthenaAccessKey.rst @@ -0,0 +1,168 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +AthenaAccessKey +=============== + + + +Uses the Airflow AWS Connection provided to get_credentials() to generate the profile for dbt. + + + +https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/connections/aws.html + + + + + +This behaves similarly to other provider operators such as the AWS Athena Operator. + +Where you pass the aws_conn_id and the operator will generate the credentials for you. 
+ + + +https://registry.astronomer.io/providers/amazon/versions/latest/modules/athenaoperator + + + +Information about the dbt Athena profile that is generated can be found here: + + + +https://github.com/dbt-athena/dbt-athena?tab=readme-ov-file#configuring-your-profile + +https://docs.getdbt.com/docs/core/connect-data-platform/athena-setup + + + +This profile mapping translates Airflow connections with the type ``aws`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import AthenaAccessKeyProfileMapping + + profile = AthenaAccessKeyProfileMapping( + conn_id="my_aws_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``aws_profile_name`` + - False + + - ``extra.aws_profile_name`` + + + * - ``database`` + - True + + - ``extra.database`` + + + * - ``debug_query_state`` + - False + + - ``extra.debug_query_state`` + + + * - ``lf_tags_database`` + - False + + - ``extra.lf_tags_database`` + + + * - ``num_retries`` + - False + + - ``extra.num_retries`` + + + * - ``poll_interval`` + - False + + - ``extra.poll_interval`` + + + * - ``region_name`` + - True + + - ``extra.region_name`` + + + * - ``s3_data_dir`` + - False + + - ``extra.s3_data_dir`` + + + * - ``s3_data_naming`` + - False + + - ``extra.s3_data_naming`` + + + * - ``s3_staging_dir`` + - True + + - ``extra.s3_staging_dir`` + + + * - ``schema`` + - True + + - ``extra.schema`` + + + * - ``seed_s3_upload_args`` + - False + + - ``extra.seed_s3_upload_args`` + + + * - ``work_group`` + - False + + - ``extra.work_group`` + + + * - ``aws_access_key_id`` + - True + + - + + + * - ``aws_secret_access_key`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. 
diff --git a/docs/reference/profiles/ClickhouseUserPassword.rst b/docs/reference/profiles/ClickhouseUserPassword.rst new file mode 100644 index 0000000000..5c1a10fded --- /dev/null +++ b/docs/reference/profiles/ClickhouseUserPassword.rst @@ -0,0 +1,90 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +ClickhouseUserPassword +====================== + + + +Maps Airflow generic connections using user + password authentication to dbt Clickhouse profiles. + +https://docs.getdbt.com/docs/core/connect-data-platform/clickhouse-setup + + + +This profile mapping translates Airflow connections with the type ``generic`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import ClickhouseUserPasswordProfileMapping + + profile = ClickhouseUserPasswordProfileMapping( + conn_id="my_generic_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - False + + - ``password`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``clickhouse`` + - True + + - ``extra.clickhouse`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. 
+- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/DatabricksOauth.rst b/docs/reference/profiles/DatabricksOauth.rst new file mode 100644 index 0000000000..cef7eeca86 --- /dev/null +++ b/docs/reference/profiles/DatabricksOauth.rst @@ -0,0 +1,88 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +DatabricksOauth +=============== + + + +Maps Airflow Databricks connections with the client auth to dbt profiles. + + + +https://docs.getdbt.com/reference/warehouse-setups/databricks-setup + +https://airflow.apache.org/docs/apache-airflow-providers-databricks/stable/connections/databricks.html + + + +This profile mapping translates Airflow connections with the type ``databricks`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import DatabricksOauthProfileMapping + + profile = DatabricksOauthProfileMapping( + conn_id="my_databricks_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``client_id`` + - True + + - ``['login', 'extra.client_id']`` + + + * - ``client_secret`` + - True + + - ``['password', 'extra.client_secret']`` + + + * - ``http_path`` + - True + + - ``extra.http_path`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/DatabricksToken.rst b/docs/reference/profiles/DatabricksToken.rst new file mode 100644 index 0000000000..48cce17cec --- /dev/null +++ b/docs/reference/profiles/DatabricksToken.rst @@ -0,0 +1,82 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +DatabricksToken +=============== + + + +Maps Airflow Databricks connections with a token to dbt profiles. + + + +https://docs.getdbt.com/reference/warehouse-setups/databricks-setup + +https://airflow.apache.org/docs/apache-airflow-providers-databricks/stable/connections/databricks.html + + + +This profile mapping translates Airflow connections with the type ``databricks`` +into dbt profiles. 
To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import DatabricksTokenProfileMapping + + profile = DatabricksTokenProfileMapping( + conn_id="my_databricks_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``token`` + - True + + - ``['password', 'extra.token']`` + + + * - ``http_path`` + - True + + - ``extra.http_path`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. 
diff --git a/docs/reference/profiles/DuckDBUserPassword.rst b/docs/reference/profiles/DuckDBUserPassword.rst new file mode 100644 index 0000000000..d95aa8011a --- /dev/null +++ b/docs/reference/profiles/DuckDBUserPassword.rst @@ -0,0 +1,62 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +DuckDBUserPassword +================== + + + +Maps Airflow DuckDB connections using local path mapping to dbt profiles. + +https://docs.getdbt.com/docs/core/connect-data-platform/duckdb-setup + +https://github.com/astronomer/airflow-provider-duckdb + + + +This profile mapping translates Airflow connections with the type ``duckdb`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import DuckDBUserPasswordProfileMapping + + profile = DuckDBUserPasswordProfileMapping( + conn_id="my_duckdb_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``path`` + - True + + - ``host`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. 
+- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/ExasolUserPassword.rst b/docs/reference/profiles/ExasolUserPassword.rst new file mode 100644 index 0000000000..3a8961a151 --- /dev/null +++ b/docs/reference/profiles/ExasolUserPassword.rst @@ -0,0 +1,120 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +ExasolUserPassword +================== + + + +Maps Airflow Exasol connections with a username and password to dbt profiles. + +https://docs.getdbt.com/reference/warehouse-setups/exasol-setup + + + +This profile mapping translates Airflow connections with the type ``exasol`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import ExasolUserPasswordProfileMapping + + profile = ExasolUserPasswordProfileMapping( + conn_id="my_exasol_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``dsn`` + - True + + - ``host`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``dbname`` + - True + + - ``schema`` + + + * - ``encryption`` + - False + + - ``extra.encryption`` + + + * - ``compression`` + - False + + - ``extra.compression`` + + + * - ``connection_timeout`` + - False + + - ``extra.connection_timeout`` + + + * - ``socket_timeout`` + - False + + - ``extra.socket_timeout`` + + + * - ``protocol_version`` + - False + + - ``extra.protocol_version`` + + + * - ``threads`` + - True + + - + + + * - ``schema`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/GoogleCloudOauth.rst b/docs/reference/profiles/GoogleCloudOauth.rst new file mode 100644 index 0000000000..abc921ca45 --- /dev/null +++ b/docs/reference/profiles/GoogleCloudOauth.rst @@ -0,0 +1,72 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. 
+ + +GoogleCloudOauth +================ + + + +Maps Airflow GCP connections to dbt BigQuery profiles that use oauth via gcloud, + +if they don't use a key file or JSON. + + + +https://docs.getdbt.com/docs/core/connect-data-platform/bigquery-setup#oauth-via-gcloud + +https://airflow.apache.org/docs/apache-airflow-providers-google/stable/connections/gcp.html + + + +This profile mapping translates Airflow connections with the type ``google_cloud_platform`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import GoogleCloudOauthProfileMapping + + profile = GoogleCloudOauthProfileMapping( + conn_id="my_google_cloud_platform_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``project`` + - True + + - ``extra.project`` + + + * - ``dataset`` + - True + + - ``extra.dataset`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order.
+ For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/GoogleCloudServiceAccountDict.rst b/docs/reference/profiles/GoogleCloudServiceAccountDict.rst new file mode 100644 index 0000000000..ee1485e4a4 --- /dev/null +++ b/docs/reference/profiles/GoogleCloudServiceAccountDict.rst @@ -0,0 +1,76 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +GoogleCloudServiceAccountDict +============================= + + + +Maps Airflow GCP connections to dbt BigQuery profiles if they use a service account keyfile dict/json. + + + +https://docs.getdbt.com/reference/warehouse-setups/bigquery-setup#service-account-file + +https://airflow.apache.org/docs/apache-airflow-providers-google/stable/connections/gcp.html + + + +This profile mapping translates Airflow connections with the type ``google_cloud_platform`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import GoogleCloudServiceAccountDictProfileMapping + + profile = GoogleCloudServiceAccountDictProfileMapping( + conn_id="my_google_cloud_platform_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``project`` + - True + + - ``extra.project`` + + + * - ``dataset`` + - True + + - ``extra.dataset`` + + + * - ``keyfile_json`` + - True + + - ``['extra.keyfile_dict', 'keyfile_dict', 'extra__google_cloud_platform__keyfile_dict']`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/GoogleCloudServiceAccountFile.rst b/docs/reference/profiles/GoogleCloudServiceAccountFile.rst new file mode 100644 index 0000000000..e95c0d7801 --- /dev/null +++ b/docs/reference/profiles/GoogleCloudServiceAccountFile.rst @@ -0,0 +1,76 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +GoogleCloudServiceAccountFile +============================= + + + +Maps Airflow GCP connections to dbt BigQuery profiles if they use a service account file. 
+ + + +https://docs.getdbt.com/reference/warehouse-setups/bigquery-setup#service-account-file + +https://airflow.apache.org/docs/apache-airflow-providers-google/stable/connections/gcp.html + + + +This profile mapping translates Airflow connections with the type ``google_cloud_platform`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import GoogleCloudServiceAccountFileProfileMapping + + profile = GoogleCloudServiceAccountFileProfileMapping( + conn_id="my_google_cloud_platform_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``project`` + - True + + - ``extra.project`` + + + * - ``dataset`` + - True + + - ``extra.dataset`` + + + * - ``keyfile`` + - True + + - ``extra.key_path`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. 
If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/MysqlUserPassword.rst b/docs/reference/profiles/MysqlUserPassword.rst new file mode 100644 index 0000000000..07eebc0597 --- /dev/null +++ b/docs/reference/profiles/MysqlUserPassword.rst @@ -0,0 +1,86 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +MysqlUserPassword +================= + + + +Maps Airflow MySQL connections using user + password authentication to dbt profiles. + +https://docs.getdbt.com/reference/warehouse-setups/mysql-setup + +https://airflow.apache.org/docs/apache-airflow-providers-mysql/stable/connections/mysql.html + + + +This profile mapping translates Airflow connections with the type ``mysql`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import MysqlUserPasswordProfileMapping + + profile = MysqlUserPasswordProfileMapping( + conn_id="my_mysql_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``server`` + - True + + - ``host`` + + + * - ``username`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``schema`` + - True + + - ``schema`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. 
+- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/OracleUserPassword.rst b/docs/reference/profiles/OracleUserPassword.rst new file mode 100644 index 0000000000..948050820a --- /dev/null +++ b/docs/reference/profiles/OracleUserPassword.rst @@ -0,0 +1,98 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +OracleUserPassword +================== + + + +Maps Airflow Oracle connections using user + password authentication to dbt profiles. + +https://docs.getdbt.com/reference/warehouse-setups/oracle-setup + +https://airflow.apache.org/docs/apache-airflow-providers-oracle/stable/connections/oracle.html + + + +This profile mapping translates Airflow connections with the type ``oracle`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import OracleUserPasswordProfileMapping + + profile = OracleUserPasswordProfileMapping( + conn_id="my_oracle_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - False + + - ``host`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``service`` + - False + + - ``extra.service_name`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``database`` + - False + + - ``extra.service_name`` + + + * - ``connection_string`` + - False + + - ``extra.dsn`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/PostgresUserPassword.rst b/docs/reference/profiles/PostgresUserPassword.rst new file mode 100644 index 0000000000..6883507a2e --- /dev/null +++ b/docs/reference/profiles/PostgresUserPassword.rst @@ -0,0 +1,98 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +PostgresUserPassword +==================== + + + +Maps Airflow Postgres connections using user + password authentication to dbt profiles. 
+ +https://docs.getdbt.com/reference/warehouse-setups/postgres-setup + +https://airflow.apache.org/docs/apache-airflow-providers-postgres/stable/connections/postgres.html + + + +This profile mapping translates Airflow connections with the type ``postgres`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import PostgresUserPasswordProfileMapping + + profile = PostgresUserPasswordProfileMapping( + conn_id="my_postgres_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``dbname`` + - True + + - ``schema`` + + + * - ``keepalives_idle`` + - False + + - ``extra.keepalives_idle`` + + + * - ``sslmode`` + - False + + - ``extra.sslmode`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. 
+ For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/RedshiftUserPassword.rst b/docs/reference/profiles/RedshiftUserPassword.rst new file mode 100644 index 0000000000..924a14e8cc --- /dev/null +++ b/docs/reference/profiles/RedshiftUserPassword.rst @@ -0,0 +1,110 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +RedshiftUserPassword +==================== + + + +Maps Airflow Redshift connections to dbt Redshift profiles if they use a username and password. + +https://docs.getdbt.com/reference/warehouse-setups/redshift-setup + +https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/connections/redshift.html + + + +This profile mapping translates Airflow connections with the type ``redshift`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import RedshiftUserPasswordProfileMapping + + profile = RedshiftUserPasswordProfileMapping( + conn_id="my_redshift_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``dbname`` + - True + + - ``schema`` + + + * - ``timeout`` + - False + + - ``extra.timeout`` + + + * - ``sslmode`` + - False + + - ``extra.sslmode`` + + + * - ``region`` + - False + + - ``extra.region`` + + + * - ``schema`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/SnowflakeEncryptedPrivateKeyFilePem.rst b/docs/reference/profiles/SnowflakeEncryptedPrivateKeyFilePem.rst new file mode 100644 index 0000000000..203d45ca59 --- /dev/null +++ b/docs/reference/profiles/SnowflakeEncryptedPrivateKeyFilePem.rst @@ -0,0 +1,122 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +SnowflakeEncryptedPrivateKeyFilePem +=================================== + + + +Maps Airflow Snowflake connections to dbt profiles if they use a user/private key path. 
+ +https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication + +https://airflow.apache.org/docs/apache-airflow-providers-snowflake/stable/connections/snowflake.html + + + +This profile mapping translates Airflow connections with the type ``snowflake`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import SnowflakeEncryptedPrivateKeyFilePemProfileMapping + + profile = SnowflakeEncryptedPrivateKeyFilePemProfileMapping( + conn_id="my_snowflake_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``account`` + - True + + - ``extra.account`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``database`` + - True + + - ``extra.database`` + + + * - ``warehouse`` + - True + + - ``extra.warehouse`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``role`` + - False + + - ``extra.role`` + + + * - ``private_key_passphrase`` + - True + + - ``password`` + + + * - ``private_key_path`` + - True + + - ``extra.private_key_file`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. 
For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. + + +Default Values +-------------- + +This profile mapping sets the following default values. These can be overridden by passing +them in ``profile_args``. + +.. list-table:: + :header-rows: 1 + + * - Field Name + - Default Value + + + * - ``threads`` + - ``4`` + diff --git a/docs/reference/profiles/SnowflakeEncryptedPrivateKeyPem.rst b/docs/reference/profiles/SnowflakeEncryptedPrivateKeyPem.rst new file mode 100644 index 0000000000..65fc2d9356 --- /dev/null +++ b/docs/reference/profiles/SnowflakeEncryptedPrivateKeyPem.rst @@ -0,0 +1,122 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +SnowflakeEncryptedPrivateKeyPem +=============================== + + + +Maps Airflow Snowflake connections to dbt profiles if they use a user/private key. + +https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication + +https://airflow.apache.org/docs/apache-airflow-providers-snowflake/stable/connections/snowflake.html + + + +This profile mapping translates Airflow connections with the type ``snowflake`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. 
code-block:: python + + from cosmos.profiles import SnowflakeEncryptedPrivateKeyPemProfileMapping + + profile = SnowflakeEncryptedPrivateKeyPemProfileMapping( + conn_id="my_snowflake_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``account`` + - True + + - ``extra.account`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``database`` + - True + + - ``extra.database`` + + + * - ``warehouse`` + - True + + - ``extra.warehouse`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``role`` + - False + + - ``extra.role`` + + + * - ``private_key`` + - True + + - ``extra.private_key_content`` + + + * - ``private_key_passphrase`` + - True + + - ``password`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. 
+ + +Default Values +-------------- + +This profile mapping sets the following default values. These can be overridden by passing +them in ``profile_args``. + +.. list-table:: + :header-rows: 1 + + * - Field Name + - Default Value + + + * - ``threads`` + - ``4`` + diff --git a/docs/reference/profiles/SnowflakePrivateKeyPem.rst b/docs/reference/profiles/SnowflakePrivateKeyPem.rst new file mode 100644 index 0000000000..2a0576afb2 --- /dev/null +++ b/docs/reference/profiles/SnowflakePrivateKeyPem.rst @@ -0,0 +1,116 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +SnowflakePrivateKeyPem +====================== + + + +Maps Airflow Snowflake connections to dbt profiles if they use a user/private key. + +https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication + +https://airflow.apache.org/docs/apache-airflow-providers-snowflake/stable/connections/snowflake.html + + + +This profile mapping translates Airflow connections with the type ``snowflake`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import SnowflakePrivateKeyPemProfileMapping + + profile = SnowflakePrivateKeyPemProfileMapping( + conn_id="my_snowflake_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``account`` + - True + + - ``extra.account`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``database`` + - True + + - ``extra.database`` + + + * - ``warehouse`` + - True + + - ``extra.warehouse`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``role`` + - False + + - ``extra.role`` + + + * - ``private_key`` + - True + + - ``extra.private_key_content`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. + + +Default Values +-------------- + +This profile mapping sets the following default values. These can be overridden by passing +them in ``profile_args``. + +.. list-table:: + :header-rows: 1 + + * - Field Name + - Default Value + + + * - ``threads`` + - ``4`` + diff --git a/docs/reference/profiles/SnowflakeUserPassword.rst b/docs/reference/profiles/SnowflakeUserPassword.rst new file mode 100644 index 0000000000..86817a3360 --- /dev/null +++ b/docs/reference/profiles/SnowflakeUserPassword.rst @@ -0,0 +1,128 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. 
+ + +SnowflakeUserPassword +===================== + + + +Maps Airflow Snowflake connections to dbt profiles if they use a user/password. + +https://docs.getdbt.com/reference/warehouse-setups/snowflake-setup + +https://airflow.apache.org/docs/apache-airflow-providers-snowflake/stable/connections/snowflake.html + + + +This profile mapping translates Airflow connections with the type ``snowflake`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import SnowflakeUserPasswordProfileMapping + + profile = SnowflakeUserPasswordProfileMapping( + conn_id="my_snowflake_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``account`` + - True + + - ``extra.account`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``database`` + - True + + - ``extra.database`` + + + * - ``warehouse`` + - True + + - ``extra.warehouse`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``role`` + - False + + - ``extra.role`` + + + * - ``host`` + - False + + - ``extra.host`` + + + * - ``port`` + - False + + - ``extra.port`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. 
For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. + + +Default Values +-------------- + +This profile mapping sets the following default values. These can be overridden by passing +them in ``profile_args``. + +.. list-table:: + :header-rows: 1 + + * - Field Name + - Default Value + + + * - ``threads`` + - ``4`` + diff --git a/docs/reference/profiles/SparkThrift.rst b/docs/reference/profiles/SparkThrift.rst new file mode 100644 index 0000000000..75661317e8 --- /dev/null +++ b/docs/reference/profiles/SparkThrift.rst @@ -0,0 +1,74 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +SparkThrift +=========== + + + +Maps Airflow Spark connections to dbt profiles if they use a thrift connection. + +https://docs.getdbt.com/reference/warehouse-setups/spark-setup#thrift + +https://airflow.apache.org/docs/apache-airflow-providers-apache-spark/stable/connections/spark.html + + + +This profile mapping translates Airflow connections with the type ``spark`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import SparkThriftProfileMapping + + profile = SparkThriftProfileMapping( + conn_id="my_spark_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. 
You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``schema`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/StandardSQLServerAuth.rst b/docs/reference/profiles/StandardSQLServerAuth.rst new file mode 100644 index 0000000000..3b9e5da661 --- /dev/null +++ b/docs/reference/profiles/StandardSQLServerAuth.rst @@ -0,0 +1,90 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +StandardSQLServerAuth +===================== + + + +This profile mapping translates Airflow connections with the type ``generic`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. 
code-block:: python + + from cosmos.profiles import StandardSQLServerAuthProfileMapping + + profile = StandardSQLServerAuthProfileMapping( + conn_id="my_generic_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``server`` + - True + + - ``host`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``schema`` + - True + + - ``schema`` + + + * - ``database`` + - True + + - ``extra.database`` + + + * - ``driver`` + - True + + - ``extra.driver`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. 
diff --git a/docs/reference/profiles/StarrocksUserPassword.rst b/docs/reference/profiles/StarrocksUserPassword.rst new file mode 100644 index 0000000000..7452563e4a --- /dev/null +++ b/docs/reference/profiles/StarrocksUserPassword.rst @@ -0,0 +1,84 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +StarrocksUserPassword +===================== + + + +Maps Airflow MySQL connections using user + password authentication to dbt profiles. + +https://docs.getdbt.com/docs/core/connect-data-platform/starrocks-setup + + + +This profile mapping translates Airflow connections with the type ``mysql`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import StarrocksUserPasswordProfileMapping + + profile = StarrocksUserPasswordProfileMapping( + conn_id="my_mysql_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``username`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``port`` + - True + + - ``port`` + + + * - ``schema`` + - True + + - ``schema`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. 
For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/TeradataUserPassword.rst b/docs/reference/profiles/TeradataUserPassword.rst new file mode 100644 index 0000000000..f550df8b07 --- /dev/null +++ b/docs/reference/profiles/TeradataUserPassword.rst @@ -0,0 +1,86 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +TeradataUserPassword +==================== + + + +Maps Airflow Teradata connections using user + password authentication to dbt profiles. + +https://docs.getdbt.com/docs/core/connect-data-platform/teradata-setup + +https://airflow.apache.org/docs/apache-airflow-providers-teradata/stable/connections/teradata.html + + + +This profile mapping translates Airflow connections with the type ``teradata`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import TeradataUserPasswordProfileMapping + + profile = TeradataUserPasswordProfileMapping( + conn_id="my_teradata_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``schema`` + - False + + - ``schema`` + + + * - ``tmode`` + - False + + - ``extra.tmode`` + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/TrinoCertificate.rst b/docs/reference/profiles/TrinoCertificate.rst new file mode 100644 index 0000000000..b3c66828bb --- /dev/null +++ b/docs/reference/profiles/TrinoCertificate.rst @@ -0,0 +1,104 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +TrinoCertificate +================ + + + +Maps Airflow Trino connections to Certificate Trino dbt profiles. + +https://docs.getdbt.com/reference/warehouse-setups/trino-setup#certificate + +https://airflow.apache.org/docs/apache-airflow-providers-trino/stable/connections.html + + + +This profile mapping translates Airflow connections with the type ``trino`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. 
code-block:: python + + from cosmos.profiles import TrinoCertificateProfileMapping + + profile = TrinoCertificateProfileMapping( + conn_id="my_trino_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``client_certificate`` + - True + + - ``extra.certs__client_cert_path`` + + + * - ``client_private_key`` + - True + + - ``extra.certs__client_key_path`` + + + * - ``host`` + - True + + - ``host`` + + + * - ``port`` + - True + + - ``port`` + + + * - ``user`` + - False + + - ``login`` + + + * - ``session_properties`` + - False + + - ``extra.session_properties`` + + + * - ``database`` + - True + + - + + + * - ``schema`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. 
diff --git a/docs/reference/profiles/TrinoJWT.rst b/docs/reference/profiles/TrinoJWT.rst new file mode 100644 index 0000000000..3fe718ad7d --- /dev/null +++ b/docs/reference/profiles/TrinoJWT.rst @@ -0,0 +1,100 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +TrinoJWT +======== + + + +Maps Airflow Trino connections to JWT Trino dbt profiles. + + + +https://docs.getdbt.com/reference/warehouse-setups/trino-setup#jwt + +https://airflow.apache.org/docs/apache-airflow-providers-trino/stable/connections.html + + + +This profile mapping translates Airflow connections with the type ``trino`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import TrinoJWTProfileMapping + + profile = TrinoJWTProfileMapping( + conn_id="my_trino_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``jwt_token`` + - True + + - ``extra.jwt__token`` + + + * - ``host`` + - True + + - ``host`` + + + * - ``port`` + - True + + - ``port`` + + + * - ``user`` + - False + + - ``login`` + + + * - ``session_properties`` + - False + + - ``extra.session_properties`` + + + * - ``database`` + - True + + - + + + * - ``schema`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. 
+- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/TrinoLDAP.rst b/docs/reference/profiles/TrinoLDAP.rst new file mode 100644 index 0000000000..f527558d7c --- /dev/null +++ b/docs/reference/profiles/TrinoLDAP.rst @@ -0,0 +1,100 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +TrinoLDAP +========= + + + +Maps Airflow Trino connections to LDAP Trino dbt profiles. + + + +https://docs.getdbt.com/reference/warehouse-setups/trino-setup#ldap + +https://airflow.apache.org/docs/apache-airflow-providers-trino/stable/connections.html + + + +This profile mapping translates Airflow connections with the type ``trino`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import TrinoLDAPProfileMapping + + profile = TrinoLDAPProfileMapping( + conn_id="my_trino_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``user`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``host`` + - True + + - ``host`` + + + * - ``port`` + - True + + - ``port`` + + + * - ``session_properties`` + - False + + - ``extra.session_properties`` + + + * - ``database`` + - True + + - + + + * - ``schema`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/VerticaUserPassword.rst b/docs/reference/profiles/VerticaUserPassword.rst new file mode 100644 index 0000000000..52141602cc --- /dev/null +++ b/docs/reference/profiles/VerticaUserPassword.rst @@ -0,0 +1,202 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + + +VerticaUserPassword +=================== + + + +Maps Airflow Vertica connections using username + password authentication to dbt profiles. + +.. note:: + + Use Airflow connection ``schema`` for vertica ``database`` to keep it consistent with other connection types and profiles. The Vertica Airflow provider hook `assumes this `_. 
+ + This seems to be a common approach also for `Postgres `_, Redshift and Exasol since there is no ``database`` field in Airflow connection and ``schema`` is not required for the database connection. + +.. seealso:: + + https://docs.getdbt.com/reference/warehouse-setups/vertica-setup + + https://airflow.apache.org/docs/apache-airflow-providers-vertica/stable/connections/vertica.html + + + +This profile mapping translates Airflow connections with the type ``vertica`` +into dbt profiles. To use this profile, import it from ``cosmos.profiles``: + +.. code-block:: python + + from cosmos.profiles import VerticaUserPasswordProfileMapping + + profile = VerticaUserPasswordProfileMapping( + conn_id="my_vertica_connection", + profile_args={...}, + ) + +While the profile mapping pulls fields from Airflow connections, you may need to supplement it +with additional ``profile_args``. The below table shows which fields are required, along with those +not required but pulled from the Airflow connection if present. You can also add additional fields +to the ``profile_args`` dict. + +.. 
list-table:: + :header-rows: 1 + + * - dbt Field Name + - Required + - Airflow Field Name + + + * - ``host`` + - True + + - ``host`` + + + * - ``username`` + - True + + - ``login`` + + + * - ``password`` + - True + + - ``password`` + + + * - ``port`` + - False + + - ``port`` + + + * - ``database`` + - True + + - ``schema`` + + + * - ``autocommit`` + - False + + - ``extra.autocommit`` + + + * - ``backup_server_node`` + - False + + - ``extra.backup_server_node`` + + + * - ``binary_transfer`` + - False + + - ``extra.binary_transfer`` + + + * - ``connection_load_balance`` + - False + + - ``extra.connection_load_balance`` + + + * - ``connection_timeout`` + - False + + - ``extra.connection_timeout`` + + + * - ``disable_copy_local`` + - False + + - ``extra.disable_copy_local`` + + + * - ``kerberos_host_name`` + - False + + - ``extra.kerberos_host_name`` + + + * - ``kerberos_service_name`` + - False + + - ``extra.kerberos_service_name`` + + + * - ``log_level`` + - False + + - ``extra.log_level`` + + + * - ``log_path`` + - False + + - ``extra.log_path`` + + + * - ``oauth_access_token`` + - False + + - ``extra.oauth_access_token`` + + + * - ``request_complex_types`` + - False + + - ``extra.request_complex_types`` + + + * - ``session_label`` + - False + + - ``extra.session_label`` + + + * - ``ssl`` + - False + + - ``extra.ssl`` + + + * - ``unicode_error`` + - False + + - ``extra.unicode_error`` + + + * - ``use_prepared_statements`` + - False + + - ``extra.use_prepared_statements`` + + + * - ``workload`` + - False + + - ``extra.workload`` + + + * - ``schema`` + - True + + - + + + + +Some notes about the table above: + +- This table doesn't necessarily show the full list of fields you *can* pass to the dbt profile. To + see the full list of fields, see the link to the dbt docs at the top of this page. +- If the Airflow field name starts with an ``extra.``, this means that the field is nested under + the ``extra`` field in the Airflow connection. 
For example, if the Airflow field name is + ``extra.token``, this means that the field is nested under ``extra`` in the Airflow connection, + and the field name is ``token``. +- If there are multiple Airflow field names, the profile mapping looks at those fields in order. + For example, if the Airflow field name is ``['password', 'extra.token']``, the profile mapping + will first look for a field named ``password``. If that field is not present, it will look for + ``extra.token``. diff --git a/docs/reference/profiles/index.rst b/docs/reference/profiles/index.rst new file mode 100644 index 0000000000..1921444bd0 --- /dev/null +++ b/docs/reference/profiles/index.rst @@ -0,0 +1,250 @@ +.. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. + +.. toctree:: + :caption: Profiles + + self + AthenaAccessKey + ClickhouseUserPassword + GoogleCloudServiceAccountFile + GoogleCloudServiceAccountDict + GoogleCloudOauth + DatabricksToken + DatabricksOauth + DuckDBUserPassword + MysqlUserPassword + OracleUserPassword + PostgresUserPassword + RedshiftUserPassword + SnowflakeUserPassword + SnowflakeEncryptedPrivateKeyFilePem + SnowflakeEncryptedPrivateKeyPem + SnowflakePrivateKeyPem + StarrocksUserPassword + SparkThrift + ExasolUserPassword + TeradataUserPassword + TrinoLDAP + TrinoCertificate + TrinoJWT + VerticaUserPassword + StandardSQLServerAuth + + +Profiles Overview +================== + +Cosmos supports two methods of authenticating with your database: + +- using your own dbt profiles.yml file +- using Airflow connections via Cosmos' profile mappings + +If you're already interacting with your database from Airflow and have a connection set up, it's recommended +to use a profile mapping to translate that Airflow connection to a dbt profile. This is because it's easier to +maintain a single connection object in Airflow than it is to maintain a connection object in Airflow and a dbt profile +in your dbt project. 
+ +If you don't already have an Airflow connection, or if there's no readily-available profile mapping for your database, +you can use your own dbt profiles.yml file. + +Regardless of which method you use, you'll need to tell Cosmos which profile and target name it should use. Profile config +is set in the ``cosmos.config.ProfileConfig`` object, like so: + +.. code-block:: python + + from cosmos.config import ProfileConfig + + profile_config = ProfileConfig( + profile_name="my_profile_name", + target_name="my_target_name", + # choose one of the following + profile_mapping=..., + profiles_yml_filepath=..., + ) + + dag = DbtDag(profile_config=profile_config, ...) + + +Using your own profiles.yml file +++++++++++++++++++++++++++++++++ + +If you don't want to use Airflow connections, or if there's no readily-available profile mapping for your database, +you can use your own dbt profiles.yml file. To do so, you'll need to pass the path to your profiles.yml file to the +``profiles_yml_filepath`` argument in ``ProfileConfig``. + +For example, the code snippet below points Cosmos at a ``profiles.yml`` file and instructs Cosmos to use the +``my_snowflake_profile`` profile and ``dev`` target: + +.. code-block:: python + + from cosmos.config import ProfileConfig + + profile_config = ProfileConfig( + profile_name="my_snowflake_profile", + target_name="dev", + profiles_yml_filepath="/path/to/profiles.yml", + ) + + dag = DbtDag(profile_config=profile_config, ...) + +Using a profile mapping ++++++++++++++++++++++++ + +Profile mappings are utilities provided by Cosmos that translate Airflow connections to dbt profiles. This means that +you can use the same connection objects you use in Airflow to authenticate with your database in dbt. To do so, there's +a class in Cosmos for each Airflow connection to dbt profile mapping. + +You can find the available profile mappings on the left-hand side of this page. 
Each profile mapping is imported from +``cosmos.profiles`` and takes two arguments: + +* ``conn_id``: the Airflow connection ID to use. +* ``profile_args``: a dictionary of additional arguments to pass to the dbt profile. This is useful for specifying + values that are not in the Airflow connection. This also acts as an override for any values that are in the Airflow + connection but should be overridden. + +Below is an example of using the Snowflake profile mapping, where we take most arguments from the Airflow connection +but override the ``database`` and ``schema`` values: + +.. code-block:: python + + from cosmos.profiles import SnowflakeUserPasswordProfileMapping + + profile_config = ProfileConfig( + profile_name="my_profile_name", + target_name="my_target_name", + profile_mapping=SnowflakeUserPasswordProfileMapping( + conn_id="my_snowflake_conn_id", + profile_args={ + "database": "my_snowflake_database", + "schema": "my_snowflake_schema", + }, + ), + ) + + dag = DbtDag(profile_config=profile_config, ...) + +Note that when using a profile mapping, the profiles.yml file gets generated with the profile name and target name +you specify in ``ProfileConfig``. + +.. _profile-customise-per-node: + +Customising the profile config per dbt node ++++++++++++++++++++++++++++++++++++++++++++ +.. versionadded:: 1.9.0 + + +Since Cosmos 1.9.0, it is possible to customise which profile is used per dbt node. This works both when using a + ``profile_mapping`` class and when using ``profiles_yml_filepath``. + +Let's say the user configures the profile at a ``DbtDag`` or ``DbtTaskGroup`` level as: + +.. 
code-block:: python + + from cosmos.profiles import PostgresUserPasswordProfileMapping + + profile_config = ProfileConfig( + profile_name="default_profile", + target_name="default_target", + profile_mapping=PostgresUserPasswordProfileMapping( + conn_id="default_conn", + profile_args={"schema": "default_schema"}, + ), + ) + +But for a specific node or group of nodes, the user would like to replace: + +* ``profile_name`` to be "non_default_profile" as opposed to "default_profile" +* ``target_name`` to be "stage" as opposed to "default_target" +* ``conn_id`` to be "non_default_connection" as opposed to "default_conn" +* ``schema`` to be "non_default_schema" as opposed to "default_schema" + +They could apply this different configuration to all the project seeds by doing: + +.. code-block:: yaml + + seeds: + my_dbt_project: + +meta: + cosmos: + profile_config: + profile_name: non_default_profile + target_name: stage + profile_mapping: + conn_id: non_default_connection + profile_args: + schema: non_default_schema + +This same mechanism works for individual dbt nodes, as discussed in :ref:`operator-args-per-node`, +or for subsets of nodes selected based on path or other criteria that dbt supports. + + +Dbt profile config variables +---------------------------- +.. versionadded:: 1.4.0 + +The parts of ``profiles.yml`` that aren't specific to a particular data platform (see the `dbt docs <https://docs.getdbt.com/docs/core/connect-data-platform/profiles.yml>`_) +can be set by passing ``dbt_config_vars`` to the profile mapping, as in the example below: + +.. 
code-block:: python + + from cosmos.profiles import SnowflakeUserPasswordProfileMapping, DbtProfileConfigVars + + profile_config = ProfileConfig( + profile_name="my_profile_name", + target_name="my_target_name", + profile_mapping=SnowflakeUserPasswordProfileMapping( + conn_id="my_snowflake_conn_id", + profile_args={ + "database": "my_snowflake_database", + "schema": "my_snowflake_schema", + }, + dbt_config_vars=DbtProfileConfigVars( + send_anonymous_usage_stats=False, + partial_parse=True, + use_experimental_parse=True, + static_parser=True, + printer_width=120, + write_json=True, + warn_error=True, + warn_error_options={"include": "all"}, + log_format="text", + debug=True, + version_check=True, + ), + ), + ) + + dag = DbtDag(profile_config=profile_config, ...) + + +Disabling dbt event tracking +++++++++++++++++++++++++++++ + +.. note:: + Deprecated in v1.4 and will be removed in v2.0.0. Use ``dbt_config_vars=DbtProfileConfigVars(send_anonymous_usage_stats=False)`` instead. +.. versionadded:: 1.3 + +By default `dbt will track events <https://docs.getdbt.com/reference/global-configs/usage-stats>`_ by sending anonymous usage data +when dbt commands are invoked. Users have an option to opt out of event tracking by updating their ``profiles.yml`` file. + +If you'd like to disable this behavior in the Cosmos-generated profile, you can pass ``disable_event_tracking=True`` to the profile mapping, as in +the example below: + +.. code-block:: python + + from cosmos.profiles import SnowflakeUserPasswordProfileMapping + + profile_config = ProfileConfig( + profile_name="my_profile_name", + target_name="my_target_name", + profile_mapping=SnowflakeUserPasswordProfileMapping( + conn_id="my_snowflake_conn_id", + profile_args={ + "database": "my_snowflake_database", + "schema": "my_snowflake_schema", + }, + disable_event_tracking=True, + ), + ) + + dag = DbtDag(profile_config=profile_config, ...) 
diff --git a/docs/templates/index.rst.jinja2 b/docs/reference/templates/index.rst.jinja2 similarity index 98% rename from docs/templates/index.rst.jinja2 rename to docs/reference/templates/index.rst.jinja2 index 87285565c3..65bc676c4d 100644 --- a/docs/templates/index.rst.jinja2 +++ b/docs/reference/templates/index.rst.jinja2 @@ -1,5 +1,5 @@ .. - This file is autogenerated by `docs/scripts/generate_mappings.py`. Do not edit by hand. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. .. toctree:: :caption: Profiles diff --git a/docs/templates/profile_mapping.rst.jinja2 b/docs/reference/templates/profile_mapping.rst.jinja2 similarity index 96% rename from docs/templates/profile_mapping.rst.jinja2 rename to docs/reference/templates/profile_mapping.rst.jinja2 index c5b25b48b1..2154f778ed 100644 --- a/docs/templates/profile_mapping.rst.jinja2 +++ b/docs/reference/templates/profile_mapping.rst.jinja2 @@ -1,5 +1,5 @@ .. - This file is autogenerated by `docs/scripts/generate_mappings.py`. Do not edit by hand. + This file is autogenerated by ``docs/scripts/generate_mappings.py``. Do not edit by hand. {{ mapping_name }}