@dataclasses.dataclass
class DbtConfigVars:
    """
    Container for the dbt settings that ``BaseProfileMapping`` writes under the ``config`` key of the
    generated profile.

    Every field is optional. Fields left as ``None`` are omitted from :meth:`as_dict`. Values are validated
    when the instance is created.

    :raises CosmosValueError: if a field has an unexpected type or a value outside its accepted values.
    """

    send_anonymous_usage_stats: Optional[bool] = False
    partial_parse: Optional[bool] = None
    use_experimental_parser: Optional[bool] = None
    static_parser: Optional[bool] = None
    printer_width: Optional[int] = None
    write_json: Optional[bool] = None
    warn_error: Optional[str] = None
    warn_error_options: Optional[dict[str, Any]] = None
    log_format: Optional[str] = None
    debug: Optional[bool] = None
    version_check: Optional[bool] = None

    def _validate_data(self) -> None:
        """
        Check the type, and where applicable the accepted values, of every field that is not ``None``.

        :raises CosmosValueError: on the first field that fails a check.
        """
        checks: dict[str, dict[str, Any]] = {
            "send_anonymous_usage_stats": {"var_type": bool},
            "partial_parse": {"var_type": bool},
            "use_experimental_parser": {"var_type": bool},
            "static_parser": {"var_type": bool},
            "printer_width": {"var_type": int},
            "write_json": {"var_type": bool},
            "warn_error": {"var_type": str},
            "warn_error_options": {"var_type": dict, "accepted_values": {"include", "exclude"}},
            "log_format": {"var_type": str, "accepted_values": {"text", "json", "default"}},
            "debug": {"var_type": bool},
            "version_check": {"var_type": bool},
        }

        for field in dataclasses.fields(self):
            field_name = field.name
            field_value = getattr(self, field_name)
            if field_value is None:
                continue

            vars_check = checks.get(field_name, {})

            # A field without a declared type is not type-checked. Falling back to ``typing.Any`` would make
            # ``isinstance`` raise ``TypeError``.
            var_type = vars_check.get("var_type")
            if var_type is not None:
                # ``bool`` is a subclass of ``int``, so ``isinstance(True, int)`` is True. Reject booleans
                # explicitly wherever a non-bool type is expected.
                is_stray_bool = var_type is not bool and isinstance(field_value, bool)
                if is_stray_bool or not isinstance(field_value, var_type):
                    raise CosmosValueError(f"dbt config var {field_name}: {field_value} must be a {var_type}")

            accepted_values = vars_check.get("accepted_values")
            if accepted_values:
                # For dict-valued fields the accepted values constrain the keys. Testing the dict itself for
                # set membership would raise ``TypeError`` because dicts are unhashable.
                candidates = field_value if isinstance(field_value, dict) else (field_value,)
                if any(candidate not in accepted_values for candidate in candidates):
                    raise CosmosValueError(
                        f"dbt config var {field_name}: {field_value} must be one of {accepted_values}"
                    )

    def __post_init__(self) -> None:
        """Validate the field values as soon as the dataclass has been initialised."""
        self._validate_data()

    def as_dict(self) -> Optional[dict[str, Any]]:
        """
        Return the fields that are not ``None`` as a dictionary keyed by field name.

        :returns: the populated fields, or ``None`` when no field is set.
        """
        result = {}
        for field in dataclasses.fields(self):
            value = getattr(self, field.name)
            if value is not None:
                result[field.name] = value
        return result or None
def __init__(
    self,
    conn_id: str,
    profile_args: dict[str, Any] | None = None,
    dbt_config_vars: DbtConfigVars | None = None,
    disable_event_tracking: bool | None = None,
):
    """
    Store the connection id, the profile overrides and the dbt config vars for this mapping.

    :param conn_id: stored on the instance as ``conn_id``.
    :param profile_args: profile overrides; ``None`` is stored as an empty dict and then checked by
        ``_validate_profile_args``.
    :param dbt_config_vars: values written under the ``config`` key of the generated profile. Defaults to
        ``DbtConfigVars()``.
    :param disable_event_tracking: deprecated keyword kept so existing callers do not break. When given
        without ``dbt_config_vars`` it is translated into
        ``DbtConfigVars(send_anonymous_usage_stats=not disable_event_tracking)``. An explicit
        ``dbt_config_vars`` takes precedence.
    """
    # Imported locally so this block does not depend on the module-level import list.
    import warnings

    self.conn_id = conn_id
    self.profile_args = profile_args or {}
    self._validate_profile_args()

    if disable_event_tracking is not None:
        warnings.warn(
            "`disable_event_tracking` is deprecated; "
            "use `dbt_config_vars=DbtConfigVars(send_anonymous_usage_stats=...)` instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        if dbt_config_vars is None:
            dbt_config_vars = DbtConfigVars(send_anonymous_usage_stats=not disable_event_tracking)

    self.dbt_config_vars = dbt_config_vars or DbtConfigVars()
b/docs/configuration/parsing-methods.rst index ef50bdb4e6..ab31c00d4f 100644 --- a/docs/configuration/parsing-methods.rst +++ b/docs/configuration/parsing-methods.rst @@ -16,7 +16,7 @@ There are benefits and drawbacks to each method: - ``dbt_manifest``: You have to generate the manifest file on your own. When using the manifest, Cosmos gets a complete set of metadata about your models. However, Cosmos uses its own selecting & excluding logic to determine which models to run, which may not be as robust as dbt's. - ``dbt_ls``: Cosmos will generate the manifest file for you. This method uses dbt's metadata AND dbt's selecting/excluding logic. This is the most robust method. However, this requires the dbt executable to be installed on your machine (either on the host directly or in a virtual environment). - ``dbt_ls_file`` (new in 1.3): Path to a file containing the ``dbt ls`` output. To use this method, run ``dbt ls`` using ``--output json`` and store the output in a file. ``RenderConfig.select`` and ``RenderConfig.exclude`` will not work using this method. -- ``custom``: Cosmos will parse your project and model files for you. This means that Cosmos will not have access to dbt's metadata. However, this method does not require the dbt executable to be installed on your machine. +- ``custom``: Cosmos will parse your project and model files. This means that Cosmos will not have access to dbt's metadata. However, this method does not require the dbt executable to be installed on your machine, and does not require the user to provide any dbt artifacts. If you're using the ``local`` mode, you should use the ``dbt_ls`` method. @@ -60,7 +60,7 @@ To use this: .. note:: - This only works for the ``local`` execution mode. + This only works if a dbt command / executable is available to the scheduler. If you don't have a ``manifest.json`` file, Cosmos will attempt to generate one from your dbt project. It does this by running ``dbt ls`` and parsing the output. 
diff --git a/docs/getting_started/astro.rst b/docs/getting_started/astro.rst index c0bedc7e64..8aaa194e5f 100644 --- a/docs/getting_started/astro.rst +++ b/docs/getting_started/astro.rst @@ -20,7 +20,7 @@ Create a virtual environment in your ``Dockerfile`` using the sample below. Be s .. code-block:: docker - FROM quay.io/astronomer/astro-runtime:8.8.0 + FROM quay.io/astronomer/astro-runtime:10.0.0 # install dbt into a virtual environment RUN python -m venv dbt_venv && source dbt_venv/bin/activate && \ diff --git a/docs/getting_started/execution-modes-local-conflicts.rst b/docs/getting_started/execution-modes-local-conflicts.rst index 3e201bef8f..96921b6f7f 100644 --- a/docs/getting_started/execution-modes-local-conflicts.rst +++ b/docs/getting_started/execution-modes-local-conflicts.rst @@ -1,10 +1,10 @@ .. _execution-modes-local-conflicts: -Airflow and DBT dependencies conflicts +Airflow and dbt dependencies conflicts ====================================== When using the `Local Execution Mode `__, users may face dependency conflicts between -Apache Airflow and DBT. The conflicts may increase depending on the Airflow providers and DBT plugins being used. +Apache Airflow and dbt. The conflicts may increase depending on the Airflow providers and dbt adapters being used. If you find errors, we recommend users look into using `alternative execution modes `__. @@ -25,10 +25,26 @@ In the following table, ``x`` represents combinations that lead to conflicts (va +---------------+-----+-----+-----+-----+-----+-----+-----+-----+ | 2.7 | x | x | x | x | x | | | | +---------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| 2.8 | x | x | x | x | x | | x | x | ++---------------+-----+-----+-----+-----+-----+-----+-----+-----+ + Examples of errors ----------------------------------- +.. 
code-block:: bash + + The conflict is caused by: + apache-airflow 2.8.0 depends on pydantic>=2.3.0 + dbt-semantic-interfaces 0.4.2 depends on pydantic~=1.10 + apache-airflow 2.8.0 depends on pydantic>=2.3.0 + dbt-semantic-interfaces 0.4.2.dev0 depends on pydantic~=1.10 + apache-airflow 2.8.0 depends on pydantic>=2.3.0 + dbt-semantic-interfaces 0.4.1 depends on pydantic~=1.10 + apache-airflow 2.8.0 depends on pydantic>=2.3.0 + dbt-semantic-interfaces 0.4.0 depends on pydantic~=1.10 + + .. code-block:: bash ERROR: Cannot install apache-airflow==2.2.4 and dbt-core==1.5.0 because these package versions have conflicting dependencies. @@ -78,7 +94,7 @@ The table was created by running `nox `__ wi @nox.parametrize( "dbt_version", ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"] ) - @nox.parametrize("airflow_version", ["2.2.4", "2.3", "2.4", "2.5", "2.6", "2.7"]) + @nox.parametrize("airflow_version", ["2.2.4", "2.3", "2.4", "2.5", "2.6", "2.7", "2.8"]) def compatibility(session: nox.Session, airflow_version, dbt_version) -> None: """Run both unit and integration tests.""" session.run( diff --git a/docs/getting_started/execution-modes.rst b/docs/getting_started/execution-modes.rst index 5925853fe1..7211382387 100644 --- a/docs/getting_started/execution-modes.rst +++ b/docs/getting_started/execution-modes.rst @@ -72,7 +72,10 @@ In this case, users are responsible for declaring which version of ``dbt`` they Similar to the ``local`` execution mode, Cosmos converts Airflow Connections into a way ``dbt`` understands them by creating a ``dbt`` profile file (``profiles.yml``). -A drawback with this approach is that it is slower than ``local`` because it creates a new Python virtual environment for each Cosmos dbt task run. +Some drawbacks of this approach: + +- It is slower than ``local`` because it creates a new Python virtual environment for each Cosmos dbt task run. +- If dbt is unavailable in the Airflow scheduler, the default ``LoadMode.DBT_LS`` will not work. 
In this scenario, users must use a `parsing method `_ that does not rely on dbt, such as ``LoadMode.MANIFEST``. Example of how to use: @@ -91,6 +94,7 @@ The user has better environment isolation than when using ``local`` or ``virtual The other challenge with the ``docker`` approach is if the Airflow worker is already running in Docker, which sometimes can lead to challenges running `Docker in Docker `__. This approach can be significantly slower than ``virtualenv`` since it may have to build the ``Docker`` container, which is slower than creating a Virtualenv with ``dbt-core``. +If dbt is unavailable in the Airflow scheduler, the default ``LoadMode.DBT_LS`` will not work. In this scenario, users must use a `parsing method `_ that does not rely on dbt, such as ``LoadMode.MANIFEST``. Check the step-by-step guide on using the ``docker`` execution mode at :ref:`docker`. diff --git a/docs/getting_started/gcc.rst b/docs/getting_started/gcc.rst index 1ec056e842..5baa9c37ed 100644 --- a/docs/getting_started/gcc.rst +++ b/docs/getting_started/gcc.rst @@ -22,6 +22,8 @@ Make a new folder, ``dbt``, inside your local ``dags`` folder. Then, copy/paste Note: your dbt projects can go anywhere that Airflow can read. By default, Cosmos looks in the ``/usr/local/airflow/dags/dbt`` directory, but you can change this by setting the ``dbt_project_dir`` argument when you create your DAG instance. +For more accurate parsing of your dbt project, you should pre-compile your dbt project's ``manifest.json`` (include ``dbt deps && dbt compile`` as part of your deployment process). + For example, if you wanted to put your dbt project in the ``/usr/local/airflow/dags/my_dbt_project`` directory, you would do: .. 
code-block:: python @@ -31,11 +33,15 @@ For example, if you wanted to put your dbt project in the ``/usr/local/airflow/d my_cosmos_dag = DbtDag( project_config=ProjectConfig( dbt_project_path="/usr/local/airflow/dags/my_dbt_project", + manifest_path="/usr/local/airflow/dags/my_dbt_project/target/manifest.json", ), # ..., ) +.. note:: + You can also exclude the ``manifest_path=...`` from the ``ProjectConfig``. Excluding a ``manifest_path`` file will by default use Cosmos's ``custom`` parsing method, which may be less accurate at parsing a dbt project compared to providing a ``manifest.json``. + Create your DAG --------------- diff --git a/docs/templates/index.rst.jinja2 b/docs/templates/index.rst.jinja2 index d5c3069111..a41a63c37d 100644 --- a/docs/templates/index.rst.jinja2 +++ b/docs/templates/index.rst.jinja2 @@ -38,6 +38,9 @@ is set in the ``cosmos.config.ProfileConfig`` object, like so: # choose one of the following profile_mapping=..., profiles_yml_filepath=..., + + # if profile_mapping is used, you can also pass dbt config vars + dbt_config_vars=..., ) dag = DbtDag(profile_config=profile_config, ...) @@ -83,19 +86,15 @@ but override the ``database`` and ``schema`` values: Note that when using a profile mapping, the profiles.yml file gets generated with the profile name and target name you specify in ``ProfileConfig``. -Disabling dbt event tracking +Dbt config vars -------------------------------- -.. versionadded:: 1.3 - -By default `dbt will track events `_ by sending anonymous usage data -when dbt commands are invoked. Users have an option to opt out of event tracking by updating their ``profiles.yml`` file. +.. versionadded:: 1.3.2 -If you'd like to disable this behavior in the Cosmos generated profile, you can pass ``disable_event_tracking=True`` to the profile mapping like in -the example below: +Use ``DbtConfigVars`` to set the parts of ``profiles.yml`` that aren't specific to a particular data platform (see the `dbt docs `_). .. 
code-block:: python - from cosmos.profiles import SnowflakeUserPasswordProfileMapping + from cosmos.profiles import SnowflakeUserPasswordProfileMapping, DbtConfigVars profile_config = ProfileConfig( profile_name="my_profile_name", @@ -106,15 +105,13 @@ the example below: "database": "my_snowflake_database", "schema": "my_snowflake_schema", }, - disable_event_tracking=True, + dbt_config_vars=DbtConfigVars(send_anonymous_usage_stats=True), ), ) dag = DbtDag(profile_config=profile_config, ...) - - Using your own profiles.yml file ++++++++++++++++++++++++++++++++++++ diff --git a/pyproject.toml b/pyproject.toml index 9d367c075f..5d966c91b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,7 +118,7 @@ kubernetes = [ "apache-airflow-providers-cncf-kubernetes>=5.1.1", ] pydantic = [ - "pydantic>=1.10.0,<2.0.0", + "pydantic>=1.10.0", ] [project.entry-points.cosmos] @@ -159,7 +159,7 @@ dependencies = [ [[tool.hatch.envs.tests.matrix]] python = ["3.8", "3.9", "3.10"] -airflow = ["2.3", "2.4", "2.5", "2.6", "2.7"] +airflow = ["2.3", "2.4", "2.5", "2.6", "2.7", "2.8"] [tool.hatch.envs.tests.overrides] matrix.airflow.dependencies = [ @@ -169,6 +169,7 @@ matrix.airflow.dependencies = [ { value = "apache-airflow==2.6", if = ["2.6"] }, { value = "pydantic>=1.10.0,<2.0.0", if = ["2.6"]}, { value = "apache-airflow==2.7", if = ["2.7"] }, + { value = "apache-airflow==2.8", if = ["2.8"] }, ] [tool.hatch.envs.tests.scripts] diff --git a/tests/profiles/test_base_profile.py b/tests/profiles/test_base_profile.py index f2603d43cd..b81ce4e7de 100644 --- a/tests/profiles/test_base_profile.py +++ b/tests/profiles/test_base_profile.py @@ -1,9 +1,11 @@ from __future__ import annotations +from typing import Any + import pytest import yaml -from cosmos.profiles.base import BaseProfileMapping +from cosmos.profiles.base import BaseProfileMapping, DbtConfigVars from cosmos.exceptions import CosmosValueError @@ -36,17 +38,58 @@ def test_validate_profile_args(profile_arg: str): ) 
@pytest.mark.parametrize("dbt_config_var,dbt_config_value", [("debug", True), ("debug", False)])
def test_validate_dbt_config_vars(dbt_config_var: str, dbt_config_value: Any):
    """
    A dbt config var passed to the profile mapping must show up, unchanged, in the ``config`` block
    of the rendered profile.
    """
    mapping = TestProfileMapping(
        conn_id="fake_conn_id",
        dbt_config_vars=DbtConfigVars(**{dbt_config_var: dbt_config_value}),
    )
    rendered = yaml.safe_load(mapping.get_profile_file_contents(profile_name="fake-profile-name"))

    assert "config" in rendered
    assert rendered["config"][dbt_config_var] == dbt_config_value


@pytest.mark.parametrize(
    "dbt_config_var,dbt_config_value",
    [("send_anonymous_usage_stats", 2), ("send_anonymous_usage_stats", "aaa")],
)
def test_profile_config_validate_dbt_config_vars_check_unexpected_types(dbt_config_var: str, dbt_config_value: Any):
    """
    A value of the wrong type must be rejected with a ``CosmosValueError`` naming the offending var.
    """
    expected_prefix = f"dbt config var {dbt_config_var}: {dbt_config_value} must be a "

    with pytest.raises(CosmosValueError) as err_info:
        TestProfileMapping(
            conn_id="fake_conn_id",
            dbt_config_vars=DbtConfigVars(**{dbt_config_var: dbt_config_value}),
        )

    assert err_info.value.args[0].startswith(expected_prefix)


@pytest.mark.parametrize("dbt_config_var,dbt_config_value", [("send_anonymous_usage_stats", True)])
def test_profile_config_validate_dbt_config_vars_check_expected_types(dbt_config_var: str, dbt_config_value: Any):
    """
    A correctly typed value must be accepted and returned as-is by ``as_dict``.
    """
    expected = {dbt_config_var: dbt_config_value}

    mapping = TestProfileMapping(
        conn_id="fake_conn_id",
        dbt_config_vars=DbtConfigVars(**expected),
    )

    assert mapping.dbt_config_vars.as_dict() == expected


@pytest.mark.parametrize(
    "dbt_config_var,dbt_config_value",
    [("log_format", "text2")],
)
def test_profile_config_validate_dbt_config_vars_check_values(dbt_config_var: str, dbt_config_value: Any):
    """
    A correctly typed value outside the accepted set must be rejected with a ``CosmosValueError``.
    """
    expected_prefix = f"dbt config var {dbt_config_var}: {dbt_config_value} must be one of "

    with pytest.raises(CosmosValueError) as err_info:
        TestProfileMapping(
            conn_id="fake_conn_id",
            dbt_config_vars=DbtConfigVars(**{dbt_config_var: dbt_config_value}),
        )

    assert err_info.value.args[0].startswith(expected_prefix)