From 4f50051acea4a49ac8322f57de67bc3f611b5236 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Wed, 8 May 2024 23:28:08 +0100 Subject: [PATCH 1/7] Add Scarf based telemetry --- airflow/config_templates/config.yml | 19 +++++++++++++++++++ airflow/settings.py | 4 ++++ airflow/www/templates/airflow/dags.html | 3 +++ airflow/www/views.py | 15 +++++++++++++++ docs/apache-airflow/faq.rst | 12 ++++++++++++ .../installation/installing-from-pypi.rst | 11 +++++++++++ 6 files changed, 64 insertions(+) diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index 95d83f9d4c7f0..eeb44c7bdb840 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -2591,3 +2591,22 @@ sensors: type: float example: ~ default: "604800" +scarf_analytics: + description: | + Airflow integrates `Scarf `__ to collect basic telemetry data during operation. + This data assists Airflow maintainers in better understanding how Airflow is used. + Insights gained from this telemetry are critical for prioritizing patches, minor releases, and + security fixes. Additionally, this information supports key decisions related to the development Roadmap. + + Users can easily opt out of analytics in various ways documented + `here `__ and by setting the ``enabled`` option + to ``False``. Airflow also respects ``SCARF_ANALYTICS=false`` environment variable. + + options: + enabled: + description: | + Enable or disable telemetry data collection and sending via Scarf. + version_added: 2.10.0 + type: boolean + example: ~ + default: "True" diff --git a/airflow/settings.py b/airflow/settings.py index 7b8a222444574..383735b73ab7d 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -666,3 +666,7 @@ def initialize(): "y", "1", } + +IS_SCARF_ANALYTICS_ENABLED = conf.getboolean("scarf_analytics", "enabled", fallback=True) and ( + os.getenv("SCARF_ANALYTICS") != "false" +) diff --git a/airflow/www/templates/airflow/dags.html b/airflow/www/templates/airflow/dags.html index 1bb5ac25abf4d..29b97f8fd27c1 100644 --- a/airflow/www/templates/airflow/dags.html +++ b/airflow/www/templates/airflow/dags.html @@ -490,4 +490,7 @@

{{ page_title }}

return false; } + {% if scarf_url %} + + {% endif %} {% endblock %} diff --git a/airflow/www/views.py b/airflow/www/views.py index 26c4d8ff1a845..e3d6e4332ab43 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -27,6 +27,7 @@ import math import operator import os +import platform import sys import traceback import warnings @@ -1034,6 +1035,19 @@ def _iter_parsed_moved_data_table_names(): "warning", ) + scarf_url = "" + if settings.IS_SCARF_ANALYTICS_ENABLED: + scarf_url = ( + f"https://apacheairflow.gateway.scarf.sh/web?version={version}" + f"&python_version={platform.python_version()}" + f"&platform={platform.system()}" + f"&arch={platform.machine()}" + f"&database={settings.engine.dialect.name}" + f"&db_version={settings.engine.dialect.server_version_info}" + f"&executor={conf.get('core', 'EXECUTOR')}" + f"&num_dags={all_dags_count}" + ) + return self.render_template( "airflow/dags.html", dags=dags, @@ -1072,6 +1086,7 @@ def _iter_parsed_moved_data_table_names(): sorting_direction=arg_sorting_direction, auto_refresh_interval=conf.getint("webserver", "auto_refresh_interval"), dataset_triggered_next_run_info=dataset_triggered_next_run_info, + scarf_url=scarf_url, ) @expose("/datasets") diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 9643663eb76f5..1ace1186b481a 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -522,3 +522,15 @@ This means ``explicit_defaults_for_timestamp`` is disabled in your mysql server #. Set ``explicit_defaults_for_timestamp = 1`` under the ``mysqld`` section in your ``my.cnf`` file. #. Restart the Mysql server. + +Does Airflow collect any telemetry data? +---------------------------------------- + +Airflow integrates `Scarf `__ to collect basic telemetry data during operation. +This data assists Airflow maintainers in better understanding how Airflow is used. +Insights gained from this telemetry are critical for prioritizing patches, minor releases, and +security fixes. Additionally, this information supports key decisions related to the development Roadmap. + +Users can easily opt out of analytics in various ways documented +`here `__ and by setting the :ref:`config:scarf_analytics__enabled` option +to ``False``. Airflow also respects ``SCARF_ANALYTICS=false`` environment variable. diff --git a/docs/apache-airflow/installation/installing-from-pypi.rst b/docs/apache-airflow/installation/installing-from-pypi.rst index bd4ecbcbe10e5..bd2e2b44b6b4c 100644 --- a/docs/apache-airflow/installation/installing-from-pypi.rst +++ b/docs/apache-airflow/installation/installing-from-pypi.rst @@ -331,6 +331,17 @@ dependencies compatible with just airflow core at the moment Airflow was release pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" +.. note:: + + Airflow uses `Scarf `__ to collect basic telemetry data during operation. + This data assists Airflow maintainers in better understanding how Airflow is used. + Insights gained from this telemetry are critical for prioritizing patches, minor releases, and + security fixes. Additionally, this information supports key decisions related to the development Roadmap. + + Users can easily opt out of analytics in various ways documented + `here `__ and by setting the :ref:`config:scarf_analytics__enabled` option + to ``False``. Airflow also respects ``SCARF_ANALYTICS=false`` environment variable. + Troubleshooting ''''''''''''''' From b0b9fba0e6a3110a396c63b67dd7d2ef31fe8610 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Thu, 9 May 2024 02:09:28 +0100 Subject: [PATCH 2/7] Add Scarf based telemetry on Scheduler startup Use Pixel for Webserver & Update URL for scheduler Improve docs Convert variable to function --- airflow/cli/commands/scheduler_command.py | 3 + airflow/config_templates/config.yml | 11 ++-- airflow/settings.py | 11 ++-- airflow/utils/scarf.py | 61 ++++++++++++++++++ airflow/www/views.py | 29 ++++++--- docs/apache-airflow/faq.rst | 20 ++++-- .../installation/installing-from-pypi.rst | 9 +-- tests/core/test_settings.py | 24 ++++++- tests/utils/test_scarf.py | 64 +++++++++++++++++++ tests/www/views/test_views_home.py | 14 ++++ 10 files changed, 217 insertions(+), 29 deletions(-) create mode 100644 airflow/utils/scarf.py create mode 100644 tests/utils/test_scarf.py diff --git a/airflow/cli/commands/scheduler_command.py b/airflow/cli/commands/scheduler_command.py index 2a55ca2373ed9..4f943e961b454 100644 --- a/airflow/cli/commands/scheduler_command.py +++ b/airflow/cli/commands/scheduler_command.py @@ -33,6 +33,7 @@ from airflow.utils import cli as cli_utils from airflow.utils.cli import process_subdir from airflow.utils.providers_configuration_loader import providers_configuration_loaded +from airflow.utils.scarf import scarf_analytics from airflow.utils.scheduler_health import serve_health_check log = logging.getLogger(__name__) @@ -55,6 +56,8 @@ def scheduler(args: Namespace): """Start Airflow Scheduler.""" print(settings.HEADER) + scarf_analytics() + run_command_with_daemon_option( args=args, process_name="scheduler", diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index eeb44c7bdb840..c1cd10e85068c 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -2596,11 +2596,13 @@ scarf_analytics: Airflow integrates `Scarf `__ to collect basic telemetry data during operation. This data assists Airflow maintainers in better understanding how Airflow is used. Insights gained from this telemetry are critical for prioritizing patches, minor releases, and - security fixes. Additionally, this information supports key decisions related to the development Roadmap. + security fixes. Additionally, this information supports key decisions related to the development roadmap. + Check the FAQ doc for more information on what data is collected. - Users can easily opt out of analytics in various ways documented - `here `__ and by setting the ``enabled`` option - to ``False``. Airflow also respects ``SCARF_ANALYTICS=false`` environment variable. + Deployments can opt-out of analytics by setting the ``enabled`` option + to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. + Individual users can easily opt-out of analytics in various ways documented in the + `Scarf Do Not Track docs `__. options: enabled: @@ -2610,3 +2612,4 @@ scarf_analytics: type: boolean example: ~ default: "True" + see_also: ":ref:`Airflow telemetry FAQ `" diff --git a/airflow/settings.py b/airflow/settings.py index 383735b73ab7d..868dba930b488 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -576,6 +576,13 @@ def initialize(): atexit.register(dispose_orm) +def is_scarf_analytics_enabled() -> bool: + """Check if scarf analytics is enabled.""" + return conf.getboolean("scarf_analytics", "enabled", fallback=True) and ( + os.getenv("SCARF_ANALYTICS") != "false" + ) + + # Const stuff KILOBYTE = 1024 @@ -666,7 +673,3 @@ def initialize(): "y", "1", } - -IS_SCARF_ANALYTICS_ENABLED = conf.getboolean("scarf_analytics", "enabled", fallback=True) and ( - os.getenv("SCARF_ANALYTICS") != "false" -) diff --git a/airflow/utils/scarf.py b/airflow/utils/scarf.py new file mode 100644 index 0000000000000..4f29be1594c72 --- /dev/null +++ b/airflow/utils/scarf.py @@ -0,0 +1,61 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import platform + +import httpx +from packaging.version import parse + +from airflow import __version__ as airflow_version, settings +from airflow.configuration import conf + + +def scarf_analytics(): + if not settings.is_scarf_analytics_enabled(): + return + + # Exclude pre-releases and dev versions + if _version_is_prerelease(airflow_version): + return + + scarf_domain = "https://apacheairflow.gateway.scarf.sh/scheduler" + + db_version = settings.engine.dialect.server_version_info + if db_version: + # Example: (1, 2, 3) -> "1.2.3" + db_version = ".".join(map(str, db_version)) + + try: + scarf_url = ( + f"{scarf_domain}?version={airflow_version}" + f"&python_version={platform.python_version()}" + f"&platform={platform.system()}" + f"&arch={platform.machine()}" + f"&database={settings.engine.dialect.name}" + f"&db_version={db_version}" + f"&executor={conf.get('core', 'EXECUTOR')}" + ) + + httpx.get(scarf_url, timeout=5.0) + except Exception: + pass + + +def _version_is_prerelease(version: str) -> bool: + return parse(version).is_prerelease diff --git a/airflow/www/views.py b/airflow/www/views.py index e3d6e4332ab43..8b6161582502d 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -1036,16 +1036,27 @@ def _iter_parsed_moved_data_table_names(): ) scarf_url = "" - if settings.IS_SCARF_ANALYTICS_ENABLED: + if settings.is_scarf_analytics_enabled(): + scarf_domain = "https://apacheairflow.gateway.scarf.sh" + + python_version = platform.python_version() + platform_sys = platform.system() + platform_arch = platform.machine() + db_name = settings.engine.dialect.name + db_version = settings.engine.dialect.server_version_info + if db_version: + # Example: (1, 2, 3) -> "1.2.3" + db_version = ".".join(map(str, db_version)) + executor = conf.get("core", "EXECUTOR") + + # Path Format: + # /{version}/{python_version}/{platform}/{arch}/{database}/{db_version}/{executor}/{num_dags} + # + # This path redirects to a Pixel tracking URL scarf_url = ( - f"https://apacheairflow.gateway.scarf.sh/web?version={version}" - f"&python_version={platform.python_version()}" - f"&platform={platform.system()}" - f"&arch={platform.machine()}" - f"&database={settings.engine.dialect.name}" - f"&db_version={settings.engine.dialect.server_version_info}" - f"&executor={conf.get('core', 'EXECUTOR')}" - f"&num_dags={all_dags_count}" + f"{scarf_domain}/webserver" + f"/{version}/{python_version}" + f"/{platform_sys}/{platform_arch}/{db_name}/{db_version}/{executor}/{all_dags_count}" ) return self.render_template( diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 1ace1186b481a..37e48421b84b1 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -526,11 +526,23 @@ This means ``explicit_defaults_for_timestamp`` is disabled in your mysql server Does Airflow collect any telemetry data? ---------------------------------------- +.. _airflow-telemetry-faq: + Airflow integrates `Scarf `__ to collect basic telemetry data during operation. This data assists Airflow maintainers in better understanding how Airflow is used. Insights gained from this telemetry are critical for prioritizing patches, minor releases, and -security fixes. Additionally, this information supports key decisions related to the development Roadmap. +security fixes. Additionally, this information supports key decisions related to the development roadmap. + +Deployments can opt-out of analytics by setting the :ref:`[scarf_analytics] enabled ` +option to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. +Individual users can easily opt-out of analytics in various ways documented in the +`Scarf Do Not Track docs `__. + +The telemetry data collected is limited to the following: -Users can easily opt out of analytics in various ways documented -`here `__ and by setting the :ref:`config:scarf_analytics__enabled` option -to ``False``. Airflow also respects ``SCARF_ANALYTICS=false`` environment variable. +- Airflow version +- Python version +- Operating system & machine architecture +- Executor +- Metadata DB type & their version +- Number of DAGs diff --git a/docs/apache-airflow/installation/installing-from-pypi.rst b/docs/apache-airflow/installation/installing-from-pypi.rst index bd2e2b44b6b4c..4751b54112601 100644 --- a/docs/apache-airflow/installation/installing-from-pypi.rst +++ b/docs/apache-airflow/installation/installing-from-pypi.rst @@ -334,13 +334,8 @@ dependencies compatible with just airflow core at the moment Airflow was release .. note:: Airflow uses `Scarf `__ to collect basic telemetry data during operation. - This data assists Airflow maintainers in better understanding how Airflow is used. - Insights gained from this telemetry are critical for prioritizing patches, minor releases, and - security fixes. Additionally, this information supports key decisions related to the development Roadmap. - - Users can easily opt out of analytics in various ways documented - `here `__ and by setting the :ref:`config:scarf_analytics__enabled` option - to ``False``. Airflow also respects ``SCARF_ANALYTICS=false`` environment variable. + Check the :ref:`Airflow telemetry FAQ ` for more information about the data collected + and how to opt-out. Troubleshooting ''''''''''''''' diff --git a/tests/core/test_settings.py b/tests/core/test_settings.py index 5eac456103f4a..3e58fe60ed4a3 100644 --- a/tests/core/test_settings.py +++ b/tests/core/test_settings.py @@ -28,7 +28,7 @@ from airflow.api_internal.internal_api_call import InternalApiConfig from airflow.exceptions import AirflowClusterPolicyViolation, AirflowConfigException -from airflow.settings import _ENABLE_AIP_44, TracebackSession +from airflow.settings import _ENABLE_AIP_44, TracebackSession, is_scarf_analytics_enabled from airflow.utils.session import create_session from tests.test_utils.config import conf_vars @@ -324,3 +324,25 @@ def test_create_session_ctx_mgr_no_call_methods(mock_new, clear_internal_api): assert session == m method_calls = [x[0] for x in m.method_calls] assert method_calls == [] # commit and close not called when using internal API + + +@pytest.mark.parametrize( + "env_var, conf_setting, is_enabled", + [ + ("false", "True", False), # env forces disable + ("false", "False", False), # Both force disable + ("true", "True", True), # Both enable + ("true", "False", False), # Conf forces disable + (None, "True", True), # Default env, conf enables + (None, "False", False), # Default env, conf disables + ], +) +def test_scarf_analytics_disabled(env_var, conf_setting, is_enabled): + conf_patch = conf_vars({("scarf_analytics", "enabled"): conf_setting}) + + if env_var is not None: + with conf_patch, patch.dict(os.environ, {"SCARF_ANALYTICS": env_var}): + assert is_scarf_analytics_enabled() == is_enabled + else: + with conf_patch: + assert is_scarf_analytics_enabled() == is_enabled diff --git a/tests/utils/test_scarf.py b/tests/utils/test_scarf.py new file mode 100644 index 0000000000000..1a287fe0b283a --- /dev/null +++ b/tests/utils/test_scarf.py @@ -0,0 +1,64 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import platform +from unittest import mock + +import pytest + +from airflow import __version__ as airflow_version, settings +from airflow.configuration import conf +from airflow.utils.scarf import scarf_analytics + + +@pytest.mark.parametrize("is_enabled, is_prerelease", [(False, True), (True, True)]) +@mock.patch("httpx.get") +def test_scarf_analytics_disabled(mock_get, is_enabled, is_prerelease): + with mock.patch("airflow.settings.is_scarf_analytics_enabled", return_value=is_enabled), mock.patch( + "airflow.utils.scarf._version_is_prerelease", return_value=is_prerelease + ): + scarf_analytics() + mock_get.assert_not_called() + + +@mock.patch("airflow.settings.is_scarf_analytics_enabled", return_value=True) +@mock.patch("airflow.utils.scarf._version_is_prerelease", return_value=False) +@mock.patch("httpx.get") +def test_scarf_analytics(mock_get, mock_is_scarf_analytics_enabled, mock_version_is_prerelease): + platform_sys = platform.system() + platform_machine = platform.machine() + python_version = platform.python_version() + version_info = settings.engine.dialect.server_version_info + version_info = ".".join(map(str, version_info)) if version_info else "" + database = settings.engine.dialect.name + executor = conf.get("core", "EXECUTOR") + scarf_endpoint = "https://apacheairflow.gateway.scarf.sh/scheduler" + scarf_analytics() + + expected_scarf_url = ( + f"{scarf_endpoint}?version={airflow_version}" + f"&python_version={python_version}" + f"&platform={platform_sys}" + f"&arch={platform_machine}" + f"&database={database}" + f"&db_version={version_info}" + f"&executor={executor}" + ) + + mock_get.assert_called_once_with(expected_scarf_url, timeout=5.0) diff --git a/tests/www/views/test_views_home.py b/tests/www/views/test_views_home.py index 5ddcb65a871f5..d937fbe6eb2ab 100644 --- a/tests/www/views/test_views_home.py +++ b/tests/www/views/test_views_home.py @@ -451,3 +451,17 @@ def test_sorting_home_view(url, lower_key, greater_key, user_client, working_dag lower_index = resp_html.find(lower_key) greater_index = resp_html.find(greater_key) assert lower_index < greater_index + + +@pytest.mark.parametrize("is_enabled, should_have_pixel", [(False, False), (True, True)]) +def test_analytics_pixel(user_client, is_enabled, should_have_pixel): + """ + Test that the analytics pixel is not included when the feature is disabled + """ + with mock.patch("airflow.settings.is_scarf_analytics_enabled", return_value=is_enabled): + resp = user_client.get("home", follow_redirects=True) + + if should_have_pixel: + check_content_in_response("apacheairflow.gateway.scarf.sh", resp) + else: + check_content_not_in_response("apacheairflow.gateway.scarf.sh", resp) From 79db51a7600df205b591eccd1b9a5ff207db998f Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Fri, 10 May 2024 00:14:27 +0100 Subject: [PATCH 3/7] Rename scarf_analytics to telemetry_collection --- airflow/cli/commands/scheduler_command.py | 4 ++-- airflow/config_templates/config.yml | 2 +- airflow/settings.py | 4 ++-- airflow/utils/scarf.py | 2 +- airflow/www/views.py | 2 +- docs/apache-airflow/faq.rst | 2 +- tests/core/test_settings.py | 10 +++++----- tests/utils/test_scarf.py | 6 +++--- tests/www/views/test_views_home.py | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/airflow/cli/commands/scheduler_command.py b/airflow/cli/commands/scheduler_command.py index 4f943e961b454..970c9dbd4ae30 100644 --- a/airflow/cli/commands/scheduler_command.py +++ b/airflow/cli/commands/scheduler_command.py @@ -33,7 +33,7 @@ from airflow.utils import cli as cli_utils from airflow.utils.cli import process_subdir from airflow.utils.providers_configuration_loader import providers_configuration_loaded -from airflow.utils.scarf import scarf_analytics +from airflow.utils.scarf import telemetry_collection from airflow.utils.scheduler_health import serve_health_check log = logging.getLogger(__name__) @@ -56,7 +56,7 @@ def scheduler(args: Namespace): """Start Airflow Scheduler.""" print(settings.HEADER) - scarf_analytics() + telemetry_collection() run_command_with_daemon_option( args=args, diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index c1cd10e85068c..49f945f4e11e7 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -2591,7 +2591,7 @@ sensors: type: float example: ~ default: "604800" -scarf_analytics: +telemetry_collection: description: | Airflow integrates `Scarf `__ to collect basic telemetry data during operation. This data assists Airflow maintainers in better understanding how Airflow is used. diff --git a/airflow/settings.py b/airflow/settings.py index 868dba930b488..288ddc70f1397 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -576,9 +576,9 @@ def initialize(): atexit.register(dispose_orm) -def is_scarf_analytics_enabled() -> bool: +def is_telemetry_collection_enabled() -> bool: """Check if scarf analytics is enabled.""" - return conf.getboolean("scarf_analytics", "enabled", fallback=True) and ( + return conf.getboolean("telemetry_collection", "enabled", fallback=True) and ( os.getenv("SCARF_ANALYTICS") != "false" ) diff --git a/airflow/utils/scarf.py b/airflow/utils/scarf.py index 4f29be1594c72..e15f4ce479a88 100644 --- a/airflow/utils/scarf.py +++ b/airflow/utils/scarf.py @@ -27,7 +27,7 @@ def scarf_analytics(): - if not settings.is_scarf_analytics_enabled(): + if not settings.is_telemetry_collection_enabled(): return # Exclude pre-releases and dev versions diff --git a/airflow/www/views.py b/airflow/www/views.py index 8b6161582502d..23a50bbb63598 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -1036,7 +1036,7 @@ def _iter_parsed_moved_data_table_names(): ) scarf_url = "" - if settings.is_scarf_analytics_enabled(): + if settings.is_telemetry_collection_enabled(): scarf_domain = "https://apacheairflow.gateway.scarf.sh" python_version = platform.python_version() diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 37e48421b84b1..3c72e9c6c08ab 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -533,7 +533,7 @@ This data assists Airflow maintainers in better understanding how Airflow is use Insights gained from this telemetry are critical for prioritizing patches, minor releases, and security fixes. Additionally, this information supports key decisions related to the development roadmap. -Deployments can opt-out of analytics by setting the :ref:`[scarf_analytics] enabled ` +Deployments can opt-out of analytics by setting the :ref:`[telemetry_collection] enabled ` option to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. Individual users can easily opt-out of analytics in various ways documented in the `Scarf Do Not Track docs `__. diff --git a/tests/core/test_settings.py b/tests/core/test_settings.py index 3e58fe60ed4a3..8d0d5600f5566 100644 --- a/tests/core/test_settings.py +++ b/tests/core/test_settings.py @@ -28,7 +28,7 @@ from airflow.api_internal.internal_api_call import InternalApiConfig from airflow.exceptions import AirflowClusterPolicyViolation, AirflowConfigException -from airflow.settings import _ENABLE_AIP_44, TracebackSession, is_scarf_analytics_enabled +from airflow.settings import _ENABLE_AIP_44, TracebackSession, is_telemetry_collection_enabled from airflow.utils.session import create_session from tests.test_utils.config import conf_vars @@ -337,12 +337,12 @@ def test_create_session_ctx_mgr_no_call_methods(mock_new, clear_internal_api): (None, "False", False), # Default env, conf disables ], ) -def test_scarf_analytics_disabled(env_var, conf_setting, is_enabled): - conf_patch = conf_vars({("scarf_analytics", "enabled"): conf_setting}) +def test_telemetry_collection_disabled(env_var, conf_setting, is_enabled): + conf_patch = conf_vars({("telemetry_collection", "enabled"): conf_setting}) if env_var is not None: with conf_patch, patch.dict(os.environ, {"SCARF_ANALYTICS": env_var}): - assert is_scarf_analytics_enabled() == is_enabled + assert is_telemetry_collection_enabled() == is_enabled else: with conf_patch: - assert is_scarf_analytics_enabled() == is_enabled + assert is_telemetry_collection_enabled() == is_enabled diff --git a/tests/utils/test_scarf.py b/tests/utils/test_scarf.py index 1a287fe0b283a..4788354eecaf2 100644 --- a/tests/utils/test_scarf.py +++ b/tests/utils/test_scarf.py @@ -30,17 +30,17 @@ @pytest.mark.parametrize("is_enabled, is_prerelease", [(False, True), (True, True)]) @mock.patch("httpx.get") def test_scarf_analytics_disabled(mock_get, is_enabled, is_prerelease): - with mock.patch("airflow.settings.is_scarf_analytics_enabled", return_value=is_enabled), mock.patch( + with mock.patch("airflow.settings.is_telemetry_collection_enabled", return_value=is_enabled), mock.patch( "airflow.utils.scarf._version_is_prerelease", return_value=is_prerelease ): scarf_analytics() mock_get.assert_not_called() -@mock.patch("airflow.settings.is_scarf_analytics_enabled", return_value=True) +@mock.patch("airflow.settings.is_telemetry_collection_enabled", return_value=True) @mock.patch("airflow.utils.scarf._version_is_prerelease", return_value=False) @mock.patch("httpx.get") -def test_scarf_analytics(mock_get, mock_is_scarf_analytics_enabled, mock_version_is_prerelease): +def test_scarf_analytics(mock_get, mock_is_telemetry_collection_enabled, mock_version_is_prerelease): platform_sys = platform.system() platform_machine = platform.machine() python_version = platform.python_version() diff --git a/tests/www/views/test_views_home.py b/tests/www/views/test_views_home.py index d937fbe6eb2ab..52011c96cf11c 100644 --- a/tests/www/views/test_views_home.py +++ b/tests/www/views/test_views_home.py @@ -458,7 +458,7 @@ def test_analytics_pixel(user_client, is_enabled, should_have_pixel): """ Test that the analytics pixel is not included when the feature is disabled """ - with mock.patch("airflow.settings.is_scarf_analytics_enabled", return_value=is_enabled): + with mock.patch("airflow.settings.is_telemetry_collection_enabled", return_value=is_enabled): resp = user_client.get("home", follow_redirects=True) if should_have_pixel: From 5eb214f1bb9a5c0ef03b454fb70408ea3b900678 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Fri, 10 May 2024 00:53:38 +0100 Subject: [PATCH 4/7] Refactor to remove redundancy --- airflow/cli/commands/scheduler_command.py | 4 +- airflow/utils/scarf.py | 41 ++++++++++++++---- airflow/www/views.py | 53 ++++++++++++----------- tests/utils/test_scarf.py | 16 ++++++- tests/www/views/test_views.py | 30 +++++++++++++ 5 files changed, 107 insertions(+), 37 deletions(-) diff --git a/airflow/cli/commands/scheduler_command.py b/airflow/cli/commands/scheduler_command.py index 970c9dbd4ae30..4f943e961b454 100644 --- a/airflow/cli/commands/scheduler_command.py +++ b/airflow/cli/commands/scheduler_command.py @@ -33,7 +33,7 @@ from airflow.utils import cli as cli_utils from airflow.utils.cli import process_subdir from airflow.utils.providers_configuration_loader import providers_configuration_loaded -from airflow.utils.scarf import telemetry_collection +from airflow.utils.scarf import scarf_analytics from airflow.utils.scheduler_health import serve_health_check log = logging.getLogger(__name__) @@ -56,7 +56,7 @@ def scheduler(args: Namespace): """Start Airflow Scheduler.""" print(settings.HEADER) - telemetry_collection() + scarf_analytics() run_command_with_daemon_option( args=args, diff --git a/airflow/utils/scarf.py b/airflow/utils/scarf.py index e15f4ce479a88..bd9682a08114c 100644 --- a/airflow/utils/scarf.py +++ b/airflow/utils/scarf.py @@ -36,20 +36,21 @@ def scarf_analytics(): scarf_domain = "https://apacheairflow.gateway.scarf.sh/scheduler" - db_version = settings.engine.dialect.server_version_info - if db_version: - # Example: (1, 2, 3) -> "1.2.3" - db_version = ".".join(map(str, db_version)) + platform_sys, arch = get_platform_info() + db_version = get_database_version() + db_name = get_database_name() + executor = get_executor() + python_version = get_python_version() try: scarf_url = ( f"{scarf_domain}?version={airflow_version}" - f"&python_version={platform.python_version()}" - f"&platform={platform.system()}" - f"&arch={platform.machine()}" - f"&database={settings.engine.dialect.name}" + f"&python_version={python_version}" + f"&platform={platform_sys}" + f"&arch={arch}" + f"&database={db_name}" f"&db_version={db_version}" - f"&executor={conf.get('core', 'EXECUTOR')}" + f"&executor={executor}" ) httpx.get(scarf_url, timeout=5.0) @@ -59,3 +60,25 @@ def scarf_analytics(): def _version_is_prerelease(version: str) -> bool: return parse(version).is_prerelease + + +def get_platform_info() -> tuple[str, str]: + return platform.system(), platform.machine() + + +def get_database_version() -> str: + version_info = settings.engine.dialect.server_version_info + # Example: (1, 2, 3) -> "1.2.3" + return ".".join(map(str, version_info)) if version_info else "None" + + +def get_database_name() -> str: + return settings.engine.dialect.name + + +def get_executor() -> str: + return conf.get("core", "EXECUTOR") + + +def get_python_version() -> str: + return platform.python_version() diff --git a/airflow/www/views.py b/airflow/www/views.py index 23a50bbb63598..d5e2090187261 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -27,7 +27,6 @@ import math import operator import os -import platform import sys import traceback import warnings @@ -118,7 +117,7 @@ from airflow.timetables._cron import CronMixin from airflow.timetables.base import DataInterval, TimeRestriction from airflow.timetables.simple import ContinuousTimetable -from airflow.utils import json as utils_json, timezone, yaml +from airflow.utils import json as utils_json, scarf, timezone, yaml from airflow.utils.airflow_flask_app import get_airflow_app from airflow.utils.dag_edges import dag_edges from airflow.utils.db import get_query_count @@ -218,6 +217,32 @@ def get_safe_url(url): return redirect_url.geturl() +def build_scarf_url(dags_count: int) -> str: + """Build the URL for the Scarf telemetry collection.""" + if not settings.is_telemetry_collection_enabled(): + return "" + + scarf_domain = "https://apacheairflow.gateway.scarf.sh" + + platform_sys, platform_arch = scarf.get_platform_info() + db_version = scarf.get_database_version() + db_name = scarf.get_database_name() + executor = scarf.get_executor() + python_version = scarf.get_python_version() + + # Path Format: + # /{version}/{python_version}/{platform}/{arch}/{database}/{db_version}/{executor}/{num_dags} + # + # This path redirects to a Pixel tracking URL + scarf_url = ( + f"{scarf_domain}/webserver" + f"/{version}/{python_version}" + f"/{platform_sys}/{platform_arch}/{db_name}/{db_version}/{executor}/{dags_count}" + ) + + return scarf_url + + def get_date_time_num_runs_dag_runs_form_data(www_request, session, dag): """Get Execution Data, Base Date & Number of runs from a Request.""" date_time = www_request.args.get("execution_date") @@ -1035,29 +1060,7 @@ def _iter_parsed_moved_data_table_names(): "warning", ) - scarf_url = "" - if settings.is_telemetry_collection_enabled(): - scarf_domain = "https://apacheairflow.gateway.scarf.sh" - - python_version = platform.python_version() - platform_sys = platform.system() - platform_arch = platform.machine() - db_name = settings.engine.dialect.name - db_version = settings.engine.dialect.server_version_info - if db_version: - # Example: (1, 2, 3) -> "1.2.3" - db_version = ".".join(map(str, db_version)) - executor = conf.get("core", "EXECUTOR") - - # Path Format: - # /{version}/{python_version}/{platform}/{arch}/{database}/{db_version}/{executor}/{num_dags} - # - # This path redirects to a Pixel tracking URL - scarf_url = ( - f"{scarf_domain}/webserver" - f"/{version}/{python_version}" - f"/{platform_sys}/{platform_arch}/{db_name}/{db_version}/{executor}/{all_dags_count}" - ) + scarf_url = build_scarf_url(dags_count=all_dags_count) return self.render_template( "airflow/dags.html", diff --git a/tests/utils/test_scarf.py b/tests/utils/test_scarf.py index 4788354eecaf2..8bccc16d69623 100644 --- a/tests/utils/test_scarf.py +++ b/tests/utils/test_scarf.py @@ -24,7 +24,7 @@ from airflow import __version__ as airflow_version, settings from airflow.configuration import conf -from airflow.utils.scarf import scarf_analytics +from airflow.utils.scarf import get_database_version, scarf_analytics @pytest.mark.parametrize("is_enabled, is_prerelease", [(False, True), (True, True)]) @@ -62,3 +62,17 @@ def test_scarf_analytics(mock_get, mock_is_telemetry_collection_enabled, mock_ve ) mock_get.assert_called_once_with(expected_scarf_url, timeout=5.0) + + +@pytest.mark.parametrize( + "version_info, expected_version", + [ + ((1, 2, 3), "1.2.3"), # Normal version tuple + (None, "None"), # No version info available + ((1,), "1"), # Single element version tuple + ((1, 2, 3, "beta", 4), "1.2.3.beta.4"), # Complex version tuple with strings + ], +) +def test_get_database_version(version_info, expected_version): + with mock.patch("airflow.settings.engine.dialect.server_version_info", new=version_info): + assert get_database_version() == expected_version diff --git a/tests/www/views/test_views.py b/tests/www/views/test_views.py index 27f096403f05d..527e3ff5e4550 100644 --- a/tests/www/views/test_views.py +++ b/tests/www/views/test_views.py @@ -20,9 +20,11 @@ import os import re from unittest import mock +from unittest.mock import patch import pytest +from airflow import __version__ as airflow_version from airflow.configuration import ( initialize_config, write_default_airflow_configuration_if_needed, @@ -31,6 +33,7 @@ from airflow.plugins_manager import AirflowPlugin, EntryPointSource from airflow.utils.task_group import TaskGroup from airflow.www.views import ( + build_scarf_url, get_key_paths, get_safe_url, get_task_stats_from_query, @@ -525,3 +528,30 @@ def test_invalid_dates(app, admin_client, url, content): assert resp.status_code == 400 assert re.search(content, resp.get_data().decode()) + + +@pytest.mark.parametrize("enabled, dags_count", [(False, 5), (True, 5)]) +@patch("airflow.utils.scarf.get_platform_info", return_value=("Linux", "x86_64")) +@patch("airflow.utils.scarf.get_database_version", return_value="12.3") +@patch("airflow.utils.scarf.get_database_name", return_value="postgres") +@patch("airflow.utils.scarf.get_executor", return_value="SequentialExecutor") +@patch("airflow.utils.scarf.get_python_version", return_value="3.8.5") +def test_build_scarf_url( + get_platform_info, + get_database_version, + get_database_name, + get_executor, + get_python_version, + enabled, + dags_count, +): + with patch("airflow.settings.is_telemetry_collection_enabled", return_value=enabled): + result = build_scarf_url(dags_count) + expected_url = ( + "https://apacheairflow.gateway.scarf.sh/webserver/" + f"{airflow_version}/3.8.5/Linux/x86_64/postgres/12.3/SequentialExecutor/5" + ) + if enabled: + assert result == expected_url + else: + assert result == "" From 5b9072b483af9c6b169060782c13ddde893cb03f Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Fri, 10 May 2024 13:21:57 +0100 Subject: [PATCH 5/7] Fix tests --- airflow/config_templates/config.yml | 2 +- airflow/utils/scarf.py | 17 +++++++++++------ airflow/www/views.py | 5 ++++- docs/apache-airflow/faq.rst | 2 +- tests/utils/test_scarf.py | 20 +++++++++++++------- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index 49f945f4e11e7..edfe56b45cee6 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -2596,7 +2596,7 @@ telemetry_collection: Airflow integrates `Scarf `__ to collect basic telemetry data during operation. This data assists Airflow maintainers in better understanding how Airflow is used. Insights gained from this telemetry are critical for prioritizing patches, minor releases, and - security fixes. Additionally, this information supports key decisions related to the development roadmap. + security fixes. Additionally, this information supports key decisions related to the development road map. Check the FAQ doc for more information on what data is collected. Deployments can opt-out of analytics by setting the ``enabled`` option diff --git a/airflow/utils/scarf.py b/airflow/utils/scarf.py index bd9682a08114c..556a9aa176a8b 100644 --- a/airflow/utils/scarf.py +++ b/airflow/utils/scarf.py @@ -36,13 +36,13 @@ def scarf_analytics(): scarf_domain = "https://apacheairflow.gateway.scarf.sh/scheduler" - platform_sys, arch = get_platform_info() - db_version = get_database_version() - db_name = get_database_name() - executor = get_executor() - python_version = get_python_version() - try: + platform_sys, arch = get_platform_info() + db_version = get_database_version() + db_name = get_database_name() + executor = get_executor() + python_version = get_python_version() + scarf_url = ( f"{scarf_domain}?version={airflow_version}" f"&python_version={python_version}" @@ -67,12 +67,17 @@ def get_platform_info() -> tuple[str, str]: def get_database_version() -> str: + if settings.engine is None: + return "None" + version_info = settings.engine.dialect.server_version_info # Example: (1, 2, 3) -> "1.2.3" return ".".join(map(str, version_info)) if version_info else "None" def get_database_name() -> str: + if settings.engine is None: + return "None" return settings.engine.dialect.name diff --git a/airflow/www/views.py b/airflow/www/views.py index d5e2090187261..606d48e99c2be 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -1060,7 +1060,10 @@ def _iter_parsed_moved_data_table_names(): "warning", ) - scarf_url = build_scarf_url(dags_count=all_dags_count) + try: + scarf_url = build_scarf_url(dags_count=all_dags_count) + except Exception: + scarf_url = "" return self.render_template( "airflow/dags.html", diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 3c72e9c6c08ab..80c2ea3719600 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -531,7 +531,7 @@ Does Airflow collect any telemetry data? Airflow integrates `Scarf `__ to collect basic telemetry data during operation. This data assists Airflow maintainers in better understanding how Airflow is used. Insights gained from this telemetry are critical for prioritizing patches, minor releases, and -security fixes. Additionally, this information supports key decisions related to the development roadmap. +security fixes. Additionally, this information supports key decisions related to the development road map. Deployments can opt-out of analytics by setting the :ref:`[telemetry_collection] enabled ` option to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. diff --git a/tests/utils/test_scarf.py b/tests/utils/test_scarf.py index 8bccc16d69623..507ce0357b8d5 100644 --- a/tests/utils/test_scarf.py +++ b/tests/utils/test_scarf.py @@ -22,7 +22,7 @@ import pytest -from airflow import __version__ as airflow_version, settings +from airflow import __version__ as airflow_version from airflow.configuration import conf from airflow.utils.scarf import get_database_version, scarf_analytics @@ -39,14 +39,19 @@ def test_scarf_analytics_disabled(mock_get, is_enabled, is_prerelease): @mock.patch("airflow.settings.is_telemetry_collection_enabled", return_value=True) @mock.patch("airflow.utils.scarf._version_is_prerelease", return_value=False) +@mock.patch("airflow.utils.scarf.get_database_version", return_value="12.3") +@mock.patch("airflow.utils.scarf.get_database_name", return_value="postgres") @mock.patch("httpx.get") -def test_scarf_analytics(mock_get, mock_is_telemetry_collection_enabled, mock_version_is_prerelease): +def test_scarf_analytics( + mock_get, + mock_is_telemetry_collection_enabled, + mock_version_is_prerelease, + get_database_version, + get_database_name, +): platform_sys = platform.system() platform_machine = platform.machine() python_version = platform.python_version() - version_info = settings.engine.dialect.server_version_info - version_info = ".".join(map(str, version_info)) if version_info else "" - database = settings.engine.dialect.name executor = conf.get("core", "EXECUTOR") scarf_endpoint = "https://apacheairflow.gateway.scarf.sh/scheduler" scarf_analytics() @@ -56,14 +61,15 @@ def test_scarf_analytics(mock_get, mock_is_telemetry_collection_enabled, mock_ve f"&python_version={python_version}" f"&platform={platform_sys}" f"&arch={platform_machine}" - f"&database={database}" - f"&db_version={version_info}" + f"&database=postgres" + f"&db_version=12.3" f"&executor={executor}" ) mock_get.assert_called_once_with(expected_scarf_url, timeout=5.0) +@pytest.mark.db_test @pytest.mark.parametrize( "version_info, expected_version", [ From ecb3543a034e9780c90e000f23e13f6e9823aee5 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Thu, 16 May 2024 17:31:32 +0100 Subject: [PATCH 6/7] fixup! Fix tests --- airflow/settings.py | 2 +- docs/apache-airflow/faq.rst | 2 +- tests/core/test_settings.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/airflow/settings.py b/airflow/settings.py index 288ddc70f1397..176d06270eb97 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -579,7 +579,7 @@ def initialize(): def is_telemetry_collection_enabled() -> bool: """Check if scarf analytics is enabled.""" return conf.getboolean("telemetry_collection", "enabled", fallback=True) and ( - os.getenv("SCARF_ANALYTICS") != "false" + os.getenv("SCARF_ANALYTICS", "").strip().lower() != "false" ) diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 80c2ea3719600..31ec98b9ff90d 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -544,5 +544,5 @@ The telemetry data collected is limited to the following: - Python version - Operating system & machine architecture - Executor -- Metadata DB type & their version +- Metadata DB type & its version - Number of DAGs diff --git a/tests/core/test_settings.py b/tests/core/test_settings.py index 8d0d5600f5566..c2b4938421667 100644 --- a/tests/core/test_settings.py +++ b/tests/core/test_settings.py @@ -331,6 +331,7 @@ def test_create_session_ctx_mgr_no_call_methods(mock_new, clear_internal_api): [ ("false", "True", False), # env forces disable ("false", "False", False), # Both force disable + ("False ", "False", False), # Both force disable ("true", "True", True), # Both enable ("true", "False", False), # Conf forces disable (None, "True", True), # Default env, conf enables From 982827f725b980586c11facec595ee88da1a4e23 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Thu, 16 May 2024 18:27:15 +0100 Subject: [PATCH 7/7] fixup! fixup! Fix tests --- airflow/utils/scarf.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/airflow/utils/scarf.py b/airflow/utils/scarf.py index 556a9aa176a8b..ec19480ee78c7 100644 --- a/airflow/utils/scarf.py +++ b/airflow/utils/scarf.py @@ -18,6 +18,7 @@ from __future__ import annotations import platform +from urllib.parse import urlencode import httpx from packaging.version import parse @@ -38,20 +39,19 @@ def scarf_analytics(): try: platform_sys, arch = get_platform_info() - db_version = get_database_version() - db_name = get_database_name() - executor = get_executor() - python_version = get_python_version() - - scarf_url = ( - f"{scarf_domain}?version={airflow_version}" - f"&python_version={python_version}" - f"&platform={platform_sys}" - f"&arch={arch}" - f"&database={db_name}" - f"&db_version={db_version}" - f"&executor={executor}" - ) + + params = { + "version": airflow_version, + "python_version": get_python_version(), + "platform": platform_sys, + "arch": arch, + "database": get_database_name(), + "db_version": get_database_version(), + "executor": get_executor(), + } + + query_string = urlencode(params) + scarf_url = f"{scarf_domain}?{query_string}" httpx.get(scarf_url, timeout=5.0) except Exception: