apache · JiaqiWang18 · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
@@ -1517,6 +1517,7 @@ def __hash__(self):
     source_file_regexes=["python/pyspark/pipelines"],
     python_test_goals=[
         "pyspark.pipelines.tests.test_block_connect_access",
+        "pyspark.pipelines.tests.test_block_imperative_construct",
         "pyspark.pipelines.tests.test_cli",
         "pyspark.pipelines.tests.test_decorators",
         "pyspark.pipelines.tests.test_graph_element_registry",

diff --git a/python/pyspark/errors/error-conditions.json b/python/pyspark/errors/error-conditions.json
@@ -363,6 +363,12 @@
       "Function `<func_name>` should return Column, got <return_type>."
     ]
   },
+  "IMPERATIVE_CONSTRUCT_IN_DECLARATIVE_PIPELINE": {
+    "message": [
+      "Imperative construct <method> is not allowed in declarative pipelines.",
+      "<suggestion>"
+    ]
+  },
   "INCORRECT_CONF_FOR_PROFILE": {
     "message": [
       "`spark.python.profile` or `spark.python.profile.memory` configuration",

diff --git a/python/pyspark/pipelines/block_imperative_construct.py b/python/pyspark/pipelines/block_imperative_construct.py
@@ -0,0 +1,141 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from contextlib import contextmanager
+from typing import Generator, NoReturn, List, Callable
+
+from pyspark.errors import PySparkException
+from pyspark.sql.connect.catalog import Catalog
+from pyspark.sql.connect.conf import RuntimeConf
+from pyspark.sql.connect.dataframe import DataFrame
+from pyspark.sql.connect.udf import UDFRegistration
+
+# PySpark methods that must not run while a Python pipeline definition file is executed.
+# Each entry names the owning class, the method to block, and the suggestion text that is
+# passed along in the error raised when the blocked method is called (may be empty).
+BLOCKED_METHODS: List = [
+    {"class": cls, "method": method, "suggestion": suggestion}
+    for cls, method, suggestion in (
+        # Session configuration
+        (
+            RuntimeConf,
+            "set",
+            "Instead set configuration via the pipeline spec "
+            "or use the 'spark_conf' argument in various decorators",
+        ),
+        # Current catalog / database changes
+        (
+            Catalog,
+            "setCurrentCatalog",
+            "Instead set catalog via the pipeline spec "
+            "or the 'name' argument on the dataset decorators",
+        ),
+        (
+            Catalog,
+            "setCurrentDatabase",
+            "Instead set database via the pipeline spec "
+            "or the 'name' argument on the dataset decorators",
+        ),
+        # Temporary view deletion
+        (
+            Catalog,
+            "dropTempView",
+            "Instead remove the temporary view definition directly",
+        ),
+        (
+            Catalog,
+            "dropGlobalTempView",
+            "Instead remove the temporary view definition directly",
+        ),
+        # Temporary view creation
+        (
+            DataFrame,
+            "createTempView",
+            "Instead use the @temporary_view decorator to define temporary views",
+        ),
+        (
+            DataFrame,
+            "createOrReplaceTempView",
+            "Instead use the @temporary_view decorator to define temporary views",
+        ),
+        (
+            DataFrame,
+            "createGlobalTempView",
+            "Instead use the @temporary_view decorator to define temporary views",
+        ),
+        (
+            DataFrame,
+            "createOrReplaceGlobalTempView",
+            "Instead use the @temporary_view decorator to define temporary views",
+        ),
+        # User-defined function registration (no suggestion text is provided for these)
+        (UDFRegistration, "register", ""),
+        (UDFRegistration, "registerJavaFunction", ""),
+        (UDFRegistration, "registerJavaUDAF", ""),
+    )
+]
+
+
+def _create_blocked_method(error_method_name: str, suggestion: str) -> Callable:
+    """
+    Build a stand-in callable for a method that is blocked in pipeline definition files.
+
+    :param error_method_name: value reported as the ``method`` message parameter.
+    :param suggestion: value reported as the ``suggestion`` message parameter.
+    :return: a callable that accepts any positional/keyword arguments and always raises
+        ``PySparkException`` with error class ``IMPERATIVE_CONSTRUCT_IN_DECLARATIVE_PIPELINE``.
+    """
+
+    def blocked_method(*args: object, **kwargs: object) -> NoReturn:
+        # Arguments are accepted only so the stub can replace any signature; they are ignored.
+        message_parameters = {"method": error_method_name, "suggestion": suggestion}
+        raise PySparkException(
+            errorClass="IMPERATIVE_CONSTRUCT_IN_DECLARATIVE_PIPELINE",
+            messageParameters=message_parameters,
+        )
+
+    return blocked_method
+
+
+@contextmanager
+def block_imperative_construct() -> Generator[None, None, None]:
+    """
+    Context manager that blocks imperative constructs found in a pipeline python definition file.
+
+    While the context is active, every method listed in ``BLOCKED_METHODS`` is replaced on its
+    class by a stub that raises ``PySparkException`` with error class
+    ``IMPERATIVE_CONSTRUCT_IN_DECLARATIVE_PIPELINE``. The original methods are put back on exit,
+    including when the wrapped code raises.
+
+    Blocks:
+        - imperative config set via: spark.conf.set("k", "v")
+        - catalog changes via: spark.catalog.setCurrentCatalog("catalog_name")
+        - database changes via: spark.catalog.setCurrentDatabase("db_name")
+        - temporary view creation/deletion via DataFrame and catalog methods
+        - user-defined functions registration
+    """
+    # Capture every original before patching anything, so a failed attribute lookup
+    # leaves all classes untouched.
+    original_methods = {
+        (method_info["class"], method_info["method"]): getattr(
+            method_info["class"], method_info["method"]
+        )
+        for method_info in BLOCKED_METHODS
+    }
+
+    try:
+        # Replace methods with blocked versions
+        for method_info in BLOCKED_METHODS:
+            cls = method_info["class"]
+            method_name = method_info["method"]
+            # The quoted "Class.method" form is what ends up in the error message.
+            error_method_name = f"'{cls.__name__}.{method_name}'"
+            blocked_method = _create_blocked_method(error_method_name, method_info["suggestion"])
+            setattr(cls, method_name, blocked_method)
+
+        yield
+    finally:
+        # Restore original methods, even if patching or the wrapped code failed part-way.
+        for (cls, method_name), original_method in original_methods.items():
+            setattr(cls, method_name, original_method)
diff --git a/python/pyspark/pipelines/cli.py b/python/pyspark/pipelines/cli.py
@@ -32,6 +32,7 @@
 
 from pyspark.errors import PySparkException, PySparkTypeError
 from pyspark.sql import SparkSession
+from pyspark.pipelines.block_imperative_construct import block_imperative_construct
 from pyspark.pipelines.graph_element_registry import (
     graph_element_registration_context,
     GraphElementRegistry,
@@ -192,7 +193,8 @@ def register_definitions(
                         assert (
                             module_spec.loader is not None
                         ), f"Module spec has no loader for {file}"
-                        module_spec.loader.exec_module(module)
+                        with block_imperative_construct():
+                            module_spec.loader.exec_module(module)
                     elif file.suffix == ".sql":
                         log_with_curr_timestamp(f"Registering SQL file {file}...")
                         with file.open("r") as f: