diff --git a/.github/workflows/all-checks.yml b/.github/workflows/all-checks.yml
index 57e98962..16f4d446 100644
--- a/.github/workflows/all-checks.yml
+++ b/.github/workflows/all-checks.yml
@@ -38,16 +38,18 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install test requirements
         run: make install-test-requirements
-      - uses: actions/setup-java@v1
-        with:
-          java-version: '11'
-      - uses: vemonet/setup-spark@v1
-        with:
-          spark-version: '3.4.1'
-          hadoop-version: '3'
+      # - name: Setup Java
+      #   uses: actions/setup-java@v1
+      #   with:
+      #     java-version: '11'
+      # - name: Setup Spark
+      #   uses: vemonet/setup-spark@v1
+      #   with:
+      #     spark-version: '3.4.1'
+      #     hadoop-version: '3'
       - name: Run `kedro run` end to end tests for all starters
         run: |
-          behave features/run.feature
+          behave features/run.feature --tags=pyspark
 
   lint:
     strategy:
diff --git a/.github/workflows/run-test.yml b/.github/workflows/run-test.yml
new file mode 100644
index 00000000..e69de29b
diff --git a/features/run.feature b/features/run.feature
index 3c9e66dc..83fcb266 100644
--- a/features/run.feature
+++ b/features/run.feature
@@ -23,10 +23,12 @@ Feature: Run all starters
     When I execute the CLI command to list Kedro pipelines
     Then I should get a successful exit code
 
+  @pyspark
   Scenario: Run a Kedro project created from pyspark-iris
     Given I have prepared a config file
     And I have run a non-interactive kedro new with the starter pyspark-iris
     And I have installed the Kedro project's dependencies
+    And I have set up the Hadoop binary
     When I run the Kedro pipeline
     Then I should get a successful exit code
 
@@ -44,16 +46,20 @@ Feature: Run all starters
     When I run the Kedro pipeline
     Then I should get a successful exit code
 
+  @pyspark
   Scenario: Run a Kedro project created from spaceflights-pyspark
     Given I have prepared a config file
     And I have run a non-interactive kedro new with the starter spaceflights-pyspark
     And I have installed the Kedro project's dependencies
+    And I have set up the Hadoop binary
     When I run the Kedro pipeline
     Then I should get a successful exit code
 
+  @pyspark
   Scenario: Run a Kedro project created from spaceflights-pyspark-viz
     Given I have prepared a config file
     And I have run a non-interactive kedro new with the starter spaceflights-pyspark-viz
     And I have installed the Kedro project's dependencies
+    And I have set up the Hadoop binary
     When I run the Kedro pipeline
     Then I should get a successful exit code
diff --git a/features/steps/run_steps.py b/features/steps/run_steps.py
index bb39fa4e..90167aab 100644
--- a/features/steps/run_steps.py
+++ b/features/steps/run_steps.py
@@ -1,6 +1,9 @@
+import os
+import platform
 import subprocess
 
+import requests
 import yaml
 from behave import given, then, when
 
@@ -41,6 +44,7 @@ def create_project_from_config_file(context, starter_name):
             context.starters_paths[starter_name],
         ]
     )
+    assert res.returncode == OK_EXIT_CODE
 
     # prevent telemetry from prompting for input during e2e tests
     telemetry_file = context.root_project_dir / ".telemetry"
@@ -55,6 +59,46 @@ def install_project_dependencies(context):
     )
     assert res.returncode == OK_EXIT_CODE
 
+
+@given("I have set up the Hadoop binary")
+def setup_hadoop(context):
+    """Download the Hadoop binaries PySpark needs to run on Windows."""
+    if platform.system() != "Windows":
+        return
+
+    # URLs of the Hadoop binaries to download
+    winutils_url = "https://github.com/steveloughran/winutils/raw/master/hadoop-2.7.1/bin/winutils.exe"
+    hadoop_dll_url = "https://github.com/steveloughran/winutils/raw/master/hadoop-2.7.1/bin/hadoop.dll"
+
+    # Local file paths; Spark looks for winutils.exe in %HADOOP_HOME%\bin
+    winutils_local_path = "winutils.exe"
+    hadoop_dll_local_path = "hadoop.dll"
+    hadoop_home = "C:\\hadoop"
+    hadoop_bin_dir = os.path.join(hadoop_home, "bin")
+
+    # Download winutils.exe and hadoop.dll, failing fast on HTTP errors
+    response = requests.get(winutils_url)
+    response.raise_for_status()
+    with open(winutils_local_path, "wb") as file:
+        file.write(response.content)
+
+    response = requests.get(hadoop_dll_url)
+    response.raise_for_status()
+    with open(hadoop_dll_local_path, "wb") as file:
+        file.write(response.content)
+
+    # Move hadoop.dll to C:\Windows\System32
+    os.rename(hadoop_dll_local_path, os.path.join("C:\\Windows\\System32", hadoop_dll_local_path))
+
+    # Create C:\hadoop\bin and move winutils.exe into it
+    os.makedirs(hadoop_bin_dir, exist_ok=True)
+    os.rename(winutils_local_path, os.path.join(hadoop_bin_dir, winutils_local_path))
+
+    # Point HADOOP_HOME at C:\hadoop (not C:\hadoop\bin) for the current
+    # process, and persist it with setx for any later sessions
+    os.environ["HADOOP_HOME"] = hadoop_home
+    os.system(f'setx /M HADOOP_HOME "{hadoop_home}"')
+
 
 @when("I run the Kedro pipeline")
 def run_kedro_pipeline(context):
diff --git a/test_requirements.txt b/test_requirements.txt
index 317b156c..fe0a373a 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -4,3 +4,4 @@ black~=22.0
 PyYAML>=4.2, <7.0
 ruff~=0.0.290
 git+https://github.com/kedro-org/kedro.git@develop#egg=kedro
+requests
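
Note: the setup_hadoop step can be smoke-tested locally before running the tagged scenarios. A minimal sketch of such a check (hypothetical helper, not part of this diff; assumes setup_hadoop has already run on a Windows machine):

    import os
    import platform


    def check_hadoop_setup():
        """Verify winutils.exe sits where Spark will look for it."""
        if platform.system() != "Windows":
            return
        # setup_hadoop sets HADOOP_HOME in the current process via os.environ
        hadoop_home = os.environ.get("HADOOP_HOME")
        assert hadoop_home, "HADOOP_HOME is not set"
        winutils = os.path.join(hadoop_home, "bin", "winutils.exe")
        assert os.path.isfile(winutils), f"winutils.exe not found at {winutils}"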