Skip to content

Commit

Permalink
Merge branch 'develop' into mateusz/pip-in-smokes
Browse files Browse the repository at this point in the history
  • Loading branch information
mateuszsrebrny committed May 21, 2024
2 parents fd76e13 + fe2d3de commit 9b42584
Show file tree
Hide file tree
Showing 37 changed files with 1,924 additions and 1,288 deletions.
37 changes: 29 additions & 8 deletions .github/workflows/integration_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,36 +64,55 @@ jobs:

- name: Prepare `golem-cluster.tests.yaml`
run: |
poetry run python -m utils.apply_overrides -b ${{ matrix.base_yaml_path || './golem-cluster.yaml' }} golem-cluster.override.2-image.yaml golem-cluster.override.3-disable-stats.yaml golem-cluster.override-goth.yaml -o golem-cluster.tests.yaml
cat golem-cluster.tests.yaml
poetry run python -m utils.apply_overrides -b ${{ matrix.base_yaml_path || './golem-cluster.yaml' }} golem-cluster.override.1-source-files.yaml golem-cluster.override.2-image.yaml golem-cluster.override.3-disable-stats.yaml golem-cluster.override-goth.yaml -o golem-cluster.tests.yaml
cat golem-cluster.tests.yaml
free -h
ps -Heo size,command --sort -size
- name: Call `ray up`
env:
PYTHONUNBUFFERED: 1
RUST_LOG: "INFO,ya_erc20_driver::erc20::wallet=debug"
timeout-minutes: 5
run: poetry run ray up golem-cluster.tests.yaml -y --no-config-cache

- name: Run `examples/${{ matrix.example_name }}.py`
run: poetry run ray submit golem-cluster.tests.yaml examples/${{ matrix.example_name }}.py -- ${{ matrix.args }}

- name: Print last cluster logs
timeout-minutes: 5
run: |
poetry run ray exec golem-cluster.tests.yaml 'free -h'
poetry run ray exec golem-cluster.tests.yaml 'ps -Heo size,command --sort -size'
echo "= = = = BEFORE = = = ="
poetry run ray submit golem-cluster.tests.yaml examples/${{ matrix.example_name }}.py -- ${{ matrix.args }}
echo "= = = = AFTER = = = ="
poetry run ray exec golem-cluster.tests.yaml 'free -h'
poetry run ray exec golem-cluster.tests.yaml 'ps -Heo size,command --sort -size'
- name: Collect cluster logs
if: always()
continue-on-error: true
timeout-minutes: 1
timeout-minutes: 2
run: |
poetry run ray exec golem-cluster.tests.yaml 'ls /tmp/ray/session_latest/logs/'
poetry run ray exec golem-cluster.tests.yaml 'tail -n 100 /tmp/ray/session_latest/logs/*'
poetry run ray exec golem-cluster.tests.yaml 'free -h'
poetry run ray exec golem-cluster.tests.yaml 'ps -Heo size,command --sort -size'
poetry run ray rsync-down golem-cluster.tests.yaml /root/mem_usage.log mem_usage.log
poetry run ray exec golem-cluster.tests.yaml 'ray cluster-dump --local --debug-state --processes-verbose -o ray_cluster_dump.tar.gz'
poetry run ray rsync-down golem-cluster.tests.yaml ray_cluster_dump.tar.gz ray_cluster_dump.tar.gz
- name: Call `ray down`
run: poetry run ray down golem-cluster.tests.yaml -y

- name: Call `ray-on-golem stop`
run: poetry run ray-on-golem stop

- name: Check node creation in logs
run: poetry run python tests/webserver_logs_test.py ~/.local/share/ray_on_golem/webserver.log 2

- name: Stop Goth
if: always()
run: ./.github/workflows/stop-goth.sh

- name: Prepare artifact name
if: always()
run: |
echo ARTIFACT_NAME=logs-example-${{ inputs.BRANCH }}-${{ matrix.example_name }} | sed "s|/|_|g" >> $GITHUB_ENV
Expand All @@ -106,3 +125,5 @@ jobs:
/root/.local/share/ray_on_golem/webserver_debug.log
/root/.local/share/ray_on_golem/yagna.log
/tmp/goth-tests
ray_cluster_dump.tar.gz
mem_usage.log
7 changes: 4 additions & 3 deletions .github/workflows/smoke_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ jobs:
run: poetry install --no-ansi

- name: Prepare `golem-cluster.tests.yaml`
run: |
poetry run python -m utils.apply_overrides -o golem-cluster.tests.yaml golem-cluster.override.1-source-files.yaml golem-cluster.override.2-image.yaml golem-cluster.override.3-disable-stats.yaml golem-cluster.override.4-subnet.yaml
cat golem-cluster.tests.yaml
run: poetry run python -m utils.apply_overrides -o golem-cluster.tests.yaml golem-cluster.override.2-image.yaml golem-cluster.override.3-disable-stats.yaml golem-cluster.override.4-subnet.yaml
- name: Call `ray up`
env:
PYTHONUNBUFFERED: 1
Expand All @@ -46,8 +48,7 @@ jobs:
run: poetry run ray-on-golem stop

- name: Check node creation in logs
continue-on-error: true
run: poetry run python tests/smoke.py ~/.local/share/ray_on_golem/webserver_debug.log
run: poetry run python tests/webserver_logs_test.py ~/.local/share/ray_on_golem/webserver.log 2

- name: Collects logs
if: always()
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/start-goth.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ echo CREATING ASSETS
python -m goth create-assets .envs/goth/assets
# disable use-proxy
sed -Ezi 's/("\n.*use\-proxy:\s)(True)/\1False/mg' .envs/goth/assets/goth-config-testing.yml
sed -Ezi 's/("mem_gib":\s+)(1.0)/\11.5/mg' .envs/goth/assets/provider/hardware.json

echo STARTING NETWORK
cat .envs/goth/assets/goth-config-testing.yml
cat .envs/goth/assets/provider/hardware.json
python -m goth start .envs/goth/assets/goth-config-testing.yml &
GOTH_PID=$!
echo "GOTH_PID=$GOTH_PID" | tee -a "$GITHUB_ENV"
Expand Down
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ RUN poetry install --no-interaction --no-ansi --only ray

RUN pip config set global.index-url https://pypi.dev.golem.network/simple
RUN pip install pillow
RUN python -m venv --system-site-packages /root/venv

COPY ray_on_golem /app/ray_on_golem/

Expand Down
22 changes: 14 additions & 8 deletions golem-cluster.full.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# This is Ray on Golem Full yaml
# - the example config for testnet cluster,
# - with all the properties potentially changable by the user
# - with all the properties potentially changeable by the user

# Ray on Golem cluster name
cluster_name: golem-cluster
# This value identifies a given cluster within the ray-on-golem webserver.
# Therefore, it must be unique if you ever wish to run more than one cluster on the same ray-on-golem instance.
cluster_name: "golem-cluster"

# The maximum number of workers the cluster will have at any given time
max_workers: 10
Expand All @@ -20,6 +22,11 @@ provider:
# Port of the ray-on-golem webserver that is connected to the Golem network
webserver_port: 4578

# Port under which Ray's GCS will be exposed on the local machine
# Needs to be changed when running multiple clusters on a single machine
# Can be null to disable exposure
ray_gcs_expose_port: 6379

# Blockchain used for payments.
# `holesky` means running free nodes on testnet,
# `polygon` is for mainnet operations.
Expand Down Expand Up @@ -74,7 +81,7 @@ provider:
max_env_per_hour_price: 0.5

# If not provided, payment will be sent only after the cluster is stopped.
# Required for long running clusters
# Required for long-running clusters
payment_interval_hours:
minimal: 12

Expand All @@ -87,7 +94,7 @@ available_node_types:
# The maximum number of worker nodes of this type to launch
max_workers: 0

# The node type's CPU and GPU resources - leave it empty for autodetection
# The node type's CPU and GPU resources - leave it empty for auto-detection
resources: {}

# Additional parameters specific to this node type, applied on top of provider.parameters.node_config
Expand All @@ -103,7 +110,7 @@ available_node_types:
node_config: {}

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
head_node_type: "ray.head.default"

# The files or directories to copy to the head and worker nodes
# Remote workdir is /root/
Expand Down Expand Up @@ -144,6 +151,5 @@ worker_start_ray_commands: [

# Satisfy checks to disable warning about legacy fields at `ray up`.
# This will be removed by ray-on-golem on the fly.
head_node: True
worker_nodes: True

head_node: true
worker_nodes: true
7 changes: 3 additions & 4 deletions golem-cluster.mini.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# This is Ray on Golem Mini yaml - the smallest config for a testnet cluster of up to 10 nodes


# Ray on Golem cluster name
cluster_name: golem-cluster
cluster_name: "golem-cluster"

# The maximum number of workers the cluster will have at any given time
max_workers: 10
Expand Down Expand Up @@ -67,6 +66,6 @@ worker_start_ray_commands: [

# Satisfy checks to disable warning about legacy fields at `ray up`.
# This will be removed by ray-on-golem on the fly.
head_node: True
worker_nodes: True
head_node: true
worker_nodes: true

6 changes: 5 additions & 1 deletion golem-cluster.override-goth.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ provider:
parameters:
node_config:
subnet_tag: "goth"
priority_head_subnet_tag: "goth"

demand:
outbound_urls: []

available_node_types:
ray.head.default:
node_config:
priority_subnet_tag: "goth"
7 changes: 0 additions & 7 deletions golem-cluster.override.1-source-files.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,8 @@ file_mounts:
# remote_path: local_path
{
"/app/ray_on_golem": "./ray_on_golem",
"/app/golem": "../golem-core-python/golem",
}

rsync_exclude: [
"**/__pycache__",
]

# List of commands that will be run to initialize the nodes (before `setup_commands`)
initialization_commands: [
"mkdir -p $(python -c 'import site; print(site.getsitepackages()[0])')/golem",
"cp -fR /app/golem/* $(python -c 'import site; print(site.getsitepackages()[0])')/golem"
]
2 changes: 1 addition & 1 deletion golem-cluster.override.2-image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ provider:
parameters:
node_config:
demand:
image_tag: "blueshade/ray-on-golem:0.10.0-py3.10.13-ray2.9.3"
image_tag: "approxit/ray-on-golem:0.10.0-py3.10.13-ray2.9.3"
6 changes: 5 additions & 1 deletion golem-cluster.override.4-subnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@ provider:
parameters:
node_config:
subnet_tag: "public"
priority_head_subnet_tag: "sdk"

available_node_types:
ray.head.default:
node_config:
priority_subnet_tag: "sdk"
7 changes: 3 additions & 4 deletions golem-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# - with properties allowing fine-tuning

# Ray on Golem cluster name
cluster_name: golem-cluster
cluster_name: "golem-cluster"

# The maximum number of workers the cluster will have at any given time
max_workers: 10
Expand All @@ -18,7 +18,6 @@ provider:
use_internal_ips: true
module: "ray_on_golem.provider.node_provider.GolemNodeProvider"
parameters:

# Blockchain used for payments.
# `holesky` means running free nodes on testnet,
# `polygon` is for mainnet operations.
Expand Down Expand Up @@ -136,6 +135,6 @@ worker_start_ray_commands: [

# Satisfy checks to disable warning about legacy fields at `ray up`.
# This will be removed by ray-on-golem on the fly.
head_node: True
worker_nodes: True
head_node: true
worker_nodes: true

Loading

0 comments on commit 9b42584

Please sign in to comment.