Skip to content

Commit

Permalink
volumes in progress...
Browse files Browse the repository at this point in the history
  • Loading branch information
shadeofblue committed Apr 4, 2024
1 parent 1f342a6 commit 668115f
Show file tree
Hide file tree
Showing 11 changed files with 34 additions and 18 deletions.
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ RUN apt-get update && apt-get install -y \

RUN echo "UseDNS no" >> /etc/ssh/sshd_config && \
echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \
echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config
echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \
echo "StrictModes no" >> /etc/ssh/sshd_config

RUN pip install -U pip

Expand All @@ -35,4 +36,7 @@ RUN pip install pillow

COPY ray_on_golem /app/ray_on_golem/

RUN rm -r /root/.cache
RUN mv /root /root_copy

VOLUME /root
8 changes: 7 additions & 1 deletion examples/GPU.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ RUN apt-get update && apt-get install -y \

RUN echo "UseDNS no" >> /etc/ssh/sshd_config && \
echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \
echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config
echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \
echo "StrictModes no" >> /etc/ssh/sshd_config

RUN wget -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh
RUN bash miniconda.sh -b -u
Expand All @@ -41,3 +42,8 @@ RUN pip install numpy numba
RUN pip config set global.index-url https://pypi.dev.golem.network/simple

COPY ray_on_golem /app/ray_on_golem/

RUN rm -r /root/.cache
RUN mv /root /root_copy

VOLUME /root
2 changes: 1 addition & 1 deletion examples/dds-with-ray.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ provider:
node_config:
# Parameters for golem demands (same for head and workers)
demand:
image_tag: "golem/ray-on-golem:0.9.0-py3.10.13-ray2.9.3"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"

# List of URLs which will be added to the Computation Manifest
# Requires protocol to be defined in all URLs
Expand Down
2 changes: 1 addition & 1 deletion examples/mandelbrot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ provider:
demand:
# If not provided, image_tag will be autodetected based on the currently used Python and Ray versions
# check available versions at https://registry.golem.network/explore/golem/ray-on-golem
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240402-1-py3.10.13-ray2.9.3"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"

# List of URLs which will be added to the Computation Manifest
# Requires protocol to be defined in all URLs
Expand Down
2 changes: 1 addition & 1 deletion examples/outbound.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ provider:
# Parameters for golem demands (same for head and workers)
demand:
# Check available versions at https://registry.golem.network/explore/golem/ray-on-golem
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240402-1-py3.10.13-ray2.9.3"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"

# List of URLs which will be added to the Computation Manifest
# Requires protocol to be defined in all URLs
Expand Down
2 changes: 1 addition & 1 deletion examples/outbound_tester.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ provider:
# Parameters for golem demands (same for head and workers)
demand:
# Check available versions at https://registry.golem.network/explore/golem/ray-on-golem
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240402-1-py3.10.13-ray2.9.3"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"

# List of URLs which will be added to the Computation Manifest
# Requires protocol to be defined in all URLs
Expand Down
2 changes: 1 addition & 1 deletion golem-cluster.full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ provider:
# Parameters for golem demands (same for head and workers)
demand:
# Check available versions at https://registry.golem.network/explore/golem/ray-on-golem
image_tag: "golem/ray-on-golem:0.8.0-py3.10.13-ray2.9.2"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"

# List of URLs which will be added to the Computation Manifest
# Requires protocol to be defined in all URLs
Expand Down
2 changes: 1 addition & 1 deletion golem-cluster.mini.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ provider:
node_config:
# Parameters for golem demands (same for head and workers)
demand:
image_tag: "golem/ray-on-golem:0.9.0-py3.10.13-ray2.9.3"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"

# Tells the autoscaler the allowed node types and the resources they provide
available_node_types:
Expand Down
2 changes: 1 addition & 1 deletion golem-cluster.override.2-image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ provider:
parameters:
node_config:
demand:
image_tag: "blueshade/raytest:0.10.0-dev-py3.10.13-ray2.9.3"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"
2 changes: 1 addition & 1 deletion golem-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ provider:
# Parameters for golem demands (same for head and workers)
demand:
# Check available versions at https://registry.golem.network/explore/golem/ray-on-golem
image_tag: "blueshade/raytest:0.10.0-dev-py3.10.13-ray2.9.3"
image_tag: "blueshade/ray-on-golem:0.10.0-dev-240404-5-py3.10.13-ray2.9.3"

# List of URLs which will be added to the Computation Manifest
# Requires protocol to be defined in all URLs
Expand Down
22 changes: 14 additions & 8 deletions ray_on_golem/server/services/golem/golem.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,16 +291,22 @@ async def _upload_node_configuration(
ip: str,
ssh_public_key_data: str,
):
async def run_command(cmd: str):
result = await context.run(cmd)
await result.wait()
logger.debug("Command executed: %s: %s",cmd, [e.to_dict() for e in result.events])

provider_desc = await self.get_provider_desc(context.activity)
logger.info(f"Running initial commands on {provider_desc}, {ip=}, {context.activity=}")
hostname = ip.replace(".", "-")
await context.run("echo 'ON_GOLEM_NETWORK=1' >> /etc/environment")
await context.run(f"echo 'NODE_IP={ip}' >> /etc/environment")
await context.run(f"hostname '{hostname}'")
await context.run(f"echo '{hostname}' > /etc/hostname")
await context.run(f"echo '{ip} {hostname}' >> /etc/hosts")
await context.run("mkdir -p /root/.ssh")
await context.run(f'echo "{ssh_public_key_data}" >> /root/.ssh/authorized_keys')
await run_command("echo 'ON_GOLEM_NETWORK=1' >> /etc/environment")
await run_command(f"echo 'NODE_IP={ip}' >> /etc/environment")
await run_command(f"hostname '{hostname}'")
await run_command(f"echo '{hostname}' > /etc/hostname")
await run_command(f"echo '{ip} {hostname}' >> /etc/hosts")
await run_command("mv /root_copy/.bashrc /root_copy/.profile /root 2> /dev/null")
await run_command("mkdir -p /root/.ssh")
await run_command(f'echo "{ssh_public_key_data}" >> /root/.ssh/authorized_keys')

async def _start_ssh_server(self, context: WorkContext, ip: str):
provider_desc = await self.get_provider_desc(context.activity)
Expand All @@ -313,7 +319,7 @@ async def _restart_ssh_server(self, context: WorkContext, ip: str):
try:
await context.run("service ssh restart", timeout=120)
except Exception:
msg = f"Failed to restart SSH server {provider_desc}, {ip=}, {context.activity=}"
msg = f"Failed to restart the SSH server {provider_desc}, {ip=}, {context.activity=}"
logger.warning(msg)
logger.debug(msg, exc_info=True)
else:
Expand Down

0 comments on commit 668115f

Please sign in to comment.