Skip to content

Commit

Permalink
small fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
approxit committed Apr 18, 2024
1 parent 3521c6f commit b1f1208
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 7 deletions.
1 change: 1 addition & 0 deletions .github/workflows/integration_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ jobs:
run: ./.github/workflows/stop-goth.sh

- name: Prepare artifact name
if: always()
run: |
echo ARTIFACT_NAME=logs-example-${{ inputs.BRANCH }}-${{ matrix.example_name }} | sed "s|/|_|g" >> $GITHUB_ENV
Expand Down
7 changes: 3 additions & 4 deletions golem-cluster.full.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This is Ray on Golem Full yaml
# - the example config for testnet cluster,
# - with all the properties potentially changable by the user
# - with all the properties potentially changeable by the user

# Ray on Golem cluster name
cluster_name: golem-cluster
Expand Down Expand Up @@ -74,7 +74,7 @@ provider:
max_env_per_hour_price: 0.5

# If not provided, payment will be sent only after the cluster is stopped.
# Required for long running clusters
# Required for long-running clusters
payment_interval_hours:
minimal: 12

Expand All @@ -87,7 +87,7 @@ available_node_types:
# The maximum number of worker nodes of this type to launch
max_workers: 0

# The node type's CPU and GPU resources - leave it empty for autodetection
# The node type's CPU and GPU resources - leave it empty for auto-detection
resources: {}

# Additional parameters specific to this node type, added on top of node_config from provider.parameters.node_config
Expand Down Expand Up @@ -146,4 +146,3 @@ worker_start_ray_commands: [
# This will be removed by ray-on-golem on the fly.
head_node: True
worker_nodes: True

1 change: 0 additions & 1 deletion golem-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ provider:
use_internal_ips: true
module: "ray_on_golem.provider.node_provider.GolemNodeProvider"
parameters:

# Blockchain used for payments.
# `holesky` means running free nodes on testnet,
# `polygon` is for mainnet operations.
Expand Down
5 changes: 3 additions & 2 deletions ray_on_golem/server/cluster/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ class ClusterNode(WarningMessagesMixin, NodeData):
_cluster: "Cluster"
_golem_service: GolemService
_manager_stack: ManagerStack

_sidecars: Collection[ClusterNodeSidecar]
_ssh_public_key_data: str

_activity: Optional[Activity] = None
_start_task: Optional[asyncio.Task] = None
_warning_messages: List[str] = None
Expand Down Expand Up @@ -293,7 +293,8 @@ async def verify_ssh_connection(
"uptime",
]

retry = CLUSTER_MONITOR_RETRY_COUNT
num_retries = CLUSTER_MONITOR_RETRY_COUNT
retry = num_retries
while True:
try:
await run_subprocess_output(*ssh_command_parts)
Expand Down
4 changes: 4 additions & 0 deletions ray_on_golem/server/cluster/sidecars.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def __init__(self, node: "ClusterNode", cluster: "Cluster", check_interval: time
self._check_interval = check_interval

async def _monitor(self) -> None:
from ray_on_golem.server.cluster.nodes import HeadClusterNode

while True:
activity_state = await self._node.activity.get_state()

Expand Down Expand Up @@ -139,6 +141,8 @@ def __init__(
self._max_fail_count = max_fail_count

async def _monitor(self) -> None:
from ray_on_golem.server.cluster.nodes import HeadClusterNode

fails_count = 0
while True:
try:
Expand Down

0 comments on commit b1f1208

Please sign in to comment.