2 changes: 1 addition & 1 deletion runpod/__init__.py
@@ -13,7 +13,7 @@
get_gpu, get_gpus,
get_pod, get_pods, create_pod, stop_pod, resume_pod, terminate_pod,
create_template,
create_endpoint
get_endpoints, create_endpoint, update_endpoint_template
)
from .cli.groups.config.functions import set_credentials, check_credentials, get_credentials

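With this change the endpoint helpers are re-exported from the package root, so callers no longer need to reach into runpod.api.ctl_commands. A minimal sketch of the import surface this adds:

from runpod import get_endpoints, create_endpoint, update_endpoint_template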
31 changes: 31 additions & 0 deletions runpod/api/ctl_commands.py
@@ -9,6 +9,7 @@
from .mutations import user as user_mutations
from .queries import gpus
from .queries import pods as pod_queries
from .queries import endpoints as endpoint_queries
from .graphql import run_graphql_query
from .mutations import pods as pod_mutations
from .mutations import endpoints as endpoint_mutations
@@ -228,6 +229,14 @@ def create_template(

return raw_response["data"]["saveTemplate"]

def get_endpoints() -> dict:
'''
Get all endpoints
'''
raw_return = run_graphql_query(endpoint_queries.QUERY_ENDPOINT)
cleaned_return = raw_return["data"]["myself"]["endpoints"]
return cleaned_return

def create_endpoint(
name:str, template_id:str, gpu_ids:str="AMPERE_16",
network_volume_id:str=None, locations:str=None,
@@ -262,3 +271,25 @@ def create_endpoint(
)

return raw_response["data"]["saveEndpoint"]


def update_endpoint_template(
endpoint_id:str, template_id:str
):
'''
Update an endpoint template

:param endpoint_id: the id of the endpoint
:param template_id: the id of the template to use for the endpoint

:example:

>>> updated_endpoint = runpod.update_endpoint_template("test", "template_id")
'''
raw_response = run_graphql_query(
endpoint_mutations.update_endpoint_template_mutation(
endpoint_id, template_id
)
)

return raw_response["data"]["updateEndpointTemplate"]
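Taken together, the two new helpers let a script enumerate existing serverless endpoints and repoint one at a different template. A hedged usage sketch (the ids are placeholders, and the api_key assignment assumes the SDK's usual credential setup):

import runpod

runpod.api_key = "YOUR_API_KEY"  # placeholder credential

# List endpoints owned by the current user.
for endpoint in runpod.get_endpoints():
    print(endpoint["id"], endpoint["name"], endpoint["templateId"])

# Point an existing endpoint at a different template (hypothetical ids).
updated = runpod.update_endpoint_template(
    endpoint_id="abcdefgh12345678",
    template_id="xyz987template",
)
print(updated)  # dict with "id" and "templateId", per the mutation's selection set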
22 changes: 22 additions & 0 deletions runpod/api/mutations/endpoints.py
@@ -57,3 +57,25 @@ def generate_endpoint_mutation(
}}
}}
"""

def update_endpoint_template_mutation(
endpoint_id: str, template_id: str
):
""" Generate a string for a GraphQL mutation to update an existing endpoint's template. """
input_fields = []

# ------------------------------ Required Fields ----------------------------- #
input_fields.append(f'templateId: "{template_id}"')
input_fields.append(f'endpointId: "{endpoint_id}"')

# Format the input fields into a string
input_fields_string = ", ".join(input_fields)
result = f"""
mutation {{
updateEndpointTemplate(input: {{{input_fields_string}}}) {{
id
templateId
}}
}}
"""
return result
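For reference, calling the new builder with placeholder ids yields a mutation of the following shape; a sketch, with indentation elided:

from runpod.api.mutations.endpoints import update_endpoint_template_mutation

print(update_endpoint_template_mutation("abcdefgh12345678", "xyz987template"))
# mutation {
#   updateEndpointTemplate(input: {templateId: "xyz987template", endpointId: "abcdefgh12345678"}) {
#     id
#     templateId
#   }
# }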
34 changes: 34 additions & 0 deletions runpod/api/queries/endpoints.py
@@ -0,0 +1,34 @@
QUERY_ENDPOINT="""
query Query {
myself {
endpoints {
aiKey
gpuIds
id
idleTimeout
name
networkVolumeId
locations
scalerType
scalerValue
templateId
type
userId
version
workersMax
workersMin
workersStandby
gpuCount
env {
key
value
}
createdAt
networkVolume {
id
dataCenterId
}
}
}
}
"""
142 changes: 94 additions & 48 deletions runpod/cli/groups/project/functions.py
@@ -9,10 +9,10 @@
import tomlkit
from tomlkit import document, comment, table, nl

from runpod import __version__, get_pod, create_template, create_endpoint
from runpod import __version__, get_pod, create_template, create_endpoint, update_endpoint_template, get_endpoints
from runpod.cli import BASE_DOCKER_IMAGE, GPU_TYPES, ENV_VARS
from runpod.cli.utils.ssh_cmd import SSHConnection
from .helpers import get_project_pod, copy_template_files, attempt_pod_launch, load_project_config
from .helpers import get_project_pod, copy_template_files, attempt_pod_launch, load_project_config, get_project_endpoint
from ...utils.rp_sync import sync_directory

STARTER_TEMPLATES = os.path.join(os.path.dirname(__file__), 'starter_templates')
@@ -96,6 +96,38 @@ def create_new_project(project_name, runpod_volume_id, cuda_version, python_vers
with open(os.path.join(project_folder, "runpod.toml"), 'w', encoding="UTF-8") as config_file:
tomlkit.dump(toml_config, config_file)

def launch_pod():
config = load_project_config() # Load runpod.toml

print("Deploying development pod on RunPod...")

# Prepare the environment variables
environment_variables = {"RUNPOD_PROJECT_ID": config["project"]["uuid"]}
for variable in config['project'].get('env_vars', {}):
environment_variables[variable] = config['project']['env_vars'][variable]

# Prepare the GPU types
selected_gpu_types = config['project'].get('gpu_types', [])
if config['project'].get('gpu', None):
selected_gpu_types.append(config['project']['gpu'])

# Attempt to launch a pod with the given configuration
new_pod = attempt_pod_launch(config, environment_variables)
if new_pod is None:
print("Selected GPU types unavailable, try again later or use a different type.")
return

print("Waiting for pod to come online... ", end="")
sys.stdout.flush()

# Wait for the pod to come online
while new_pod.get('desiredStatus', None) != 'RUNNING' or new_pod.get('runtime') is None:
new_pod = get_pod(new_pod['id'])

project_pod_id = new_pod['id']

print(f"Project {config['project']['name']} pod ({project_pod_id}) created.", end="\n\n")
return project_pod_id

# ------------------------------- Start Project ------------------------------ #
def start_project(): # pylint: disable=too-many-locals, too-many-branches
@@ -124,63 +156,37 @@ def start_project(): # pylint: disable=too-many-locals, too-many-branches

# Check if the project pod already exists, if not create it.
if not project_pod_id:

print("Deploying development pod on RunPod...")

# Prepare the environment variables
environment_variables = {"RUNPOD_PROJECT_ID": config["project"]["uuid"]}
for variable in config['project'].get('env_vars', {}):
environment_variables[variable] = config['project']['env_vars'][variable]

# Prepare the GPU types
selected_gpu_types = config['project'].get('gpu_types', [])
if config['project'].get('gpu', None):
selected_gpu_types.append(config['project']['gpu'])

# Attempt to launch a pod with the given configuration
new_pod = attempt_pod_launch(config, environment_variables)
if new_pod is None:
print("Selected GPU types unavailable, try again later or use a different type.")
return

print("Waiting for pod to come online... ", end="")
sys.stdout.flush()

# Wait for the pod to come online
while new_pod.get('desiredStatus', None) != 'RUNNING' or new_pod.get('runtime') is None:
new_pod = get_pod(new_pod['id'])

project_pod_id = new_pod['id']

print(f"Project {config['project']['name']} pod ({project_pod_id}) created.", end="\n\n")
project_pod_id = launch_pod()

with SSHConnection(project_pod_id) as ssh_conn:

project_path_uuid = f'{config["project"]["volume_mount_path"]}/{config["project"]["uuid"]}'
remote_project_path = os.path.join(project_path_uuid, config["project"]["name"])
project_path_uuid_dev = os.path.join(project_path_uuid, 'dev')
project_path_uuid_prod = os.path.join(project_path_uuid, 'prod')
remote_project_path = os.path.join(project_path_uuid_dev, config["project"]["name"])

# Create the project folder on the pod
print(f'Checking pod project folder: {remote_project_path} on pod {project_pod_id}')
ssh_conn.run_commands([f'mkdir -p {remote_project_path}'])
ssh_conn.run_commands([f'mkdir -p {remote_project_path} {project_path_uuid_prod}'])

# Copy local files to the pod project folder
print(f'Syncing files to pod {project_pod_id}')
ssh_conn.rsync(os.getcwd(), project_path_uuid)
ssh_conn.rsync(os.getcwd(), project_path_uuid_dev)

# Create the virtual environment
venv_path = os.path.join(project_path_uuid, "venv")
venv_path = os.path.join(project_path_uuid_dev, "venv")
print(f'Activating Python virtual environment: {venv_path} on pod {project_pod_id}')
commands = [
f'python{config["runtime"]["python_version"]} -m venv {venv_path}',
f'source {venv_path}/bin/activate &&'
f'cd {remote_project_path} &&'
'python -m pip install --upgrade pip &&'
f'python -m pip install --requirement {config["runtime"]["requirements_path"]}'
f'python -m pip install -v --requirement {config["runtime"]["requirements_path"]}'
]
ssh_conn.run_commands(commands)

# Start the watcher and then start the API development server
sync_directory(ssh_conn, os.getcwd(), project_path_uuid)
sync_directory(ssh_conn, os.getcwd(), project_path_uuid_dev)

project_name = config["project"]["name"]
pip_req_path = os.path.join(remote_project_path, config['runtime']['requirements_path'])
@@ -217,14 +223,14 @@ def start_project(): # pylint: disable=too-many-locals, too-many-branches

trap cleanup EXIT SIGINT

if source {project_path_uuid}/venv/bin/activate; then
if source {project_path_uuid_dev}/venv/bin/activate; then
echo -e "- Activated virtual environment."
else
echo "Failed to activate virtual environment."
exit 1
fi

if cd {project_path_uuid}/{project_name}; then
if cd {project_path_uuid_dev}/{project_name}; then
echo -e "- Changed to project directory."
else
echo "Failed to change directory."
@@ -287,35 +293,75 @@ def start_project(): # pylint: disable=too-many-locals, too-many-branches
# ------------------------------ Deploy Project ------------------------------ #
def create_project_endpoint():
""" Create a project endpoint.
- Move code in dev to prod folder
- TODO: git commit the diff from current state to new state
- Create a serverless template for the project
- Create a new endpoint using the template
"""
config = load_project_config()

project_id = config['project']['uuid']
project_pod_id = get_project_pod(project_id)

# Check if the project pod already exists, if not create it.
if not project_pod_id:
project_pod_id = launch_pod()

with SSHConnection(project_pod_id) as ssh_conn:
project_path_uuid = f'{config["project"]["volume_mount_path"]}/{config["project"]["uuid"]}'
project_path_uuid_prod = os.path.join(project_path_uuid, 'prod')
remote_project_path = os.path.join(project_path_uuid_prod, config["project"]["name"])
# Copy local files to the pod project folder
ssh_conn.run_commands([f'mkdir -p {remote_project_path}'])
print(f'Syncing files to pod {project_pod_id} prod')
ssh_conn.rsync(os.getcwd(), project_path_uuid_prod)
# Create the virtual environment
venv_path = os.path.join(project_path_uuid_prod, 'venv')
print(f'Activating Python virtual environment: {venv_path} on pod {project_pod_id}')
commands = [
f'python{config["runtime"]["python_version"]} -m venv {venv_path}',
f'source {venv_path}/bin/activate &&'
f'cd {remote_project_path} &&'
'python -m pip install --upgrade pip &&'
f'python -m pip install -v --requirement {config["runtime"]["requirements_path"]}'
]
ssh_conn.run_commands(commands)
ssh_conn.close()

environment_variables = {}
for variable in config['project']['env_vars']:
environment_variables[variable] = config['project']['env_vars'][variable]

# Construct the docker start command
docker_start_cmd_prefix = 'bash -c "'
activate_cmd = f'. /runpod-volume/{config["project"]["uuid"]}/venv/bin/activate'
python_cmd = f'python -u /runpod-volume/{config["project"]["uuid"]}/{config["project"]["name"]}/{config["runtime"]["handler_path"]}' # pylint: disable=line-too-long
activate_cmd = f'. /runpod-volume/{config["project"]["uuid"]}/prod/venv/bin/activate'
python_cmd = f'python -u /runpod-volume/{config["project"]["uuid"]}/prod/{config["project"]["name"]}/{config["runtime"]["handler_path"]}' # pylint: disable=line-too-long
docker_start_cmd_suffix = '"'
docker_start_cmd = docker_start_cmd_prefix + activate_cmd + ' && ' + \
python_cmd + docker_start_cmd_suffix # pylint: disable=line-too-long


project_name_unique = str(uuid.uuid4())[:8] #project name unique constraint
project_endpoint_template = create_template(
name=f'{config["project"]["name"]}-endpoint | {config["project"]["uuid"]}',
name=f'{config["project"]["name"]}-endpoint | {config["project"]["uuid"]} | {project_name_unique}',
image_name=config['project']['base_image'],
container_disk_in_gb=config['project']['container_disk_size_gb'],
docker_start_cmd=docker_start_cmd,
env=environment_variables, is_serverless=True
)

deployed_endpoint = create_endpoint(
name=f'{config["project"]["name"]}-endpoint | {config["project"]["uuid"]}',
template_id=project_endpoint_template['id'],
network_volume_id=config['project']['storage_id'],
)
deployed_endpoint = get_project_endpoint(project_id)
if not deployed_endpoint:
deployed_endpoint = create_endpoint(
name=f'{config["project"]["name"]}-endpoint | {config["project"]["uuid"]}',
template_id=project_endpoint_template['id'],
network_volume_id=config['project']['storage_id'],
)
else:
deployed_endpoint = update_endpoint_template(
endpoint_id=deployed_endpoint['id'],
template_id=project_endpoint_template['id'],
)

#does user want to tear down and recreate workers immediately?

return deployed_endpoint['id']
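The deploy path now builds everything out of the prod/ subfolder on the network volume. For a config with hypothetical values (uuid 1234abcd, project name my_project, handler_path src/handler.py), the docker start command assembled above would come out roughly as follows; a sketch, not output captured from a real deploy:

# Hypothetical config values, for illustration only.
project_uuid = "1234abcd"
project_name = "my_project"
handler_path = "src/handler.py"

docker_start_cmd = (
    'bash -c "'
    f'. /runpod-volume/{project_uuid}/prod/venv/bin/activate && '
    f'python -u /runpod-volume/{project_uuid}/prod/{project_name}/{handler_path}'
    '"'
)
# bash -c ". /runpod-volume/1234abcd/prod/venv/bin/activate && python -u /runpod-volume/1234abcd/prod/my_project/src/handler.py"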
12 changes: 11 additions & 1 deletion runpod/cli/groups/project/helpers.py
@@ -7,7 +7,7 @@
import click
import tomlkit

from runpod import get_pods, create_pod
from runpod import get_pods, create_pod, get_endpoints
from runpod import error as rp_error

def validate_project_name(name):
@@ -29,6 +29,16 @@ def get_project_pod(project_id: str):

return None

def get_project_endpoint(project_id: str):
"""Check if a project endpoint exists.
Return the endpoint if it exists, else return None.
"""
for endpoint in get_endpoints():
if project_id in endpoint['name']:
return endpoint

return None

def copy_template_files(template_dir, destination):
"""Copy the template files to the destination directory."""
for item in os.listdir(template_dir):
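Because create_project_endpoint embeds the project uuid in the endpoint name ("<name>-endpoint | <uuid>"), the lookup above is a plain substring match. A small sketch with hypothetical data:

endpoints = [
    {"id": "ep1", "name": "my_project-endpoint | 1234abcd"},
    {"id": "ep2", "name": "other-endpoint | 9999ffff"},
]

project_id = "1234abcd"
match = next((ep for ep in endpoints if project_id in ep["name"]), None)
print(match["id"] if match else "no endpoint for this project")  # ep1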
1 change: 1 addition & 0 deletions runpod/cli/utils/ssh_cmd.py
@@ -131,6 +131,7 @@ def rsync(self, local_path, remote_path, quiet=False):

return subprocess.run(rsync_cmd, check=True)


def close(self):
''' Close the SSH connection. '''
self.ssh.close()