Llama Inference Template Code Creation #78

Merged: 5 commits, merged May 18, 2024

Changes from 4 commits
4 changes: 2 additions & 2 deletions automation/deploy_streamlit/main.py
@@ -84,7 +84,7 @@ def init_streamlit(user_namespace, endpoint_uid, endpoint_url, image_name, image
   annotations:
     alb.ingress.kubernetes.io/scheme: internet-facing
     alb.ingress.kubernetes.io/target-type: ip
-    alb.ingress.kubernetes.io/group.name: "streamlit-{user_namespace}"
+    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
 spec:
   ingressClassName: alb
   rules:
@@ -155,7 +155,7 @@ def handler(event, context):
         # inference endpoint URL
         endpoint_url = body.get("endpoint_url")
         result = apply_yaml(user_uid, endpoint_uid, endpoint_url, image_name, image_py_name)
-        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        cmd = "{} get ingress -A --kubeconfig {} | grep {} | grep streamlit".format(kubectl, kubeconfig, endpoint_uid)
         # Streamlit endpoint URL
         streamlit_endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"streamlit_endpoint_url: {streamlit_endpoint_url}/streamlit/{endpoint_uid}")
2 changes: 2 additions & 0 deletions automation/kubernetes_inference_deploy/main.py
@@ -2,6 +2,7 @@
 import requests
 import os
 import json
+import time

 kubectl = '/var/task/kubectl'
 kubeconfig = '/tmp/kubeconfig'
@@ -145,6 +146,7 @@ def handler(event, context):
         result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)

         cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        time.sleep(10)
         endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"endpoint_url: {endpoint_url}")
         update_data = {
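Note: the added time.sleep(10) assumes the ALB assigns the ingress an address within ten seconds of kubectl apply. A minimal polling sketch (a hypothetical helper, not part of this PR; it assumes the default kubectl get ingress -A column layout of NAMESPACE NAME CLASS HOSTS ADDRESS PORTS AGE) would make that wait explicit:

import subprocess
import time

def wait_for_ingress_url(kubectl, kubeconfig, endpoint_uid, timeout=120, interval=5):
    # Poll `kubectl get ingress` until the matching row has an ADDRESS value.
    cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
    deadline = time.time() + timeout
    while time.time() < deadline:
        out = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip()
        fields = out.split()
        # A row with an assigned address has 7 columns; ADDRESS is the 5th (index 4).
        if len(fields) >= 7:
            return fields[4]
        time.sleep(interval)
    raise TimeoutError("no ingress address for {} within {}s".format(endpoint_uid, timeout))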
4 changes: 4 additions & 0 deletions automation/llama_inference_deploy/.gitignore
@@ -0,0 +1,4 @@
push_aws_ecr.sh
ecr_login.sh
get_kubeconfig.sh
*test*
13 changes: 13 additions & 0 deletions automation/llama_inference_deploy/Dockerfile
@@ -0,0 +1,13 @@
FROM public.ecr.aws/lambda/python:3.11

RUN pip install awscli requests --no-cache-dir

# x86_64
RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \
&& chmod +x ./kubectl

COPY main.py ${LAMBDA_TASK_ROOT}

RUN chmod +x /var/task

CMD ["main.handler"]
1 change: 1 addition & 0 deletions automation/llama_inference_deploy/IaC/.gitignore
@@ -0,0 +1 @@
var.tf
35 changes: 35 additions & 0 deletions automation/llama_inference_deploy/IaC/main.tf
@@ -0,0 +1,35 @@
# Change prefix and container_repository to appropriate names.

module "llama_inference_deploy" {
  source                     = "github.com/kookmin-sw/capstone-2024-12//IaC/serverless_api_template"
  prefix                     = "llama-inference-deploy"
  container_registry         = "694448341573.dkr.ecr.ap-northeast-2.amazonaws.com"
  container_repository       = "llama-inference-deploy"
  container_image_tag        = "latest"
  lambda_ram_size            = 2048
  attach_s3_policy           = true
  attach_ec2_policy          = true
  attach_eks_policy          = true
  attach_ssm_readonly_policy = true
  region_name                = var.region
  eks_cluster_name           = var.eks_cluster_name
  db_api_url                 = var.db_api_url
}

output "llama_inference_deploy_function_url" {
  value = module.llama_inference_deploy.function_url
}

provider "aws" {
  region  = var.region
  profile = var.awscli_profile
}

terraform {
  backend "s3" {
    bucket  = "sskai-terraform-state"
    key     = "llama_inference_deploy/tf.state"
    region  = "ap-northeast-2"
    encrypt = true
  }
}
19 changes: 19 additions & 0 deletions automation/llama_inference_deploy/IaC/var.tf.sample
@@ -0,0 +1,19 @@
variable "region" {
type = string
default = "ap-northeast-2"
}

variable "awscli_profile" {
type = string
default = ""
}

variable "eks_cluster_name" {
type = string
default = ""
}

variable "db_api_url" {
type = string
default = ""
}
183 changes: 183 additions & 0 deletions automation/llama_inference_deploy/main.py
@@ -0,0 +1,183 @@
import subprocess
import requests
import os
import json
import time

kubectl = '/var/task/kubectl'
kubeconfig = '/tmp/kubeconfig'

eks_cluster_name = os.getenv('EKS_CLUSTER_NAME')
region = os.getenv("REGION")
db_api_url = os.getenv("DB_API_URL")
ecr_uri = os.getenv("ECR_URI")

# get eks cluster kubernetes configuration by aws cli
result_get_kubeconfig = subprocess.run([
    "aws", "eks", "update-kubeconfig",
    "--name", eks_cluster_name,
    "--region", region,
    "--kubeconfig", kubeconfig
])

def generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
    content = f"""---
apiVersion: v1
kind: Namespace
metadata:
  name: {user_namespace}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: {user_namespace}
  name: deployment-{endpoint_uid}
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: app-{endpoint_uid}
  replicas: 2
  template:
    metadata:
      labels:
        app.kubernetes.io/name: app-{endpoint_uid}
    spec:
      containers:
        - image: {ecr_uri}/llama2-inference:latest
          imagePullPolicy: Always
          name: app-{endpoint_uid}
          ports:
            - containerPort: 8080
          env:
            - name: MODEL_S3_URL
              value: {model_s3_url}
          resources:
            requests:
              cpu: 2000m
              memory: 2000M
              nvidia.com/gpu: 1
            limits:
              cpu: 2000m
              memory: 2000M
              nvidia.com/gpu: 1
Review comment: It would be fine to change this to CPU 1700m, Memory 3800M.
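As a sketch, the suggested values would slot into the generated manifest like this (figures taken from the comment above; presumably sized to leave CPU and memory headroom for system daemons on the node):

          resources:
            requests:
              cpu: 1700m        # value suggested in the review comment
              memory: 3800M
              nvidia.com/gpu: 1
            limits:
              cpu: 1700m
              memory: 3800M
              nvidia.com/gpu: 1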

      nodeSelector:
        karpenter.sh/nodepool: {node_pool_name}

Review comment: Decided to use nodepool-1 as the node pool.
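Under that decision, the selector would pin the fixed pool instead of interpolating node_pool_name, roughly:

      nodeSelector:
        karpenter.sh/nodepool: nodepool-1   # fixed pool per the review decision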

---
apiVersion: v1
kind: Service
metadata:
  namespace: {user_namespace}
  name: service-{endpoint_uid}
spec:
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP
  type: ClusterIP
  selector:
    app.kubernetes.io/name: app-{endpoint_uid}
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: {user_namespace}
  name: ingress-{endpoint_uid}
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
spec:
  ingressClassName: alb
  rules:
    - http:
        paths:
          - path: /{endpoint_uid}
            pathType: Prefix
            backend:
              service:
                name: service-{endpoint_uid}
                port:
                  number: 8080
"""

    filepath = f"/tmp/{endpoint_uid}.yaml"
    with open(filepath, 'w') as f:
        f.write(content)

    return filepath

def apply_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
    filename = generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size)
    result = subprocess.run([
        kubectl, "apply", "-f", filename, "--kubeconfig", kubeconfig
    ])
    if result.returncode != 0:
        print("create resource returncode != 0")
    return result.returncode

def delete_resource(user_namespace, endpoint_uid):
    deployment_name = f"deployment-{endpoint_uid}"
    service_name = f"service-{endpoint_uid}"
    ingress_name = f"ingress-{endpoint_uid}"
    ingress_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "ingress", ingress_name, "--kubeconfig", kubeconfig
    ])
    service_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "service", service_name, "--kubeconfig", kubeconfig
    ])
    deployment_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "deployment", deployment_name, "--kubeconfig", kubeconfig
    ])
    result = 0
    if ingress_result.returncode != 0 or service_result.returncode != 0 or deployment_result.returncode != 0:
        result = 1
        print("delete resource returncode != 0")
    return result

def handler(event, context):
    body = json.loads(event.get("body", "{}"))
    user_uid = body.get("user").lower()
    endpoint_uid = body.get("uid").lower()
    action = body.get("action")

    if action == "create":
        model_s3_url = body['model']['s3_url']
        node_pool_name = body['model']['deployment_type']
        ram_size = body['model']['max_used_ram']
        result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)

        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
        time.sleep(10)
        endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
        print(f"endpoint_url: {endpoint_url}")
        update_data = {
            "endpoint": f"http://{endpoint_url}/{endpoint_uid}"
        }
        response = requests.put(url=f"{db_api_url}/inferences/{endpoint_uid}", json=update_data)
        if result == 0:
            return {
                'statusCode': 200,
                'body': "complete create inference endpoint"
            }
        else:
            return {
                'statusCode': 500,
                'body': "error with create inference endpoint"
            }
    elif action == "delete":
        result = delete_resource(user_uid, endpoint_uid)
        if result == 0:
            requests.delete(url=f"{db_api_url}/inferences/{endpoint_uid}")
            return {
                'statusCode': 200,
                'body': "complete delete inference deployment"
            }
        else:
            return {
                'statusCode': 500,
                'body': "error with delete inference endpoint"
            }
    else:
        return {
            'statusCode': 500,
            'body': "invalid action"
        }
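For reference, a minimal sketch of the request body the handler expects; the field names are taken from the code above, and the values are hypothetical:

import json

event = {
    "body": json.dumps({
        "user": "UserA",          # lower-cased and used as the namespace
        "uid": "EP123",           # lower-cased and used in resource names
        "action": "create",       # "create" or "delete"
        "model": {
            "s3_url": "s3://example-bucket/llama2-model.tar.gz",
            "deployment_type": "nodepool-1",  # passed as the Karpenter node pool name
            "max_used_ram": 2000              # passed as ram_size (not used in the manifest)
        }
    })
}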
7 changes: 7 additions & 0 deletions automation/llama_inference_deploy/push_aws_ecr.sh.sample
@@ -0,0 +1,7 @@
#!/bin/sh

ECR_URI=""

aws ecr get-login-password --region ap-northeast-2 | docker login --username AWS --password-stdin $ECR_URI
docker build -t $ECR_URI/llama-inference-deploy:latest .
docker push $ECR_URI/llama-inference-deploy:latest
12 changes: 12 additions & 0 deletions inference/template_code/llama/Dockerfile.kubernetes_gpu
@@ -0,0 +1,12 @@
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

WORKDIR /app

COPY requirements_kubernetes_gpu.txt /app/requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY kubernetes_app_llama2.py /app/app.py

CMD [ "python3", "/app/app.py" ]

EXPOSE 8080