forked from aliyun/ros-templates
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ack-ai-fine-tuning.yml
219 lines (219 loc) · 6.2 KB
/
ack-ai-fine-tuning.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# Template format version; quoted so YAML keeps it a string instead of a date.
ROSTemplateFormatVersion: "2015-09-01"

# Localized, human-readable summary of what this stack provisions.
Description:
  en: >-
    Create a Kubernetes cluster, configure Virtual Private Cloud (VPC),
    Security Groups, and Network Attached Storage (NAS), and deploy model
    fine-tuning training tasks along with GPU-shared inference services.
  zh-cn: 创建Kubernetes集群,配置VPC、安全组、NAS,部署模型微调训练任务与GPU共享推理服务。
# User-supplied inputs for the stack.
Parameters:
  # Name of the Kubernetes cluster (referenced by the Ack resource).
  AckName:
    Type: String
    Label:
      en: Cluster Name
      zh-cn: 集群名称
    Description:
      en: >-
        The name must be 1 to 63 characters in length and can contain letters,
        Chinese characters, digits, and hyphens (-).
      zh-cn: 名称为1~63个字符,可包含数字、汉字、英文字符或中划线(-)。
    # Single-quoted so the backslash escapes reach the regex engine untouched.
    AllowedPattern: '^[a-zA-Z0-9\u4e00-\u9fa5][-a-zA-Z0-9\u4e00-\u9fa5]{0,62}$'
    # Console helper metadata: "ai-test-" prefix, Length 5, lowercase class.
    AssociationProperty: AutoCompleteInput
    AssociationPropertyMetadata:
      Length: 5
      Prefix: ai-test-
      CharacterClasses:
        - Class: lowercase
          min: 1

  # Zone used for the vSwitch and for the cluster's ZoneIds.
  ZoneId:
    Type: String
    Label:
      en: VSwitch Available Zone
      zh-cn: 可用区
    AssociationProperty: ALIYUN::VPC::Zone::ZoneId
    Description:
      en: If the available zone cannot be selected, please switch regions.
      zh-cn: 如果选择不到可用区,请切换地域。

  # Login password for the worker nodes; NoEcho keeps it masked.
  LoginPassword:
    Type: String
    Label:
      en: Set node login password
      zh-cn: 设置节点登录密码
    Description:
      # The English text states 8 to 30 characters so that it agrees with the
      # zh-cn text below (8-30).
      en: |-
        The password must be 8 to 30 characters in length. <br>
        It must contain at least three of the following character types: uppercase letters, lowercase letters, digits, and special characters. <br>
        Special characters include <span style="background:#E7E9EB;"><b>()`~!@#$%^&*_-+=|{}[]:;'<>,.?/</b></span>.<br>
      zh-cn: 长度为8-30位,需包含大写字母、小写字母、特殊符号和数字中的三个,允许的特殊字符包括<span style="background:#E7E9EB;"><b>()`~!@#$%^&*_-+=|{}[]:;'<>,.?/</b></span>。
    AssociationProperty: ALIYUN::ECS::Instance::Password
    NoEcho: true

  # Zone in which the NAS file system is created.
  NasZoneId:
    Type: String
    Label:
      en: NAS Available Zone
      zh-cn: NAS 可用区
    AssociationProperty: ALIYUN::VPC::Zone::ZoneId

  # Prefix substituted into the VPC, vSwitch, security group, EIP and NAS
  # names; listed under Hidden in the interface metadata.
  CommonName:
    Type: String
    Default: ack-ai
# Cloud resources created by the stack.
Resources:
  # Module resource that the cluster waits on (see Ack.DependsOn).
  ModuleAcsCsProvision:
    Type: MODULE::ACS::CS::Provision
    Version: default

  # --- Network -------------------------------------------------------------
  Vpc:
    Type: ALIYUN::ECS::VPC
    Properties:
      VpcName:
        Fn::Sub: '${CommonName}-vpc'
      CidrBlock: 192.168.0.0/16

  # Single /24 vSwitch inside the VPC, placed in the user-selected zone.
  Vswitch:
    Type: ALIYUN::ECS::VSwitch
    Properties:
      VSwitchName:
        Fn::Sub: '${CommonName}-${ZoneId}-vsw'
      VpcId:
        Ref: Vpc
      ZoneId:
        Ref: ZoneId
      CidrBlock: 192.168.0.0/24

  # Security group with allow-all egress rules for both NIC types.
  SecurityGroup:
    Type: ALIYUN::ECS::SecurityGroup
    Properties:
      SecurityGroupName:
        Fn::Sub: '${CommonName}-sg'
      VpcId:
        Ref: Vpc
      SecurityGroupEgress:
        - NicType: internet
          IpProtocol: all
          PortRange: '-1/-1'
          DestCidrIp: 0.0.0.0/0
          Priority: 1
        - NicType: intranet
          IpProtocol: all
          PortRange: '-1/-1'
          DestCidrIp: 0.0.0.0/0
          Priority: 1

  # --- Public access to the first node -------------------------------------
  Eip:
    Type: ALIYUN::VPC::EIP
    Properties:
      Name:
        Fn::Sub: '${CommonName}-eip'
      InternetChargeType: PayByTraffic
      Bandwidth: 100

  # Binds the EIP to the instance_id of the first entry in Ack's Nodes
  # attribute; runs only after the Sleep resource has completed.
  EipAssociation:
    Type: ALIYUN::VPC::EIPAssociation
    DependsOn: Sleep
    Properties:
      AllocationId:
        Ref: Eip
      InstanceId:
        Fn::Jq:
          - First
          - '.[0].instance_id'
          - Fn::GetAtt:
              - Ack
              - Nodes

  # Delay inserted between cluster creation and the EIP binding
  # (CreateDuration: 180 — presumably seconds; confirm against ROS docs).
  Sleep:
    Type: ALIYUN::ROS::Sleep
    DependsOn: Ack
    Properties:
      CreateDuration: 180

  # --- Kubernetes cluster --------------------------------------------------
  Ack:
    Type: ALIYUN::CS::ManagedKubernetesCluster
    DependsOn:
      - ModuleAcsCsProvision
    Properties:
      # Identity and edition
      Name:
        Ref: AckName
      ClusterSpec: ack.pro.small

      # Networking
      VpcId:
        Ref: Vpc
      VSwitchIds:
        - Ref: Vswitch
      ZoneIds:
        - Ref: ZoneId
      SecurityGroupId:
        Ref: SecurityGroup
      IsEnterpriseSecurityGroup: true
      ServiceCidr: 172.16.0.0/16
      ContainerCidr: 10.0.0.0/8
      NodeCidrMask: 26
      ProxyMode: ipvs
      EndpointPublicAccess: true
      SnatEntry: true

      # Worker nodes
      NumOfNodes: 1
      WorkerInstanceTypes:
        - ecs.gn6i-c24g1.12xlarge
        - ecs.gn6v-c8g1.4xlarge
        - ecs.gn6i-c24g1.24xlarge
      WorkerSystemDiskCategory: cloud_essd
      WorkerSystemDiskSize: 120
      LoginPassword:
        Ref: LoginPassword
      Runtime:
        Name: containerd
        # Quoted: a version string must never be read as a number.
        Version: '1.6.20'
      CloudMonitorFlags: false

      # Cluster add-ons; Config values are JSON documents passed as strings.
      Addons:
        - Name: flannel
        - Name: csi-plugin
        - Name: csi-provisioner
        - Name: storage-operator
          Config: '{"CnfsOssEnable":"false","CnfsNasEnable":"false"}'
        - Name: nginx-ingress-controller
          Config: '{"IngressSlbNetworkType":"internet","IngressSlbSpec":"slb.s2.small"}'
        - Name: ack-node-local-dns
        - Name: arms-prometheus

  # --- Shared storage ------------------------------------------------------
  # NFS file system created in the separately selected NAS zone.
  Nas:
    Type: ALIYUN::NAS::FileSystem
    Properties:
      Description:
        Fn::Sub: '${CommonName}-nas'
      FileSystemType: standard
      StorageType: Capacity
      ProtocolType: NFS
      ZoneId:
        Ref: NasZoneId
      VpcId:
        Ref: Vpc
      VSwitchId:
        Ref: Vswitch

  # Mount target exposing the file system on the stack's vSwitch.
  NasMountTarget:
    Type: ALIYUN::NAS::MountTarget
    Properties:
      FileSystemId:
        Ref: Nas
      NetworkType: Vpc
      VpcId:
        Ref: Vpc
      VSwitchId:
        Ref: Vswitch
      AccessGroupName: DEFAULT_VPC_GROUP_NAME
      Status: Active
# Values reported once the stack has been created.
Outputs:
  # Workbench URL for logging in to the cluster's first node.
  EcsLoginAddress:
    Description:
      zh-cn: ECS登录地址。
      en: Ecs login address.
    Value:
      Fn::Sub:
        # The query string must read "&regionId=": the "&reg" prefix had been
        # collapsed into the "®" character, which dropped the region
        # parameter from the link. Quoted so "&", "?" and "$" stay literal.
        - 'https://ecs-workbench.aliyun.com/?from=EcsConsole&instanceType=ecs&regionId=${Region}&instanceId=${InstanceId}'
        # Substitution variables used by the URL above.
        - InstanceId:
            # instance_id of the first entry in Ack's Nodes attribute.
            Fn::Jq:
              - First
              - '.[0].instance_id'
              - Fn::GetAtt:
                  - Ack
                  - Nodes
          Region:
            Ref: ALIYUN::Region
# Console presentation hints for the template.
Metadata:
  ALIYUN::ROS::Interface:
    # Parameters listed in the (single, unlabeled) parameter group.
    ParameterGroups:
      - Parameters:
          - AckName
          - ZoneId
          - LoginPassword
          - NasZoneId
    # Parameters listed as hidden; CommonName keeps its default "ack-ai".
    Hidden:
      - CommonName
    # Tag string attached to the template; quoted because it contains colons.
    TemplateTags:
      - 'acs:technical-solution:ack:一键训练大模型及部署GPU共享推理服务-tech_solu_82'