# Kubernetes manifests for the rocky-dev-gpu environment:
# a StatefulSet and the headless Service referenced by its serviceName.
---
# StatefulSet running two rocky-dev-gpu pods. Each pod exposes SSH on port 22,
# is limited to one nvidia.com/gpu, and mounts its own 10Gi "workspace" volume
# created from volumeClaimTemplates.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: rocky-dev-gpu
  labels:
    app: rocky-dev-gpu
spec:
  # Name of the governing Service (rocky-dev-gpu-svc, declared with
  # clusterIP: None).
  serviceName: rocky-dev-gpu-svc
  replicas: 2
  selector:
    matchLabels:
      app: rocky-dev-gpu
  template:
    metadata:
      labels:
        app: rocky-dev-gpu
    spec:
      containers:
        - name: rocky-dev-gpu
          image: rocky_dev_gpu:latest
          imagePullPolicy: IfNotPresent  # Use local image
          ports:
            - containerPort: 22
              name: ssh
          securityContext:
            # NOTE(review): privileged mode gives the container access to host
            # devices, which may expose GPUs beyond the one requested below.
            # Confirm it is still required and drop it if not.
            privileged: true
          resources:
            limits:
              nvidia.com/gpu: 1  # Request 1 GPU per pod
            requests:
              nvidia.com/gpu: 1
          env:
            # NVIDIA_VISIBLE_DEVICES is deliberately not set here: a hard-coded
            # "all" asks the NVIDIA runtime to expose every GPU on the node and
            # defeats the one-GPU limit above. The GPU assignment is expected
            # to come from the device plugin's allocation.
            # NOTE(review): confirm against the cluster's device-plugin setup.
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: workspace
              mountPath: /workspace
          # Restart the container if the SSH port stops accepting connections.
          livenessProbe:
            tcpSocket:
              port: 22
            initialDelaySeconds: 30
            periodSeconds: 30
          # Mark the pod ready once the SSH port accepts TCP connections.
          readinessProbe:
            tcpSocket:
              port: 22
            initialDelaySeconds: 5
            periodSeconds: 10
  # One PersistentVolumeClaim per replica, mounted at /workspace above.
  volumeClaimTemplates:
    - metadata:
        name: workspace
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 10Gi
---
# Headless Service (clusterIP: None) selecting the app=rocky-dev-gpu pods and
# publishing their SSH port. Its name is the one the rocky-dev-gpu
# StatefulSet references through serviceName.
apiVersion: v1
kind: Service
metadata:
  name: rocky-dev-gpu-svc
spec:
  clusterIP: None
  ports:
    - port: 22
      targetPort: 22
  selector:
    app: rocky-dev-gpu
# End of rocky-dev-gpu manifests.