# GPU tasks — Docker containers on the GPU node, launched via local Docker API
container = docker_client.containers.run( image="our-model-server:latest", detach=True, device_requests=[ docker.types.DeviceRequest(device_ids=["0"], capabilities=[["gpu"]]) ], environment={ "CUDA_VISIBLE_DEVICES": "0", "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512" # hint only, not enforced }
)
# GPU tasks — Docker containers on the GPU node, launched via local Docker API
container = docker_client.containers.run( image="our-model-server:latest", detach=True, device_requests=[ docker.types.DeviceRequest(device_ids=["0"], capabilities=[["gpu"]]) ], environment={ "CUDA_VISIBLE_DEVICES": "0", "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512" # hint only, not enforced }
)
# GPU tasks — Docker containers on the GPU node, launched via local Docker API
container = docker_client.containers.run( image="our-model-server:latest", detach=True, device_requests=[ docker.types.DeviceRequest(device_ids=["0"], capabilities=[["gpu"]]) ], environment={ "CUDA_VISIBLE_DEVICES": "0", "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512" # hint only, not enforced }
)
# CPU tasks — normal K8s pods, no special handling needed
pod_manifest = { "apiVersion": "v1", "kind": "Pod", "spec": { "containers": [{ "resources": { "limits": {"cpu": "2", "memory": "4Gi"} } }] }
}
k8s_client.create_namespaced_pod(namespace="workloads", body=pod_manifest)
# CPU tasks — normal K8s pods, no special handling needed
pod_manifest = { "apiVersion": "v1", "kind": "Pod", "spec": { "containers": [{ "resources": { "limits": {"cpu": "2", "memory": "4Gi"} } }] }
}
k8s_client.create_namespaced_pod(namespace="workloads", body=pod_manifest)
# CPU tasks — normal K8s pods, no special handling needed
pod_manifest = { "apiVersion": "v1", "kind": "Pod", "spec": { "containers": [{ "resources": { "limits": {"cpu": "2", "memory": "4Gi"} } }] }
}
k8s_client.create_namespaced_pod(namespace="workloads", body=pod_manifest)
Physical GPU (RTX 3080 — 10GB) ↓ NVIDIA Driver ↓ libvgpu.so ←── HAMi injects this via LD_PRELOAD (intercepts cudaMalloc, enforces per-pod limits) ↓ ┌─────────────┐ ┌─────────────┐ │ Pod A │ │ Pod B │ │ 2GB limit │ │ 3GB limit │ │ 25% cores │ │ 40% cores │ └─────────────┘ └─────────────┘
Physical GPU (RTX 3080 — 10GB) ↓ NVIDIA Driver ↓ libvgpu.so ←── HAMi injects this via LD_PRELOAD (intercepts cudaMalloc, enforces per-pod limits) ↓ ┌─────────────┐ ┌─────────────┐ │ Pod A │ │ Pod B │ │ 2GB limit │ │ 3GB limit │ │ 25% cores │ │ 40% cores │ └─────────────┘ └─────────────┘
Physical GPU (RTX 3080 — 10GB) ↓ NVIDIA Driver ↓ libvgpu.so ←── HAMi injects this via LD_PRELOAD (intercepts cudaMalloc, enforces per-pod limits) ↓ ┌─────────────┐ ┌─────────────┐ │ Pod A │ │ Pod B │ │ 2GB limit │ │ 3GB limit │ │ 25% cores │ │ 40% cores │ └─────────────┘ └─────────────┘
# Enable microk8s GPU addon
microk8s enable gpu # Install cert-manager (HAMi's webhook needs it)
microk8s kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
microk8s kubectl wait --for=condition=ready pod \ -l app.kubernetes.io/instance=cert-manager \ -n cert-manager --timeout=180s # Add HAMi helm repo
microk8s helm3 repo add hami-charts https://project-hami.github.io/HAMi/
microk8s helm3 repo update # Get K8s version (--short is deprecated in newer kubectl)
K8S_VERSION=$(microk8s kubectl version -o json | python3 -c "
import sys, json, re
v = json.load(sys.stdin)['serverVersion']['gitVersion'].lstrip('v')
print(re.split(r'[+\-]', v)[0])
") # Install HAMi
microk8s helm3 install hami hami-charts/hami \ --namespace kube-system \ --set scheduler.kubeScheduler.imageTag=v${K8S_VERSION} \ --set devicePlugin.nvidiaDriverPath=/usr/local/nvidia \ --set scheduler.defaultSchedulerPolicy.gpuMemory=true \ --set scheduler.defaultSchedulerPolicy.gpuCores=true # CRITICAL: Label the GPU node — without this, the device-plugin DaemonSet stays at DESIRED: 0
microk8s kubectl label node <your-node-name> gpu=on
# Enable microk8s GPU addon
microk8s enable gpu # Install cert-manager (HAMi's webhook needs it)
microk8s kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
microk8s kubectl wait --for=condition=ready pod \ -l app.kubernetes.io/instance=cert-manager \ -n cert-manager --timeout=180s # Add HAMi helm repo
microk8s helm3 repo add hami-charts https://project-hami.github.io/HAMi/
microk8s helm3 repo update # Get K8s version (--short is deprecated in newer kubectl)
K8S_VERSION=$(microk8s kubectl version -o json | python3 -c "
import sys, json, re
v = json.load(sys.stdin)['serverVersion']['gitVersion'].lstrip('v')
print(re.split(r'[+\-]', v)[0])
") # Install HAMi
microk8s helm3 install hami hami-charts/hami \ --namespace kube-system \ --set scheduler.kubeScheduler.imageTag=v${K8S_VERSION} \ --set devicePlugin.nvidiaDriverPath=/usr/local/nvidia \ --set scheduler.defaultSchedulerPolicy.gpuMemory=true \ --set scheduler.defaultSchedulerPolicy.gpuCores=true # CRITICAL: Label the GPU node — without this, the device-plugin DaemonSet stays at DESIRED: 0
microk8s kubectl label node <your-node-name> gpu=on
# Enable microk8s GPU addon
microk8s enable gpu # Install cert-manager (HAMi's webhook needs it)
microk8s kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
microk8s kubectl wait --for=condition=ready pod \ -l app.kubernetes.io/instance=cert-manager \ -n cert-manager --timeout=180s # Add HAMi helm repo
microk8s helm3 repo add hami-charts https://project-hami.github.io/HAMi/
microk8s helm3 repo update # Get K8s version (--short is deprecated in newer kubectl)
K8S_VERSION=$(microk8s kubectl version -o json | python3 -c "
import sys, json, re
v = json.load(sys.stdin)['serverVersion']['gitVersion'].lstrip('v')
print(re.split(r'[+\-]', v)[0])
") # Install HAMi
microk8s helm3 install hami hami-charts/hami \ --namespace kube-system \ --set scheduler.kubeScheduler.imageTag=v${K8S_VERSION} \ --set devicePlugin.nvidiaDriverPath=/usr/local/nvidia \ --set scheduler.defaultSchedulerPolicy.gpuMemory=true \ --set scheduler.defaultSchedulerPolicy.gpuCores=true # CRITICAL: Label the GPU node — without this, the device-plugin DaemonSet stays at DESIRED: 0
microk8s kubectl label node <your-node-name> gpu=on
apiVersion: apps/v1
kind: Deployment
metadata: name: gpu-worker-a
spec: replicas: 1 selector: matchLabels: app: gpu-worker instance: worker-a template: metadata: labels: app: gpu-worker instance: worker-a spec: schedulerName: hami-scheduler # critical — tells K8s to use HAMi's scheduler containers: - name: gpu-worker image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime command: ["python3", "-u", "-c"] args: - | import torch, time, os pod = os.environ.get('POD_NAME', 'worker-a') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'[{pod}] device={device} gpu={torch.cuda.get_device_name(0)}', flush=True) # Allocate 1.5GB resident tensor (well within 2GB limit) elements = (1500 * 1024 * 1024) // 4 blob = torch.zeros(elements, dtype=torch.float32, device=device) print(f'[{pod}] VRAM allocated: {torch.cuda.memory_allocated() // 1024**2}MB', flush=True) a = torch.randn(1024, 1024, device=device, dtype=torch.float16) b = torch.randn(1024, 1024, device=device, dtype=torch.float16) i = 0 while True: c = torch.matmul(a, b) torch.cuda.synchronize() i += 1 if i % 100 == 0: print(f'[{pod}] iter={i} vram={torch.cuda.memory_allocated() // 1024**2}MB', flush=True) time.sleep(0.1) env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: PYTHONUNBUFFERED value: "1" resources: limits: nvidia.com/gpu: "1" # REQUIRED — HAMi trigger, without this it ignores the pod nvidia.com/gpucores: "25" # 25% SM core cap (soft throttle) nvidia.com/gpumem-percentage: "20" # 20% of VRAM = ~2048MB hard wall
apiVersion: apps/v1
kind: Deployment
metadata: name: gpu-worker-a
spec: replicas: 1 selector: matchLabels: app: gpu-worker instance: worker-a template: metadata: labels: app: gpu-worker instance: worker-a spec: schedulerName: hami-scheduler # critical — tells K8s to use HAMi's scheduler containers: - name: gpu-worker image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime command: ["python3", "-u", "-c"] args: - | import torch, time, os pod = os.environ.get('POD_NAME', 'worker-a') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'[{pod}] device={device} gpu={torch.cuda.get_device_name(0)}', flush=True) # Allocate 1.5GB resident tensor (well within 2GB limit) elements = (1500 * 1024 * 1024) // 4 blob = torch.zeros(elements, dtype=torch.float32, device=device) print(f'[{pod}] VRAM allocated: {torch.cuda.memory_allocated() // 1024**2}MB', flush=True) a = torch.randn(1024, 1024, device=device, dtype=torch.float16) b = torch.randn(1024, 1024, device=device, dtype=torch.float16) i = 0 while True: c = torch.matmul(a, b) torch.cuda.synchronize() i += 1 if i % 100 == 0: print(f'[{pod}] iter={i} vram={torch.cuda.memory_allocated() // 1024**2}MB', flush=True) time.sleep(0.1) env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: PYTHONUNBUFFERED value: "1" resources: limits: nvidia.com/gpu: "1" # REQUIRED — HAMi trigger, without this it ignores the pod nvidia.com/gpucores: "25" # 25% SM core cap (soft throttle) nvidia.com/gpumem-percentage: "20" # 20% of VRAM = ~2048MB hard wall
apiVersion: apps/v1
kind: Deployment
metadata: name: gpu-worker-a
spec: replicas: 1 selector: matchLabels: app: gpu-worker instance: worker-a template: metadata: labels: app: gpu-worker instance: worker-a spec: schedulerName: hami-scheduler # critical — tells K8s to use HAMi's scheduler containers: - name: gpu-worker image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime command: ["python3", "-u", "-c"] args: - | import torch, time, os pod = os.environ.get('POD_NAME', 'worker-a') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'[{pod}] device={device} gpu={torch.cuda.get_device_name(0)}', flush=True) # Allocate 1.5GB resident tensor (well within 2GB limit) elements = (1500 * 1024 * 1024) // 4 blob = torch.zeros(elements, dtype=torch.float32, device=device) print(f'[{pod}] VRAM allocated: {torch.cuda.memory_allocated() // 1024**2}MB', flush=True) a = torch.randn(1024, 1024, device=device, dtype=torch.float16) b = torch.randn(1024, 1024, device=device, dtype=torch.float16) i = 0 while True: c = torch.matmul(a, b) torch.cuda.synchronize() i += 1 if i % 100 == 0: print(f'[{pod}] iter={i} vram={torch.cuda.memory_allocated() // 1024**2}MB', flush=True) time.sleep(0.1) env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: PYTHONUNBUFFERED value: "1" resources: limits: nvidia.com/gpu: "1" # REQUIRED — HAMi trigger, without this it ignores the pod nvidia.com/gpucores: "25" # 25% SM core cap (soft throttle) nvidia.com/gpumem-percentage: "20" # 20% of VRAM = ~2048MB hard wall
apiVersion: apps/v1
kind: Deployment
metadata: name: gpu-worker-b
spec: replicas: 1 selector: matchLabels: app: gpu-worker instance: worker-b template: metadata: labels: app: gpu-worker instance: worker-b spec: schedulerName: hami-scheduler containers: - name: gpu-worker image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime command: ["python3", "-u", "-c"] args: - | import torch, time, os pod = os.environ.get('POD_NAME', 'worker-b') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'[{pod}] device={device} gpu={torch.cuda.get_device_name(0)}', flush=True) elements = (2000 * 1024 * 1024) // 4 blob = torch.zeros(elements, dtype=torch.float32, device=device) print(f'[{pod}] VRAM allocated: {torch.cuda.memory_allocated() // 1024**2}MB', flush=True) a = torch.randn(2048, 2048, device=device, dtype=torch.float16) b = torch.randn(2048, 2048, device=device, dtype=torch.float16) i = 0 while True: c = torch.matmul(a, b) torch.cuda.synchronize() i += 1 if i % 100 == 0: print(f'[{pod}] iter={i} vram={torch.cuda.memory_allocated() // 1024**2}MB', flush=True) time.sleep(0.05) env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: PYTHONUNBUFFERED value: "1" resources: limits: nvidia.com/gpu: "1" nvidia.com/gpucores: "40" nvidia.com/gpumem-percentage: "30" # 30% = ~3072MB hard wall
apiVersion: apps/v1
kind: Deployment
metadata: name: gpu-worker-b
spec: replicas: 1 selector: matchLabels: app: gpu-worker instance: worker-b template: metadata: labels: app: gpu-worker instance: worker-b spec: schedulerName: hami-scheduler containers: - name: gpu-worker image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime command: ["python3", "-u", "-c"] args: - | import torch, time, os pod = os.environ.get('POD_NAME', 'worker-b') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'[{pod}] device={device} gpu={torch.cuda.get_device_name(0)}', flush=True) elements = (2000 * 1024 * 1024) // 4 blob = torch.zeros(elements, dtype=torch.float32, device=device) print(f'[{pod}] VRAM allocated: {torch.cuda.memory_allocated() // 1024**2}MB', flush=True) a = torch.randn(2048, 2048, device=device, dtype=torch.float16) b = torch.randn(2048, 2048, device=device, dtype=torch.float16) i = 0 while True: c = torch.matmul(a, b) torch.cuda.synchronize() i += 1 if i % 100 == 0: print(f'[{pod}] iter={i} vram={torch.cuda.memory_allocated() // 1024**2}MB', flush=True) time.sleep(0.05) env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: PYTHONUNBUFFERED value: "1" resources: limits: nvidia.com/gpu: "1" nvidia.com/gpucores: "40" nvidia.com/gpumem-percentage: "30" # 30% = ~3072MB hard wall
apiVersion: apps/v1
kind: Deployment
metadata: name: gpu-worker-b
spec: replicas: 1 selector: matchLabels: app: gpu-worker instance: worker-b template: metadata: labels: app: gpu-worker instance: worker-b spec: schedulerName: hami-scheduler containers: - name: gpu-worker image: pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime command: ["python3", "-u", "-c"] args: - | import torch, time, os pod = os.environ.get('POD_NAME', 'worker-b') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'[{pod}] device={device} gpu={torch.cuda.get_device_name(0)}', flush=True) elements = (2000 * 1024 * 1024) // 4 blob = torch.zeros(elements, dtype=torch.float32, device=device) print(f'[{pod}] VRAM allocated: {torch.cuda.memory_allocated() // 1024**2}MB', flush=True) a = torch.randn(2048, 2048, device=device, dtype=torch.float16) b = torch.randn(2048, 2048, device=device, dtype=torch.float16) i = 0 while True: c = torch.matmul(a, b) torch.cuda.synchronize() i += 1 if i % 100 == 0: print(f'[{pod}] iter={i} vram={torch.cuda.memory_allocated() // 1024**2}MB', flush=True) time.sleep(0.05) env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: PYTHONUNBUFFERED value: "1" resources: limits: nvidia.com/gpu: "1" nvidia.com/gpucores: "40" nvidia.com/gpumem-percentage: "30" # 30% = ~3072MB hard wall
microk8s kubectl apply -f gpu_worker_a.yaml
microk8s kubectl apply -f gpu_worker_b.yaml # Watch logs from both simultaneously
microk8s kubectl logs -l app=gpu-worker --prefix=true -f
microk8s kubectl apply -f gpu_worker_a.yaml
microk8s kubectl apply -f gpu_worker_b.yaml # Watch logs from both simultaneously
microk8s kubectl logs -l app=gpu-worker --prefix=true -f
microk8s kubectl apply -f gpu_worker_a.yaml
microk8s kubectl apply -f gpu_worker_b.yaml # Watch logs from both simultaneously
microk8s kubectl logs -l app=gpu-worker --prefix=true -f
[pod/gpu-worker-a-.../gpu-worker] [worker-a] device=cuda gpu=NVIDIA GeForce RTX 3080
[pod/gpu-worker-b-.../gpu-worker] [worker-b] device=cuda gpu=NVIDIA GeForce RTX 3080
[pod/gpu-worker-b-.../gpu-worker] [worker-b] VRAM allocated: 2000MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] VRAM allocated: 1500MB
[pod/gpu-worker-b-.../gpu-worker] [worker-b] iter=100 vram=2032MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] iter=100 vram=1514MB
[pod/gpu-worker-b-.../gpu-worker] [worker-b] iter=200 vram=2032MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] iter=200 vram=1514MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] device=cuda gpu=NVIDIA GeForce RTX 3080
[pod/gpu-worker-b-.../gpu-worker] [worker-b] device=cuda gpu=NVIDIA GeForce RTX 3080
[pod/gpu-worker-b-.../gpu-worker] [worker-b] VRAM allocated: 2000MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] VRAM allocated: 1500MB
[pod/gpu-worker-b-.../gpu-worker] [worker-b] iter=100 vram=2032MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] iter=100 vram=1514MB
[pod/gpu-worker-b-.../gpu-worker] [worker-b] iter=200 vram=2032MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] iter=200 vram=1514MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] device=cuda gpu=NVIDIA GeForce RTX 3080
[pod/gpu-worker-b-.../gpu-worker] [worker-b] device=cuda gpu=NVIDIA GeForce RTX 3080
[pod/gpu-worker-b-.../gpu-worker] [worker-b] VRAM allocated: 2000MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] VRAM allocated: 1500MB
[pod/gpu-worker-b-.../gpu-worker] [worker-b] iter=100 vram=2032MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] iter=100 vram=1514MB
[pod/gpu-worker-b-.../gpu-worker] [worker-b] iter=200 vram=2032MB
[pod/gpu-worker-a-.../gpu-worker] [worker-a] iter=200 vram=1514MB
+-------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
|======================================================================================|
| 0 N/A N/A 116033 C python3 1828MiB |
| 0 N/A N/A 116034 C python3 2860MiB |
+-------------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
|======================================================================================|
| 0 N/A N/A 116033 C python3 1828MiB |
| 0 N/A N/A 116034 C python3 2860MiB |
+-------------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
|======================================================================================|
| 0 N/A N/A 116033 C python3 1828MiB |
| 0 N/A N/A 116034 C python3 2860MiB |
+-------------------------------------------------------------------------------------+
microk8s kubectl logs -n kube-system \ $(microk8s kubectl get pod -n kube-system \ -l app.kubernetes.io/component=hami-scheduler \ -o jsonpath='{.items[0].metadata.name}') \ -c vgpu-scheduler-extender --since=2m | grep "allocate success"
microk8s kubectl logs -n kube-system \ $(microk8s kubectl get pod -n kube-system \ -l app.kubernetes.io/component=hami-scheduler \ -o jsonpath='{.items[0].metadata.name}') \ -c vgpu-scheduler-extender --since=2m | grep "allocate success"
microk8s kubectl logs -n kube-system \ $(microk8s kubectl get pod -n kube-system \ -l app.kubernetes.io/component=hami-scheduler \ -o jsonpath='{.items[0].metadata.name}') \ -c vgpu-scheduler-extender --since=2m | grep "allocate success"
# This is required — do it right after HAMi install
microk8s kubectl label node <your-node-name> gpu=on
# This is required — do it right after HAMi install
microk8s kubectl label node <your-node-name> gpu=on
# This is required — do it right after HAMi install
microk8s kubectl label node <your-node-name> gpu=on
microk8s kubectl delete pod -l app=gpu-worker
# They reschedule automatically via the Deployment
microk8s kubectl delete pod -l app=gpu-worker
# They reschedule automatically via the Deployment
microk8s kubectl delete pod -l app=gpu-worker
# They reschedule automatically via the Deployment
# Must be non-empty
microk8s kubectl exec $POD_A -- env | grep CUDA_DEVICE_MEMORY_SHARED_CACHE # Must say "success"
microk8s kubectl get pods -l app=gpu-worker -o yaml | grep "bind-phase"
# Must be non-empty
microk8s kubectl exec $POD_A -- env | grep CUDA_DEVICE_MEMORY_SHARED_CACHE # Must say "success"
microk8s kubectl get pods -l app=gpu-worker -o yaml | grep "bind-phase"
# Must be non-empty
microk8s kubectl exec $POD_A -- env | grep CUDA_DEVICE_MEMORY_SHARED_CACHE # Must say "success"
microk8s kubectl get pods -l app=gpu-worker -o yaml | grep "bind-phase"
# Two containers, same GPU, no isolation
docker run --gpus device=0 -e NVIDIA_VISIBLE_DEVICES=0 pytorch/pytorch:latest python3 -c "
import torch
# This will happily allocate ALL available VRAM
blob = torch.zeros(9_000_000_000 // 4, dtype=torch.float32, device='cuda')
print(f'Allocated: {torch.cuda.memory_allocated() // 1024**2}MB')
"
# Two containers, same GPU, no isolation
docker run --gpus device=0 -e NVIDIA_VISIBLE_DEVICES=0 pytorch/pytorch:latest python3 -c "
import torch
# This will happily allocate ALL available VRAM
blob = torch.zeros(9_000_000_000 // 4, dtype=torch.float32, device='cuda')
print(f'Allocated: {torch.cuda.memory_allocated() // 1024**2}MB')
"
# Two containers, same GPU, no isolation
docker run --gpus device=0 -e NVIDIA_VISIBLE_DEVICES=0 pytorch/pytorch:latest python3 -c "
import torch
# This will happily allocate ALL available VRAM
blob = torch.zeros(9_000_000_000 // 4, dtype=torch.float32, device='cuda')
print(f'Allocated: {torch.cuda.memory_allocated() // 1024**2}MB')
"
# This is a suggestion, not enforcement
torch.cuda.set_per_process_memory_fraction(0.5, device=0)
# This is a suggestion, not enforcement
torch.cuda.set_per_process_memory_fraction(0.5, device=0)
# This is a suggestion, not enforcement
torch.cuda.set_per_process_memory_fraction(0.5, device=0)
Container calls cudaMalloc(1GB) ↓
libvgpu.so intercepts ↓
cumulative_alloc + 1GB > pod_limit? YES → return CUDA_ERROR_OUT_OF_MEMORY (your pod, your problem) NO → pass through to real cudaMalloc
Container calls cudaMalloc(1GB) ↓
libvgpu.so intercepts ↓
cumulative_alloc + 1GB > pod_limit? YES → return CUDA_ERROR_OUT_OF_MEMORY (your pod, your problem) NO → pass through to real cudaMalloc
Container calls cudaMalloc(1GB) ↓
libvgpu.so intercepts ↓
cumulative_alloc + 1GB > pod_limit? YES → return CUDA_ERROR_OUT_OF_MEMORY (your pod, your problem) NO → pass through to real cudaMalloc
curl -s http://localhost:31992/metrics | grep -v "^#"
curl -s http://localhost:31992/metrics | grep -v "^#"
curl -s http://localhost:31992/metrics | grep -v "^#"
vGPU_device_memory_usage_in_bytes{podname="gpu-worker-a",...} 1.82884864e+09
vGPU_device_memory_usage_in_bytes{podname="gpu-worker-b",...} 2.39507968e+09
vGPU_device_memory_limit_in_bytes{podname="gpu-worker-a",...} 2.147483648e+09
vGPU_device_memory_limit_in_bytes{podname="gpu-worker-b",...} 3.221225472e+09
Device_utilization_desc_of_container{podname="gpu-worker-a",...} 12
Device_utilization_desc_of_container{podname="gpu-worker-b",...} 31
HostCoreUtilization{deviceuuid="GPU-53aae475-...",...} 14
HostGPUMemoryUsage{deviceuuid="GPU-53aae475-...",...} 5.82e+09
vGPU_device_memory_usage_in_bytes{podname="gpu-worker-a",...} 1.82884864e+09
vGPU_device_memory_usage_in_bytes{podname="gpu-worker-b",...} 2.39507968e+09
vGPU_device_memory_limit_in_bytes{podname="gpu-worker-a",...} 2.147483648e+09
vGPU_device_memory_limit_in_bytes{podname="gpu-worker-b",...} 3.221225472e+09
Device_utilization_desc_of_container{podname="gpu-worker-a",...} 12
Device_utilization_desc_of_container{podname="gpu-worker-b",...} 31
HostCoreUtilization{deviceuuid="GPU-53aae475-...",...} 14
HostGPUMemoryUsage{deviceuuid="GPU-53aae475-...",...} 5.82e+09
vGPU_device_memory_usage_in_bytes{podname="gpu-worker-a",...} 1.82884864e+09
vGPU_device_memory_usage_in_bytes{podname="gpu-worker-b",...} 2.39507968e+09
vGPU_device_memory_limit_in_bytes{podname="gpu-worker-a",...} 2.147483648e+09
vGPU_device_memory_limit_in_bytes{podname="gpu-worker-b",...} 3.221225472e+09
Device_utilization_desc_of_container{podname="gpu-worker-a",...} 12
Device_utilization_desc_of_container{podname="gpu-worker-b",...} 31
HostCoreUtilization{deviceuuid="GPU-53aae475-...",...} 14
HostGPUMemoryUsage{deviceuuid="GPU-53aae475-...",...} 5.82e+09
curl -s http://localhost:31993/metrics | grep -v "^#"
curl -s http://localhost:31993/metrics | grep -v "^#"
curl -s http://localhost:31993/metrics | grep -v "^#"
GPUDeviceSharedNum{...} 2
GPUDeviceCoreAllocated{...} 65
GPUDeviceMemoryAllocated{...} 5.36870912e+09
vGPUCoreAllocated{podname="gpu-worker-a",...} 25
vGPUCoreAllocated{podname="gpu-worker-b",...} 40
vGPUMemoryAllocated{podname="gpu-worker-a",...} 2.147483648e+09
vGPUMemoryAllocated{podname="gpu-worker-b",...} 3.221225472e+09
QuotaUsed{quotaName="nvidia.com/gpumem", quotanamespace="default",...} 5120
GPUDeviceSharedNum{...} 2
GPUDeviceCoreAllocated{...} 65
GPUDeviceMemoryAllocated{...} 5.36870912e+09
vGPUCoreAllocated{podname="gpu-worker-a",...} 25
vGPUCoreAllocated{podname="gpu-worker-b",...} 40
vGPUMemoryAllocated{podname="gpu-worker-a",...} 2.147483648e+09
vGPUMemoryAllocated{podname="gpu-worker-b",...} 3.221225472e+09
QuotaUsed{quotaName="nvidia.com/gpumem", quotanamespace="default",...} 5120
GPUDeviceSharedNum{...} 2
GPUDeviceCoreAllocated{...} 65
GPUDeviceMemoryAllocated{...} 5.36870912e+09
vGPUCoreAllocated{podname="gpu-worker-a",...} 25
vGPUCoreAllocated{podname="gpu-worker-b",...} 40
vGPUMemoryAllocated{podname="gpu-worker-a",...} 2.147483648e+09
vGPUMemoryAllocated{podname="gpu-worker-b",...} 3.221225472e+09
QuotaUsed{quotaName="nvidia.com/gpumem", quotanamespace="default",...} 5120
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata: name: hami-scheduler-metrics namespace: observability labels: release: kube-prom-stack
spec: namespaceSelector: matchNames: - kube-system selector: matchLabels: app.kubernetes.io/component: hami-scheduler app.kubernetes.io/instance: hami endpoints: - port: monitor # → pod :9395 interval: 10s path: /metrics
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata: name: hami-device-plugin-metrics namespace: observability labels: release: kube-prom-stack
spec: namespaceSelector: matchNames: - kube-system selector: matchLabels: app.kubernetes.io/component: hami-device-plugin app.kubernetes.io/instance: hami endpoints: - port: monitorport # → pod :9394 interval: 5s path: /metrics
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata: name: hami-scheduler-metrics namespace: observability labels: release: kube-prom-stack
spec: namespaceSelector: matchNames: - kube-system selector: matchLabels: app.kubernetes.io/component: hami-scheduler app.kubernetes.io/instance: hami endpoints: - port: monitor # → pod :9395 interval: 10s path: /metrics
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata: name: hami-device-plugin-metrics namespace: observability labels: release: kube-prom-stack
spec: namespaceSelector: matchNames: - kube-system selector: matchLabels: app.kubernetes.io/component: hami-device-plugin app.kubernetes.io/instance: hami endpoints: - port: monitorport # → pod :9394 interval: 5s path: /metrics
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata: name: hami-scheduler-metrics namespace: observability labels: release: kube-prom-stack
spec: namespaceSelector: matchNames: - kube-system selector: matchLabels: app.kubernetes.io/component: hami-scheduler app.kubernetes.io/instance: hami endpoints: - port: monitor # → pod :9395 interval: 10s path: /metrics
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata: name: hami-device-plugin-metrics namespace: observability labels: release: kube-prom-stack
spec: namespaceSelector: matchNames: - kube-system selector: matchLabels: app.kubernetes.io/component: hami-device-plugin app.kubernetes.io/instance: hami endpoints: - port: monitorport # → pod :9394 interval: 5s path: /metrics
gpumem-percentage: "20" → Hard. If exceeded → CUDA_ERROR_OUT_OF_MEMORY. Deterministic.
gpucores: "25" → Soft. Best-effort ±5-10%. Not a hardware guarantee.
gpumem-percentage: "20" → Hard. If exceeded → CUDA_ERROR_OUT_OF_MEMORY. Deterministic.
gpucores: "25" → Soft. Best-effort ±5-10%. Not a hardware guarantee.
gpumem-percentage: "20" → Hard. If exceeded → CUDA_ERROR_OUT_OF_MEMORY. Deterministic.
gpucores: "25" → Soft. Best-effort ±5-10%. Not a hardware guarantee.
Physical GPU (e.g. RTX 3080 — 10 GB VRAM)
├── gpu-worker-a → 20% VRAM (~2 GB) + 25% SM cores
└── gpu-worker-b → 30% VRAM (~3 GB) + 40% SM cores
Physical GPU (e.g. RTX 3080 — 10 GB VRAM)
├── gpu-worker-a → 20% VRAM (~2 GB) + 25% SM cores
└── gpu-worker-b → 30% VRAM (~3 GB) + 40% SM cores
# 1. Install cert-manager
microk8s kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
microk8s kubectl wait --for=condition=ready pod \ -l app.kubernetes.io/instance=cert-manager -n cert-manager --timeout=180s # 2. Install HAMi
microk8s helm3 repo add hami-charts https://project-hami.github.io/HAMi/
microk8s helm3 repo update
K8S_VERSION=$(microk8s kubectl version -o json | python3 -c \ "import sys,json,re; v=json.load(sys.stdin)['serverVersion']['gitVersion'].lstrip('v'); print(re.split(r'[+\-]',v)[0])")
microk8s helm3 install hami hami-charts/hami --namespace kube-system \ --set scheduler.kubeScheduler.imageTag=v${K8S_VERSION} \ --set devicePlugin.nvidiaDriverPath=/usr/local/nvidia \ --set scheduler.defaultSchedulerPolicy.gpuMemory=true \ --set scheduler.defaultSchedulerPolicy.gpuCores=true # 3. Label your GPU node
microk8s kubectl label node <your-node-name> gpu=on # 4. Deploy workers
microk8s kubectl apply -f gpu_worker_a.yaml
microk8s kubectl apply -f gpu_worker_b.yaml # 5. Watch the magic
microk8s kubectl logs -l app=gpu-worker --prefix=true -f &
watch -n 2 nvidia-smi # 6. Verify HAMi's view of the split
curl -s http://localhost:31993/metrics | grep -v "^#"
# 1. Install cert-manager
microk8s kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
microk8s kubectl wait --for=condition=ready pod \ -l app.kubernetes.io/instance=cert-manager -n cert-manager --timeout=180s # 2. Install HAMi
microk8s helm3 repo add hami-charts https://project-hami.github.io/HAMi/
microk8s helm3 repo update
K8S_VERSION=$(microk8s kubectl version -o json | python3 -c \ "import sys,json,re; v=json.load(sys.stdin)['serverVersion']['gitVersion'].lstrip('v'); print(re.split(r'[+\-]',v)[0])")
microk8s helm3 install hami hami-charts/hami --namespace kube-system \ --set scheduler.kubeScheduler.imageTag=v${K8S_VERSION} \ --set devicePlugin.nvidiaDriverPath=/usr/local/nvidia \ --set scheduler.defaultSchedulerPolicy.gpuMemory=true \ --set scheduler.defaultSchedulerPolicy.gpuCores=true # 3. Label your GPU node
microk8s kubectl label node <your-node-name> gpu=on # 4. Deploy workers
microk8s kubectl apply -f gpu_worker_a.yaml
microk8s kubectl apply -f gpu_worker_b.yaml # 5. Watch the magic
microk8s kubectl logs -l app=gpu-worker --prefix=true -f &
watch -n 2 nvidia-smi # 6. Verify HAMi's view of the split
curl -s http://localhost:31993/metrics | grep -v "^#"
# 1. Install cert-manager
microk8s kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
microk8s kubectl wait --for=condition=ready pod \ -l app.kubernetes.io/instance=cert-manager -n cert-manager --timeout=180s # 2. Install HAMi
microk8s helm3 repo add hami-charts https://project-hami.github.io/HAMi/
microk8s helm3 repo update
K8S_VERSION=$(microk8s kubectl version -o json | python3 -c \ "import sys,json,re; v=json.load(sys.stdin)['serverVersion']['gitVersion'].lstrip('v'); print(re.split(r'[+\-]',v)[0])")
microk8s helm3 install hami hami-charts/hami --namespace kube-system \ --set scheduler.kubeScheduler.imageTag=v${K8S_VERSION} \ --set devicePlugin.nvidiaDriverPath=/usr/local/nvidia \ --set scheduler.defaultSchedulerPolicy.gpuMemory=true \ --set scheduler.defaultSchedulerPolicy.gpuCores=true # 3. Label your GPU node
microk8s kubectl label node <your-node-name> gpu=on # 4. Deploy workers
microk8s kubectl apply -f gpu_worker_a.yaml
microk8s kubectl apply -f gpu_worker_b.yaml # 5. Watch the magic
microk8s kubectl logs -l app=gpu-worker --prefix=true -f &
watch -n 2 nvidia-smi # 6. Verify HAMi's view of the split
curl -s http://localhost:31993/metrics | grep -v "^#" - NVIDIA Time-Slicing — Easy to set up via the GPU Operator and it looks good on paper. In practice, for streaming and transcoding workloads it was a non-starter. Time-slicing serialises GPU access, which introduces jitter and latency spikes — exactly what you cannot have when you're processing live video or audio. Frames drop, buffers stall, quality degrades. We turned it off fast.
- Plain Docker with --gpus device=0 — Which I'll get into. We actually used this for a long time, and it worked — sort of. - No VRAM isolation — Docker containers on the same GPU shared memory completely. One greedy process could OOM the rest, and when it happened everything fell over at once.
- GPU workloads living outside K8s — a whole class of tasks with no K8s lifecycle management, no health checks, no rolling restarts. A permanent special case that needed permanent special handling.
- Node affinity as a constraint, not a choice — the orchestrator had to be pinned to the GPU node to reach the Docker daemon. Scaling to multiple GPU nodes meant more orchestrators, more complexity, more things to coordinate.
- No per-container GPU metrics — visibility into who was using what meant scraping nvidia-smi and correlating PIDs manually. Fragile and tedious. - Unlike MIG, it doesn't require specific hardware
- Unlike time-slicing, it enforces VRAM isolation (not just temporal sharing)
- Unlike MPS, a failing pod doesn't crash the shared context
- Unlike plain Docker, it's K8s-native and actually enforces limits - VRAM — hard cap; pod is OOM-killed if it exceeds its allocation
- GPU cores — soft cap via kernel submission throttling (±5–10% deviation is normal) - Ubuntu 22.04 / 24.04
- NVIDIA driver installed on host (nvidia-smi works)
- MicroK8s installed (snap install microk8s --classic) - sanity_check.yaml — Verify GPU access before installing HAMi
- gpu_worker_a.yaml — Worker A deployment (20% VRAM, 25% cores)
- gpu_worker_b.yaml — Worker B deployment (30% VRAM, 40% cores)
- hami_service_monitoring.yaml — Prometheus ServiceMonitors for both HAMi endpoints
- grafana_dashboard.yaml — Auto-importing Grafana dashboard via ConfigMap