# Model resource: tells the LLMKube operator which GGUF artifact to pull
# and what hardware to schedule it on.
---
apiVersion: inference.llmkube.dev/v1alpha1
kind: Model
metadata:
  name: llama-3-8b
spec:
  source: https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
  format: gguf
  quantization: Q4_K_M
  hardware:
    accelerator: cuda
    gpu:
      count: 1
---
# InferenceService resource: serves the Model referenced by modelRef.
apiVersion: inference.llmkube.dev/v1alpha1
kind: InferenceService
metadata:
  name: llama-3-8b
spec:
  modelRef: llama-3-8b
  replicas: 1
  resources:
    # Quoted so the parser keeps these as strings, not numbers.
    cpu: "2"
    memory: "4Gi"
# Model resource (duplicate of the example above). A `---` separator is
# required here: the previous document ends on the preceding line.
---
apiVersion: inference.llmkube.dev/v1alpha1
kind: Model
metadata:
  name: llama-3-8b
spec:
  source: https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
  format: gguf
  quantization: Q4_K_M
  hardware:
    accelerator: cuda
    gpu:
      count: 1
---
# InferenceService resource: serves the Model referenced by modelRef.
apiVersion: inference.llmkube.dev/v1alpha1
kind: InferenceService
metadata:
  name: llama-3-8b
spec:
  modelRef: llama-3-8b
  replicas: 1
  resources:
    # Quoted so the parser keeps these as strings, not numbers.
    cpu: "2"
    memory: "4Gi"
# Model resource (duplicate of the example above). A `---` separator is
# required here: the previous document ends on the preceding line.
---
apiVersion: inference.llmkube.dev/v1alpha1
kind: Model
metadata:
  name: llama-3-8b
spec:
  source: https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
  format: gguf
  quantization: Q4_K_M
  hardware:
    accelerator: cuda
    gpu:
      count: 1
---
# InferenceService resource: serves the Model referenced by modelRef.
apiVersion: inference.llmkube.dev/v1alpha1
kind: InferenceService
metadata:
  name: llama-3-8b
spec:
  modelRef: llama-3-8b
  replicas: 1
  resources:
    # Quoted so the parser keeps these as strings, not numbers.
    cpu: "2"
    memory: "4Gi"
# On your Mac
brew install llama.cpp
llmkube-metal-agent --host-ip 192.168.1.x

# From anywhere in the cluster
llmkube deploy qwen-30b-a3b --accelerator metal
# On your Mac
brew install llama.cpp
llmkube-metal-agent --host-ip 192.168.1.x

# From anywhere in the cluster
llmkube deploy qwen-30b-a3b --accelerator metal
# On your Mac
brew install llama.cpp
llmkube-metal-agent --host-ip 192.168.1.x

# From anywhere in the cluster
llmkube deploy qwen-30b-a3b --accelerator metal
# Multi-GPU fragment: layer-based sharding across 2 CUDA GPUs.
# NOTE(review): nesting of `sharding` under `hardware` is inferred from the
# flattened original — confirm against the Model CRD schema.
spec:
  hardware:
    accelerator: cuda
    gpu:
      count: 2
    sharding:
      strategy: layer
# Multi-GPU fragment: layer-based sharding across 2 CUDA GPUs.
# NOTE(review): nesting of `sharding` under `hardware` is inferred from the
# flattened original — confirm against the Model CRD schema.
spec:
  hardware:
    accelerator: cuda
    gpu:
      count: 2
    sharding:
      strategy: layer
# Multi-GPU fragment: layer-based sharding across 2 CUDA GPUs.
# NOTE(review): nesting of `sharding` under `hardware` is inferred from the
# flattened original — confirm against the Model CRD schema.
spec:
  hardware:
    accelerator: cuda
    gpu:
      count: 2
    sharding:
      strategy: layer
# Install the CLI
brew install defilantech/tap/llmkube

# Add the Helm repo and install the operator
helm repo add llmkube https://defilantech.github.io/LLMKube
helm install llmkube llmkube/llmkube \
  --namespace llmkube-system \
  --create-namespace
# Install the CLI
brew install defilantech/tap/llmkube

# Add the Helm repo and install the operator
helm repo add llmkube https://defilantech.github.io/LLMKube
helm install llmkube llmkube/llmkube \
  --namespace llmkube-system \
  --create-namespace
# Install the CLI
brew install defilantech/tap/llmkube

# Add the Helm repo and install the operator
helm repo add llmkube https://defilantech.github.io/LLMKube
helm install llmkube llmkube/llmkube \
  --namespace llmkube-system \
  --create-namespace
# One-command deployment via the CLI — presumably this creates the Model and
# InferenceService resources shown above; confirm with `kubectl get models`.
# Deploy Phi-4 Mini (3.8B params, from the built-in catalog)
llmkube deploy phi-4-mini
# Deploy Phi-4 Mini (3.8B params, from the built-in catalog)
llmkube deploy phi-4-mini
# Deploy Phi-4 Mini (3.8B params, from the built-in catalog)
llmkube deploy phi-4-mini
# Port-forward and test
kubectl port-forward svc/phi-4-mini 8080:8080 &
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is Kubernetes in one sentence?"}
    ],
    "max_tokens": 100
  }'
# Port-forward and test
kubectl port-forward svc/phi-4-mini 8080:8080 &
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is Kubernetes in one sentence?"}
    ],
    "max_tokens": 100
  }'
# Port-forward and test
kubectl port-forward svc/phi-4-mini 8080:8080 &
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is Kubernetes in one sentence?"}
    ],
    "max_tokens": 100
  }'
# Use the OpenAI Python SDK against the port-forwarded service. The server
# does not check credentials, so any placeholder api_key works.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="not-needed",
)
response = client.chat.completions.create(
    model="phi-4-mini",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
# Use the OpenAI Python SDK against the port-forwarded service. The server
# does not check credentials, so any placeholder api_key works.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="not-needed",
)
response = client.chat.completions.create(
    model="phi-4-mini",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
# Use the OpenAI Python SDK against the port-forwarded service. The server
# does not check credentials, so any placeholder api_key works.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="not-needed",
)
response = client.chat.completions.create(
    model="phi-4-mini",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
# Deploy Llama 3.1 8B with GPU acceleration on a single GPU
llmkube deploy llama-3.1-8b --gpu --gpu-count 1
llmkube deploy llama-3.1-8b --gpu --gpu-count 1
llmkube deploy llama-3.1-8b --gpu --gpu-count 1
# Fragment: load the model from a PersistentVolumeClaim instead of a URL,
# with a checksum for integrity verification.
spec:
  source: pvc://model-storage/models/llama-3-8b-q4.gguf
  sha256: a1b2c3d4e5f6...  # placeholder — use the real digest
# Fragment: load the model from a PersistentVolumeClaim instead of a URL,
# with a checksum for integrity verification.
spec:
  source: pvc://model-storage/models/llama-3-8b-q4.gguf
  sha256: a1b2c3d4e5f6...  # placeholder — use the real digest
# Fragment: load the model from a PersistentVolumeClaim instead of a URL,
# with a checksum for integrity verification.
spec:
  source: pvc://model-storage/models/llama-3-8b-q4.gguf
  sha256: a1b2c3d4e5f6...  # placeholder — use the real digest

- Run llama.cpp on Kubernetes with proper lifecycle management
- Deploy models with a single command or a two-resource YAML
- Use NVIDIA GPUs with CUDA acceleration
- Use Apple Silicon Macs as GPU inference nodes in your cluster
- Split models across multiple GPUs for larger models
- Monitor everything with Prometheus and Grafana

- AMD Ryzen 9 7900X (12 cores / 24 threads)
- 64GB DDR5-6000
- 2x NVIDIA RTX 5060 Ti (16GB VRAM each, 32GB total)
- Samsung 990 Pro 1TB NVMe
- Running MicroK8s as a single-node Kubernetes cluster

- Watches the Kubernetes API for InferenceService resources with accelerator: metal
- Spawns llama-server natively on macOS with full Metal GPU access
- Registers the endpoint back into Kubernetes so other services can route to it

- A Kubernetes cluster (Minikube, kind, K3s, or any managed cluster)
- kubectl configured

- Edge deployment support for lightweight Kubernetes distributions like K3s and MicroK8s
- AMD GPU support (ROCm) with a community contributor already testing on Framework hardware with a Ryzen AI Max+ 395
- llmkube chat for testing models directly from the CLI without needing curl