# Install Docker
curl -fsSL https://get.docker.com | sh

# Run Ollama
docker run -d --name ollama -p 11434:11434 \
  -v ollama_data:/root/.ollama \
  ollama/ollama:latest

# Pull a model
docker exec -it ollama ollama pull llama3.2:3b
# Install Docker
curl -fsSL https://get.docker.com | sh

# Run Ollama
docker run -d --name ollama -p 11434:11434 \
  -v ollama_data:/root/.ollama \
  ollama/ollama:latest

# Pull a model
docker exec -it ollama ollama pull llama3.2:3b
# Install Docker
curl -fsSL https://get.docker.com | sh

# Run Ollama
docker run -d --name ollama -p 11434:11434 \
  -v ollama_data:/root/.ollama \
  ollama/ollama:latest

# Pull a model
docker exec -it ollama ollama pull llama3.2:3b
ssh root@your-server-ip
ssh root@your-server-ip
ssh root@your-server-ip
# Update system
apt update && apt upgrade -y

# Install NVIDIA driver dependencies
apt install -y build-essential linux-headers-$(uname -r)

# Install NVIDIA drivers (Ubuntu 22.04/24.04)
apt install -y nvidia-driver-550

# Reboot
reboot
# Update system
apt update && apt upgrade -y

# Install NVIDIA driver dependencies
apt install -y build-essential linux-headers-$(uname -r)

# Install NVIDIA drivers (Ubuntu 22.04/24.04)
apt install -y nvidia-driver-550

# Reboot
reboot
# Update system
apt update && apt upgrade -y

# Install NVIDIA driver dependencies
apt install -y build-essential linux-headers-$(uname -r)

# Install NVIDIA drivers (Ubuntu 22.04/24.04)
apt install -y nvidia-driver-550

# Reboot
reboot
# Install Docker
curl -fsSL https://get.docker.com | sh

# Install NVIDIA Container Toolkit
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
  gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
  tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt update && apt install -y nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
systemctl restart docker
# Install Docker
curl -fsSL https://get.docker.com | sh

# Install NVIDIA Container Toolkit
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
  gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
  tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt update && apt install -y nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
systemctl restart docker
# Install Docker
curl -fsSL https://get.docker.com | sh

# Install NVIDIA Container Toolkit
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
  gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
  tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt update && apt install -y nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
systemctl restart docker
docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_NUM_PARALLEL=4
      - OLLAMA_MAX_LOADED_MODELS=2
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    restart: unless-stopped
    ports:
      - "3000:8080"
    volumes:
      - open_webui_data:/app/backend/data
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - WEBUI_AUTH=true
      - WEBUI_SECRET_KEY=change-this-to-a-random-string
    depends_on:
      - ollama

volumes:
  ollama_data:
  open_webui_data:
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_NUM_PARALLEL=4
      - OLLAMA_MAX_LOADED_MODELS=2
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    restart: unless-stopped
    ports:
      - "3000:8080"
    volumes:
      - open_webui_data:/app/backend/data
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - WEBUI_AUTH=true
      - WEBUI_SECRET_KEY=change-this-to-a-random-string
    depends_on:
      - ollama

volumes:
  ollama_data:
  open_webui_data:
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_NUM_PARALLEL=4
      - OLLAMA_MAX_LOADED_MODELS=2
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    restart: unless-stopped
    ports:
      - "3000:8080"
    volumes:
      - open_webui_data:/app/backend/data
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - WEBUI_AUTH=true
      - WEBUI_SECRET_KEY=change-this-to-a-random-string
    depends_on:
      - ollama

volumes:
  ollama_data:
  open_webui_data:
docker compose up -d
docker compose up -d
docker compose up -d
# 14B model — fits easily, fast inference
docker exec -it ollama ollama pull phi-4:14b

# 32B model (Q4) — fits in ~18 GB, good quality
docker exec -it ollama ollama pull qwen2.5:32b-instruct-q4_K_M

# Coding-specific model
docker exec -it ollama ollama pull qwen2.5-coder:14b
# 14B model — fits easily, fast inference
docker exec -it ollama ollama pull phi-4:14b

# 32B model (Q4) — fits in ~18 GB, good quality
docker exec -it ollama ollama pull qwen2.5:32b-instruct-q4_K_M

# Coding-specific model
docker exec -it ollama ollama pull qwen2.5-coder:14b
# 14B model — fits easily, fast inference
docker exec -it ollama ollama pull phi-4:14b

# 32B model (Q4) — fits in ~18 GB, good quality
docker exec -it ollama ollama pull qwen2.5:32b-instruct-q4_K_M

# Coding-specific model
docker exec -it ollama ollama pull qwen2.5-coder:14b
apt install -y caddy
apt install -y caddy
apt install -y caddy
ai.yourdomain.com {
    reverse_proxy localhost:3000
}
ai.yourdomain.com {
    reverse_proxy localhost:3000
}
ai.yourdomain.com {
    reverse_proxy localhost:3000
}
systemctl reload caddy
systemctl reload caddy
systemctl reload caddy
# 1. Sign up at hetzner.com and create a cloud project

# 2. Create a CX23 instance (€3.99/mo) via the console
#    - Choose Ubuntu 24.04
#    - Add your SSH key
#    - Pick Falkenstein or Helsinki

# 3. SSH into your server
ssh root@your-server-ip

# 4. Install Docker
curl -fsSL https://get.docker.com | sh

# 5. Run Ollama
docker run -d --name ollama -p 11434:11434 \
  -v ollama_data:/root/.ollama \
  ollama/ollama:latest

# 6. Pull a small model
docker exec -it ollama ollama pull llama3.2:3b

# 7. Test it
curl http://localhost:11434/api/generate \
  -d '{"model": "llama3.2:3b", "prompt": "Hello, how are you?"}'
# 1. Sign up at hetzner.com and create a cloud project

# 2. Create a CX23 instance (€3.99/mo) via the console
#    - Choose Ubuntu 24.04
#    - Add your SSH key
#    - Pick Falkenstein or Helsinki

# 3. SSH into your server
ssh root@your-server-ip

# 4. Install Docker
curl -fsSL https://get.docker.com | sh

# 5. Run Ollama
docker run -d --name ollama -p 11434:11434 \
  -v ollama_data:/root/.ollama \
  ollama/ollama:latest

# 6. Pull a small model
docker exec -it ollama ollama pull llama3.2:3b

# 7. Test it
curl http://localhost:11434/api/generate \
  -d '{"model": "llama3.2:3b", "prompt": "Hello, how are you?"}'
# 1. Sign up at hetzner.com and create a cloud project

# 2. Create a CX23 instance (€3.99/mo) via the console
#    - Choose Ubuntu 24.04
#    - Add your SSH key
#    - Pick Falkenstein or Helsinki

# 3. SSH into your server
ssh root@your-server-ip

# 4. Install Docker
curl -fsSL https://get.docker.com | sh

# 5. Run Ollama
docker run -d --name ollama -p 11434:11434 \
  -v ollama_data:/root/.ollama \
  ollama/ollama:latest

# 6. Pull a small model
docker exec -it ollama ollama pull llama3.2:3b

# 7. Test it
curl http://localhost:11434/api/generate \
  -d '{"model": "llama3.2:3b", "prompt": "Hello, how are you?"}'

- Flat monthly pricing. No surprise bandwidth bills, no hidden egress charges. Traffic is unlimited on most plans.
- EU data centers. Falkenstein and Helsinki give you GDPR compliance by default.
- Straightforward networking. Private networks, floating IPs, and load balancers at prices that make sense.
- ARM instances. Ampere-based CAX servers offer strong performance-per-euro for inference workloads.

- No managed AI/ML services. No SageMaker equivalent, no managed Jupyter, no model registries. You manage everything yourself.
- No spot/preemptible instances. You cannot get cheap burst GPU time. It is flat monthly pricing or nothing.
- Limited GPU availability. Dedicated GPU servers can have waitlists. AWS and GCP have broader GPU SKU availability.
- No US data centers. If you need sub-50ms latency for US users, Hetzner is not the right choice.

- Llama 3.2 3B (Q4) — Fits in ~2-3 GB. General chat and simple tasks.
- Phi-3.5 Mini 3.8B (Q4) — Microsoft's efficient model. Good for code and reasoning.
- TinyLlama 1.1B — Fast even on CPU. Useful for classification and simple generation.

- Llama 3.2 8B (Q4) — Solid general model. ~5 GB loaded.
- Gemma 2 2B — Google's efficient model. Punches above its weight.
- Qwen 2.5 7B (Q4) — Excellent for multilingual use cases.

- AWS SageMaker / GCP Vertex AI — Managed model training, deployment, and monitoring. If you need MLOps at scale, Hetzner's bare metal cannot compete.
- Spot/preemptible instances — AWS spot pricing can bring GPU costs down 60-70% for interruptible workloads. Hetzner has no equivalent.
- Global regions — AWS has 30+ regions worldwide. Hetzner has 3 European locations.
- Auto-scaling — Cloud providers scale GPU instances based on demand. Hetzner dedicated servers are fixed capacity.

- Coolify — More mature, better for multi-service deployments, built-in database management.
- Dokploy — Simpler, lighter footprint, good if Ollama is your primary workload.

- Need frontier intelligence (complex reasoning, creative work)? → Use API services.
- Need high-volume, predictable inference? → Self-host on Hetzner GPU.
- Need lightweight, always-on AI? → CX/CAX instance with small models.
- Need managed MLOps at scale? → Use AWS/GCP (we do not, but many teams should).

- You are experimenting with self-hosted AI for the first time
- You need a personal chatbot or simple RAG pipeline
- Your queries are infrequent and latency is not critical
- Budget is the primary constraint

- You need 7-8B models with slightly better response times
- You are running the AI alongside other services (Git, CI, monitoring)
- Multiple people on your team need occasional access

- You need interactive-speed inference (30+ tokens/second)
- You want to run 14B-32B models with real quality
- Multiple users need concurrent access
- You are building products or services that rely on AI inference
- Fine-tuning smaller models is part of your workflow

- You need 70B+ models at full precision
- Multi-user production inference is a requirement
- You are fine-tuning large models regularly
- You need 96 GB VRAM for large embedding databases or multi-model serving