# Install Ollama (macOS, via Homebrew)
brew install ollama

# Install Ollama (Linux, via the official install script)
# curl -fsSL https://ollama.com/install.sh | sh
# Pull a model (one-time download)
ollama pull llama3.2:8b

# Run it interactively
ollama run llama3.2:8b
# Start the Ollama server (listens on localhost:11434 by default)
ollama serve
# Verify the server is running by listing installed models
curl http://localhost:11434/api/tags
# Run Open WebUI in Docker, pointed at the host's Ollama server
docker run -d \
  -p 3000:8080 \
  --add-host=host.docker.internal:host-gateway \
  -v open-webui:/app/backend/data \
  --name open-webui \
  --restart always \
  ghcr.io/open-webui/open-webui:main
# Create a working directory for the stack
mkdir -p ~/ollama-stack && cd ~/ollama-stack
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_NUM_PARALLEL=2
      - OLLAMA_MAX_LOADED_MODELS=1
      - OLLAMA_KEEP_ALIVE=10m
    # Remove the deploy section if your VPS has no GPU
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    restart: unless-stopped
    ports:
      - "3000:8080"
    volumes:
      - open_webui_data:/app/backend/data
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - WEBUI_AUTH=true
      - WEBUI_SECRET_KEY=change-this-to-a-random-string
    depends_on:
      - ollama

volumes:
  ollama_data:
  open_webui_data:
# Start both containers in the background
docker compose up -d
# For CX23 (4 GB RAM) — use a 3B model
docker exec -it ollama ollama pull llama3.2:3b

# For CX33 (8 GB RAM) — you can try a 7B model
docker exec -it ollama ollama pull llama3.2:8b
# Install Caddy
sudo apt install -y caddy
chat.yourdomain.com {
	reverse_proxy localhost:3000
}
# Apply the new Caddyfile
sudo systemctl reload caddy
# 1. Install Ollama
brew install ollama                              # macOS
# curl -fsSL https://ollama.com/install.sh | sh  # Linux

# 2. Pull a model
ollama pull llama3.2:8b

# 3. Start the server
ollama serve

# 4. Run Open WebUI
docker run -d -p 3000:8080 \
  --add-host=host.docker.internal:host-gateway \
  -v open-webui:/app/backend/data \
  --name open-webui \
  ghcr.io/open-webui/open-webui:main

# 5. Open http://localhost:3000
# 1. SSH into your server
ssh root@your-server-ip

# 2. Install Docker
curl -fsSL https://get.docker.com | sh

# 3. Create docker-compose.yml (see VPS section above)

# 4. Start the stack
docker compose up -d

# 5. Pull a model
docker exec -it ollama ollama pull llama3.2:3b

- 52 million monthly downloads (Q1 2026) — 520x growth from Q1 2023
- 135,000+ GGUF models available on HuggingFace
- Runs on macOS (Apple Silicon), Linux, and Windows
- Exposes an OpenAI-compatible API, so existing code that talks to OpenAI can point at Ollama instead
- Default limit of ~4 parallel requests — designed for personal/small team use, not production multi-user deployments

- Chat interface with conversation history and search
- Model switching (swap between models mid-conversation)
- Document upload and RAG (retrieval-augmented generation)
- Multi-user support with role-based access
- Prompt templates and system message customization
- Mobile-friendly responsive design
- Local file and image analysis

- CX23: 2 vCPU, 4 GB RAM, 40 GB SSD — €3.99/month (~$4.99/month)
- CX33: 4 vCPU, 8 GB RAM, 80 GB SSD — €6.49/month (~$8.09/month)

- EU data centers (Falkenstein, Helsinki) for GDPR compliance
- Flat monthly pricing with no bandwidth surprises

- Qwen 2.5 Coder 7B — Best balance of code quality and resource usage. Handles Python, JavaScript, TypeScript, Go, and Rust well. If you are comparing self-hosted coding models against paid alternatives, see our AI coding tools pricing breakdown for the full cost picture.
- DeepSeek Coder V2 (distilled) — Strong at multi-file reasoning and debugging. Needs more RAM.
- Llama 3.2 8B — Decent general coding, but specialized coding models outperform it.

- Llama 3.3 70B — Best open-source general model if you have the hardware (40+ GB RAM).
- Qwen 2.5 32B — Excellent writing quality, 83.2% MMLU score. Needs 16-20 GB.
- Gemma 2 9B — Surprisingly good writing quality for its size. Runs on 6 GB.

- Llama 3.2 3B — Solid general capability in a tiny package. Best first model to try.
- Qwen 3.5 7B — 76.8% MMLU, 3x faster than the 32B variant. Great quality-per-watt.
- Phi-4 14B — Microsoft's efficient model. Good for development workflows if you have 10 GB.

- Qwen 2.5 series — Supports 29+ languages natively. Best option for non-English work.

- Personal assistant for writing, brainstorming, and summarization — A 7-8B model handles these tasks surprisingly well.
- Code completion and review — Specialized coding models match or beat older GPT-3.5-level performance.
- Private document analysis — Upload sensitive documents and query them without data leaving your network.
- Learning and experimentation — Try different models, fine-tune for specific tasks, understand how LLMs work.

- Complex multi-step reasoning — Smaller models struggle with tasks that GPT-5 or Claude Opus handle easily.
- Multi-user production deployments — Ollama caps at ~4 parallel requests by default. It is designed for personal or small-team use.
- Speed-critical applications — CPU inference on a VPS delivers single-digit tokens per second. GPU inference on consumer hardware delivers 30-50 tok/s. API services deliver hundreds of tokens per second.
- Long context windows — Most open models cap at 8K-32K tokens. Frontier API models offer 128K-200K+ tokens.

- Ollama: ~41 tokens/second (single user, GPU)
- llama.cpp (CPU): ~80 tokens/second (optimized build, good CPU)
- vLLM: ~793 tokens/second (production deployment, A100 GPU)

- You need frontier intelligence. GPT-5, Claude Opus 4, and Gemini 2.5 Pro are significantly more capable than any model you can self-host. For complex reasoning, creative work, or tasks where quality is non-negotiable, APIs win.
- Your volume is low. Below ~2 million tokens per day, API services are almost always cheaper than self-hosting once you factor in infrastructure, maintenance, and your time.
- You need high concurrency. If multiple people need simultaneous access with low latency, API services handle this natively. Self-hosted Ollama does not.
- Uptime matters. API providers offer 99.9%+ uptime with automatic failover. Your Hetzner VPS does not.

- Draft generation — First drafts of articles and documentation where privacy is not critical but cost adds up at volume.
- Code review assistance — Quick code reviews and refactoring suggestions where a 7B coding model is sufficient.
- Local RAG — Querying internal documents without sending proprietary content to external APIs.