docker run -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
docker run -d \
  -v ollama:/root/.ollama \
  -p 11434:11434 \
  --name ollama \
  ollama/ollama
docker run -d \
  -v ollama:/root/.ollama \
  -p 11434:11434 \
  --gpus all \
  --name ollama \
  ollama/ollama
curl http://localhost:11434
# Should return: Ollama is running
docker exec -it ollama ollama pull llama3.2:3b
docker exec -it ollama ollama pull llama3.1:8b
docker exec -it ollama ollama run llama3.2:3b "What is self-hosting?"
docker run -d \
  -p 3000:8080 \
  --add-host=host.docker.internal:host-gateway \
  -v open-webui:/app/backend/data \
  --name open-webui \
  --restart always \
  ghcr.io/open-webui/open-webui:main
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    ports:
      - "3000:8080"
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
    volumes:
      - open-webui_data:/app/backend/data
    depends_on:
      - ollama
    restart: unless-stopped

volumes:
  ollama_data:
  open-webui_data:
docker compose up -d
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2:3b",
  "prompt": "Explain Docker in one sentence",
  "stream": false
}'
curl http://localhost:11434/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama3.2:3b",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'

- System RAM: 16GB minimum, 32GB recommended. RAM handles model loading and overflow when VRAM runs short.
- Storage: NVMe SSD essential. A quantized Llama 3 70B model is 35GB+. Spinning disks make load times painful.
- CPU: Modern processors with AVX-512 can run inference without a GPU, but expect single-digit tokens per second. Fine for testing, impractical for production.

- General chat and reasoning: Llama 3.3 70B and Qwen 2.5 72B lead the pack. Both handle long contexts well and produce clean, controllable output.
- Coding: DeepSeek Coder V2 and Qwen2.5-Coder specialize here. Strong HumanEval scores and they catch bugs without excessive prompting.
- Multilingual: Qwen models support 29+ languages out of the box, useful for global teams.
- Resource-constrained deployment: Gemma 2 (9B) and Mistral Small 3 punch above their weight on modest hardware.

- Zero data retention. Your prompts and outputs never touch external servers. The entire inference pipeline runs within your environment.
- Swiss jurisdiction. For teams with strict compliance requirements (GDPR, HIPAA, financial regulations), Prem's legal structure adds an extra layer of data protection.
- Built-in fine-tuning. Need to customize models on proprietary data? Prem's fine-tuning pipeline is integrated directly, no separate tooling required.
- Optimized inference. You get production-grade serving without manually configuring vLLM, managing CUDA drivers, or debugging memory issues.

- Experimenting? Start with Ollama or LM Studio. Get a model running in minutes.
- Building production systems? Migrate to vLLM for throughput or LocalAI for multimodal needs.
- Enterprise with compliance requirements? Skip the DIY phase entirely, Prem AI gives you self-hosting benefits with managed infrastructure.

- Docker Desktop installed (download here)
- 8GB+ RAM for smaller models (16GB+ recommended)
- GPU optional but significantly improves speed

- A local LLM running entirely on your hardware
- A web interface at localhost:3000
- An OpenAI-compatible API at localhost:11434
- Zero data leaving your network

- Try larger models: ollama pull qwen2.5:14b or ollama pull deepseek-coder:6.7b for coding tasks
- Connect to applications: Point any OpenAI-compatible tool at http://localhost:11434
- Add RAG: Use PrivateGPT or AnythingLLM to chat with your documents

- Multiple applications sharing the same infrastructure
- Fine-tuned models (API fine-tuning costs add up quickly)
- Unpredictable usage spikes that would blow API budgets

- Route simple queries (classification, extraction, FAQ responses) to a small self-hosted model (7B–13B)
- Reserve API calls for complex reasoning tasks that genuinely need larger models
- Use self-hosted for sensitive data and APIs for non-sensitive workloads

- Volume: Are you processing 2M+ tokens daily? → Self-hosting becomes cost-competitive
- Compliance: Does your data require on-premises processing? → Self-hosting may be mandatory
- Customization: Do you need fine-tuning or model modifications? → Self-hosting gives full control
- Team: Do you have MLOps capacity to maintain infrastructure? → Required for DIY self-hosting
- Latency: Is sub-100ms response time critical? → Self-hosting eliminates network delays