```yaml
services:
  tabby:
    image: tabbyml/tabby:v0.32.0
    container_name: tabby
    ports:
      - "8080:8080"
    volumes:
      - tabby_data:/data
    command: serve --model StarCoder-1B --device cuda
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

volumes:
  tabby_data:
```
```json
{
  "models": [
    {
      "title": "Ollama - DeepSeek Coder",
      "provider": "ollama",
      "model": "deepseek-coder-v2:16b"
    }
  ],
  "tabAutocompleteModel": {
    "title": "Ollama - StarCoder",
    "provider": "ollama",
    "model": "starcoder2:3b"
  }
}
```

- Install the Continue extension from the marketplace
- Configure `~/.continue/config.json` to point at your LLM backend:

- Requires a GPU for reasonable performance (NVIDIA recommended)
- StarCoder-1B: ~2 GB VRAM, fast completions
- StarCoder-7B: ~8 GB VRAM, better quality
- CPU mode works but completions are slow (2-5 seconds)
- Server RAM: ~1-2 GB + model size

- Resource usage depends entirely on your LLM backend
- With Ollama: Same as Ollama's resource usage
- With a cloud provider (OpenAI, Anthropic): Zero local compute
- Extension itself uses minimal IDE resources

- You want a centralized code AI server for your team
- You need repository indexing for context-aware completions
- You want usage analytics and admin controls
- You want an all-in-one solution (model serving + IDE integration)
- You need SSO/LDAP integration for enterprise deployment
- You want a dedicated GPU box serving code completions to multiple developers

- You want maximum flexibility in choosing LLM backends
- You already have Ollama, LM Studio, or another LLM server running
- You want to use different models for different tasks (chat vs autocomplete)
- You don't want to manage a separate server
- You want MCP (Model Context Protocol) integration
- You want to use both local and cloud models (e.g., Ollama for autocomplete, Claude for chat)
- You're a solo developer, not managing a team

- How to Self-Host Tabby
- How to Self-Host Ollama
- Ollama vs LocalAI
- Ollama vs vLLM
- Self-Hosted GitHub Copilot Alternatives
- Best Self-Hosted AI Tools
- Docker Compose Basics