# Verify current driver
nvidia-smi

# If missing/outdated, install the latest 550-series driver
sudo apt update
sudo apt install -y nvidia-driver-550
sudo reboot
# Install Docker base packages
sudo apt-get install -y ca-certificates curl gnupg lsb-release
sudo mkdir -p /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
  https://download.docker.com/linux/ubuntu \
  $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io \
  docker-buildx-plugin docker-compose-plugin

# Add NVIDIA Container Toolkit
# NOTE(review): apt-key is deprecated on Ubuntu 22.04, and nvidia-docker2 is the
# legacy package; NVIDIA's current docs install 'nvidia-container-toolkit' via a
# signed keyring file. Confirm before shipping this guide.
distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update
sudo apt-get install -y nvidia-docker2
sudo systemctl restart docker

# Test: the container should print the same GPU table as the host
sudo docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
# Install
curl -fsSL https://ollama.com/install.sh | sh

# Start the service (background)
ollama serve &

# OR enable as a systemd service
sudo systemctl enable ollama --now
ollama pull qwen3.5:9b  # pulls Q4_K_M by default
# To choose a specific quantization — Ollama tags use a single colon
# (model:tag), so the quantization goes into the tag itself:
# ollama pull qwen3.5:9b-q5_k_m
# Smoke test: ask the model to introduce itself in one sentence
ollama run qwen3.5:9b "請用一句話介紹自己"
# 'ollama serve' has no --host flag; the listen address is configured via the
# OLLAMA_HOST environment variable. Bind to all interfaces on the default port:
OLLAMA_HOST=0.0.0.0:11434 ollama serve &
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04

RUN apt-get update && apt-get install -y \
    ca-certificates curl \
    && rm -rf /var/lib/apt/lists/*

RUN curl -fsSL https://ollama.com/install.sh | sh

# NOTE(review): 'ollama pull' talks to a running daemon; at build time this
# likely needs 'ollama serve &' started in the same RUN step — confirm, or
# pull the model at container start instead.
RUN ollama pull qwen3.5:9b

EXPOSE 11434

CMD ["ollama", "serve"]
docker build -t local-agent-ollama .
docker run --rm --gpus all -p 11434:11434 local-agent-ollama
# Allocate a 16 GB swap file and make it persistent across reboots
sudo fallocate -l 16G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
# Single-user tuning: keep only one model resident and serve one request at a time
OLLAMA_MAX_LOADED_MODELS=1 OLLAMA_NUM_PARALLEL=1 ollama serve &
#!/bin/bash
# Append a timestamped GPU utilization / memory / temperature sample every 30 s.
while true; do
  echo "$(date) $(nvidia-smi --query-gpu=utilization.gpu,utilization.memory,temperature.gpu --format=csv,noheader,nounits)" >> ~/gpu_monitor.log
  sleep 30
done &
# Quick latency check (includes model load time on a cold start)
time ollama run qwen3.5:9b "你好"
time ollama run qwen3.5:9b "你好"

- Model Weights – varies by quantization: Q4_K_M ~6.6 GB, Q5_K_M ~7.8 GB, FP16 ~18 GB.
- KV Cache – grows linearly with sequence length. For a 9B model with 48 attention heads and hidden size 4096, a single token needs roughly 0.5 KB. A 2048‑token context ≈ 1 GB; 4096‑token ≈ 2 GB.
- Workspace – activation values, temporary buffers, and framework overhead (Ollama, llama.cpp, etc.) typically consume another 1–2 GB.

- Used RTX 3060 12 GB (~$200–$250) – check VRAM carefully; some 12 GB cards may still feel tight for longer contexts.
- New RTX 4060 Ti 16 GB (~$400) – reliable, power‑efficient, and gives steady 30+ tok/s.
- AMD RX 6800 16 GB (~$350) – viable if you confirm ROCm support; Ollama currently favors CUDA, but community builds are emerging.

- RTX 4070 12 GB (~$500)
- RTX 4070 Ti 12 GB (~$600)
- RTX 5070 Ti 16 GB (~$900) – if budget allows, this is currently the best single‑card balance of VRAM, speed, and power draw.

- Dual‑card setup (e.g., two RTX 4060 Ti 16 GB) with simple load‑balancing (vLLM + round‑robin) or an NVLink‑capable motherboard if you find a used workstation board.
- External GPU enclosure (eGPU) via Thunderbolt 4 for laptop users who need portability.
- Keep a small cloud‑API quota as a burst‑only fallback for those rare occasions when you need >32k context or true multimodality (image/video understanding).

- Set Up a Swap File
  Prevent out‑of‑memory surprises by allocating swap at least equal to your VRAM. For a 16 GB card:

- Limit Ollama’s Parallelism (if you’re a single user)
  Reduce contention by telling Ollama to keep only one model loaded and handle one request at a time:

- Monitor GPU Utilization
  A lightweight logging script helps you spot under‑ or over‑use:

- CPU: AMD Ryzen 9 9950X (16C/32T)
- Motherboard: X670E Artisan series
- RAM: 64 GB DDR5 6000 MHz (2×32 GB)
- Storage: 2 TB NVMe PCIe 4.0 (system) + 4 TB SATA III (backup)
- PSU: 1000 W 80+ Gold fully modular
- Case: Mid‑tower with three 120 mm fans front/rear/top/bottom
- GPU: NVIDIA RTX 5070 Ti 16 GB (Founders Edition) – driver 550.54.15, CUDA 12.4
- OS: Ubuntu 22.04 LTS (running inside WSL2 on a Windows 11 host)
- Docker: 27.0.3
- Ollama: 0.5.0
- Model: qwen3.5:9b Q4_K_M

- Model cold‑start load: ~39.4 seconds
- Steady‑state request latency (200‑token output): ~1.8 seconds
- 12‑hour stability test (one request per minute): zero crashes, no memory leaks
- Daily throughput ≈ 12 million tokens, equating to roughly $180/day saved versus calling Claude Opus for the same volume.

- Identify Your GPU & VRAM
  - Windows: Win + R → dxdiag → Display tab.
  - Linux: lspci -v | grep -i vga, or just run nvidia-smi if drivers are installed.
Note the card name and VRAM size.
- Windows: Win + R → dxdiag → Display tab.
- Linux: lspci -v | grep -i vga or just run nvidia-smi if drivers are installed.
Note the card name and VRAM size.
- Confirm Driver & CUDA Health
Download the CUDA Toolkit’s deviceQuery sample (https://developer.nvidia.com/cuda-samples). Build and run it; you should see correct core counts and memory bandwidth.
- Run a Baseline Ollama Test
Install Ollama (as detailed above), pull qwen3.5:9b, and time a simple prompt:

- Windows: Win + R → dxdiag → Display tab.
- Linux: lspci -v | grep -i vga or just run nvidia-smi if drivers are installed.
Note the card name and VRAM size.

- Define Your Typical Workload
  - Do you need to process very long documents (>16k tokens)?
  - Is multimodal (image/audio) understanding required?
  - How many agent calls per day do you anticipate?
- Do you need to process very long documents (>16k tokens)?
- Is multimodal (image/audio) understanding required?
- How many agent calls per day do you anticipate?
- Draft an Upgrade Timeline & Budget
  - If VRAM < 12 GB, prioritize a 16 GB card (new or used).
  - If funds are tight, consider a well‑reviewed used 16 GB model (e.g., RTX 3060 Ti 12 GB is risky due to insufficient VRAM; aim for a true 16 GB part).
  - Remember to verify your power supply can handle the new card’s TDP and has the requisite PCIe power connectors.
- If VRAM < 12 GB, prioritize a 16 GB card (new or used).
- If funds are tight, consider a well‑reviewed used 16 GB model (e.g., RTX 3060 Ti 12 GB is risky due to insufficient VRAM; aim for a true 16 GB part).
- Remember to verify your power supply can handle the new card’s TDP and has the requisite PCIe power connectors.

- Do you need to process very long documents (>16k tokens)?
- Is multimodal (image/audio) understanding required?
- How many agent calls per day do you anticipate?

- If VRAM < 12 GB, prioritize a 16 GB card (new or used).
- If funds are tight, consider a well‑reviewed used 16 GB model (e.g., RTX 3060 Ti 12 GB is risky due to insufficient VRAM; aim for a true 16 GB part).
- Remember to verify your power supply can handle the new card’s TDP and has the requisite PCIe power connectors.