ssh root@your_droplet_ip
apt update && apt upgrade -y
apt install -y python3-pip python3-dev build-essential git wget curl
# Add NVIDIA repository (the repo path uses e.g. "ubuntu2204", so strip the dot)
distribution=$(. /etc/os-release; echo $ID$VERSION_ID | sed -e 's/\.//g')
curl https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.0-1_all.deb -o cuda-keyring_1.0-1_all.deb
dpkg -i cuda-keyring_1.0-1_all.deb

# Install CUDA 12.1
apt-get update
apt-get install -y cuda-12-1

# Add to PATH
echo 'export PATH=/usr/local/cuda-12.1/bin:$PATH' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc

# Verify
nvcc --version
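If nvcc reports release 12.1, the toolkit is in place. It's also worth running nvidia-smi to confirm the driver actually sees the GPU before going further:

nvidia-smi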
python3 -m venv /opt/vllm_env
source /opt/vllm_env/bin/activate

# Upgrade pip
pip install --upgrade pip setuptools wheel

# Install vLLM with CUDA support
pip install vllm[cuda12]

# Install additional dependencies
pip install pydantic python-dotenv
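Before downloading a model this large, do a quick sanity check that the PyTorch build vLLM pulled in can actually reach the GPU. A minimal probe, run inside the activated venv:

python3 - << 'EOF'
import torch

# vLLM installs PyTorch as a dependency; confirm it can see the GPU.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
EOF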
pip install huggingface-hub

# Login to Hugging Face (you'll need a free account)
huggingface-cli login

# Download the model
huggingface-cli download mistralai/Mixtral-8x7B-Instruct-v0.1 --local-dir /models/mixtral-8x7b --cache-dir /models --local-dir-use-symlinks False
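The CLI is the simplest route, but if you'd rather script the download (say, to retry on a flaky connection), the same pull works from Python via huggingface_hub. A minimal sketch mirroring the command above:

from huggingface_hub import snapshot_download

# Fetch the full model repo into a plain directory, caching under /models.
snapshot_download(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    local_dir="/models/mixtral-8x7b",
    cache_dir="/models",
)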
cat > /opt/vllm_config.py << 'EOF'
from vllm import LLM, SamplingParams

# Initialize model with optimizations for Mixtral
llm = LLM(
    model="/models/mixtral-8x7b",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,  # Use 90% of GPU memory
    dtype="float16",             # Use half precision for speed
    max_model_len=4096,          # Context window
    enable_prefix_caching=True,  # Cache repeated prefixes
    disable_custom_all_reduce=False,
)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=512,
)

# Test inference
prompts = [
    "What is machine learning?",
]
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(f"Prompt: {output.prompt}")
    print(f"Generated text: {output.outputs[0].text}")
EOF
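Run the script once to confirm end-to-end generation works (the first run takes a while as the weights load):

python3 /opt/vllm_config.py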
python -m vllm.entrypoints.openai.api_server \
    --model /models/mixtral-8x7b \
    --dtype float16 \
    --gpu-memory-utilization 0.9 \
    --tensor-parallel-size 1 \
    --max-model-len 4096 \
    --enable-prefix-caching \
    --port 8000
INFO: Started server process [12345]
INFO: Waiting for application startup.
INFO: Application startup complete
INFO: Uvicorn running on http://0.0.0.0:8000
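The server speaks the OpenAI API shape, so any OpenAI-compatible client can talk to it. A minimal smoke test from a second shell, using Python's requests (assumed installed via pip install requests):

import requests

# vLLM's OpenAI-compatible server exposes /v1/completions; by default the
# model name is the path the server was launched with.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "/models/mixtral-8x7b",
        "prompt": "What is machine learning?",
        "max_tokens": 128,
        "temperature": 0.7,
    },
    timeout=120,
)
print(resp.json()["choices"][0]["text"])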
With the server live, you get vLLM's serving optimizations on top of the model itself:

- Token-level batching: Process requests in real-time without waiting for batch completion
- Paged attention: Reduce memory overhead by 4-10x compared to standard transformers
- Sparse activation awareness: Only compute active expert paths, skipping dead weight

For reference, these are the droplet settings this guide assumes:

- GPU: NVIDIA H100 (PCIe) - $28/month
- Region: Choose closest to your users (NYC, SFO, London, Singapore all available)
- Image: Ubuntu 22.04 LTS
- Size: 8GB RAM minimum, but grab 16GB if available in your region ($38/month instead)