ssh root@your_droplet_ip
apt update && apt upgrade -y
apt install -y python3-pip python3-dev build-essential git wget curl

# Install CUDA 12.1 (required for vLLM)
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
dpkg -i cuda-keyring_1.0-1_all.deb
apt-get update
apt-get -y install cuda-toolkit-12-1

# Verify GPU is detected
nvidia-smi
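If nvcc isn't found after the install, the toolkit's bin directory probably isn't on your PATH yet. A quick sketch, assuming the default install location of /usr/local/cuda-12.1:

echo 'export PATH=/usr/local/cuda-12.1/bin:$PATH' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc
nvcc --version   # should report CUDA 12.1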
python3 -m venv /opt/vllm-env
source /opt/vllm-env/bin/activate

# Install vLLM with CUDA support
pip install --upgrade pip
pip install vllm==0.4.2 ray==2.10.0 requests numpy

# Verify installation
python -c "import vllm; print(vllm.__version__)"
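As an optional sanity check before downloading a very large model, confirm the PyTorch build that vLLM pulled in can actually see the GPU (this assumes the virtual environment above is still active):

python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"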
# Create model directory
mkdir -p /mnt/models

# Download the quantized model from HuggingFace
# Using TheBloke's quantized version (fastest download)
cd /mnt/models
git lfs install
git clone https://huggingface.co/TheBloke/Llama-2-70B-chat-GPTQ
# This takes 10-15 minutes depending on your connection
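It's worth confirming the weights actually downloaded rather than just the git-lfs pointer files (which are only a few bytes). The exact filenames are an assumption and vary by repo branch:

du -sh /mnt/models/Llama-2-70B-chat-GPTQ
ls -lh /mnt/models/Llama-2-70B-chat-GPTQ/*.safetensors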
import ray
from ray import serve
from starlette.requests import Request
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid

# Initialize Ray and Serve
ray.init(ignore_reinit_error=True)
serve.start(detached=True)


@serve.deployment(
    num_replicas=2,                       # Run 2 vLLM workers
    max_concurrent_queries=50,
    ray_actor_options={"num_gpus": 0.5},  # Each worker uses 0.5 GPU
)
class VLLMServe:
    def __init__(self):
        # Initialize async vLLM engine
        engine_args = AsyncEngineArgs(
            model="/mnt/models/Llama-2-70B-chat-GPTQ",
            quantization="gptq",
            tensor_parallel_size=1,
            max_num_batched_tokens=4096,
            gpu_memory_utilization=0.9,
        )
        self.engine = AsyncLLMEngine.from_engine_args(engine_args)

    async def __call__(self, http_request: Request):
        # Serve hands us the raw HTTP request; parse the JSON body
        request = await http_request.json()
        prompt = request.get("prompt", "")
        max_tokens = request.get("max_tokens", 512)
        temperature = request.get("temperature", 0.7)

        sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=0.95,
        )

        request_id = random_uuid()
        results_generator = self.engine.generate(prompt, sampling_params, request_id)

        # Consume the async stream until the final output arrives
        final_output = None
        async for output in results_generator:
            final_output = output

        return {
            "text": final_output.outputs[0].text,
            "tokens": len(final_output.outputs[0].token_ids),
        }


# Deploy the service
VLLMServe.deploy()
print("✅ vLLM + Ray deployment ready!")
print("API endpoint: http://localhost:8000/VLLMServe")
Save the script above as /opt/serve_llama.py, then launch it in the background:

source /opt/vllm-env/bin/activate
python /opt/serve_llama.py &
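Once the engine finishes loading the weights (this can take a few minutes; watch the logs), send a quick smoke test. This sketch assumes the default Serve HTTP port of 8000 and the /VLLMServe route printed by the script:

curl -s http://localhost:8000/VLLMServe \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Explain what vLLM does in one sentence.", "max_tokens": 128}'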
With this setup running, you can:

- Run multiple vLLM instances on a single GPU (oversubscription)
- Distribute requests intelligently across available GPU memory
- Add horizontal scaling later (more Droplets) without code changes
- Monitor inference metrics in real-time

What you need to follow along:

- A DigitalOcean account (free $200 credits for new users)
- SSH access to a terminal
- 15 minutes of setup time

Creating the GPU Droplet:

- Log into the DigitalOcean console
- Click Create → Droplets
- Select GPU under Compute Type
- Choose NVIDIA A40 ($18/month)
- Select Ubuntu 22.04 LTS as the OS
- Add your SSH key
- Create the Droplet

Tools mentioned in this guide:

- Deploy your projects fast → DigitalOcean — get $200 in free credits
- Organize your AI workflows → Notion — free to start
- Run AI models cheaper → OpenRouter — pay per token, no subscriptions