# Mount the Weka Parallel File System on every GPU node.
# `mkdir -p` creates the mount point and is a no-op if it already exists.
sudo mkdir -p /mnt/shared_ai_storage
sudo mount -t wekafs backend01.internal/ai_models /mnt/shared_ai_storage

# Download the model exactly once to the shared volume.
# Every node that mounts /mnt/shared_ai_storage then reads the same copy.
pip3 install huggingface_hub
huggingface-cli download deepseek-ai/DeepSeek-V4-Flash \
  --local-dir /mnt/shared_ai_storage/deepseek_v4_flash \
  --resume-download
# Launch the inference server reading directly from shared storage.
# The model weights are loaded from the shared mount rather than a local copy,
# and the OpenAI-compatible API is exposed on port 8080.
# NOTE(review): vLLM's --dtype usually accepts auto/half/bfloat16/float; fp8 is
# normally selected through --quantization or --kv-cache-dtype. Confirm that
# `--dtype fp8` is accepted by the installed vLLM version.
python3 -m vllm.entrypoints.openai.api_server \
  --model /mnt/shared_ai_storage/deepseek_v4_flash \
  --tensor-parallel-size 4 \
  --enable-expert-parallel \
  --dtype fp8 \
  --max-model-len 32768 \
  --gpu-memory-utilization 0.90 \
  --port 8080
# Deploy Kong Gateway enforcing strict TLS.
# Runs Kong without a database (KONG_DATABASE=off), using the declarative
# config mounted at /kong/kong.yml, and listens for TLS traffic on 0.0.0.0:443
# with the certificate and key mounted under /certs.
# NOTE(review): files under /etc/letsencrypt/live are typically symlinks into
# ../../archive; confirm they resolve inside the container with this mount.
sudo docker run -d --name kong_gateway \
  --network host \
  -e "KONG_DATABASE=off" \
  -e "KONG_DECLARATIVE_CONFIG=/kong/kong.yml" \
  -e "KONG_PROXY_LISTEN=0.0.0.0:443 ssl" \
  -e "KONG_SSL_CERT=/certs/fullchain.pem" \
  -e "KONG_SSL_CERT_KEY=/certs/privkey.pem" \
  -v /etc/kong/kong.yml:/kong/kong.yml \
  -v /etc/letsencrypt/live/api.yourdomain.com/:/certs/ \
  kong:latest
from openai import OpenAI

# Client pointed at the gateway's OpenAI-compatible endpoint.
# The api_key value is a placeholder and must be replaced with a real token.
client = OpenAI(
    base_url="https://api.yourdomain.com/v1",
    api_key="YOUR_SECURE_JWT_TOKEN",
)

# Send a single-turn chat completion request.
# NOTE(review): the model name must match what the backend serves; confirm
# whether the server exposes "deepseek_v4_flash" or the full model path.
response = client.chat.completions.create(
    model="deepseek_v4_flash",
    messages=[{"role": "user", "content": "Analyze our secure architecture."}],
)