# vllm_deepseek_32b

---
# NOTE(review): the top-level `version` key is obsolete and ignored by
# Compose v2; kept only for compatibility with older docker-compose binaries.
version: '3.9'
services:
  vllm_service:
    image: vllm/vllm-openai:latest
    container_name: vllm_deepseek_32b
    restart: always
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all host NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      # Quoted so the colon-separated pair is not misparsed as a
      # YAML 1.1 sexagesimal integer.
      - "8000:8000"
    volumes:
      # Host model snapshot mounted at the path passed to --model/--tokenizer.
      - /data/models/hf/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B-AWQ:/app/model
    # Block-style sequence (one flag or value per line) instead of a
    # multi-line flow list: diffs cleanly and reads top-down.
    command:
      - "--served-model-name"
      - "DeepSeek-R1:32b"
      - "--trust-remote-code"
      # --enforce-eager disables CUDA graph capture (lower GPU memory use).
      - "--enforce-eager"
      - "--gpu-memory-utilization"
      - "0.8"
      - "--model"
      - "/app/model/"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "10000"
      # SECURITY(review): API key is committed in plain text. Prefer an
      # environment variable or secret store, e.g. "${VLLM_API_KEY}",
      # and rotate this value if the file was ever shared.
      - "--api-key"
      - "12345678"
      - "--tokenizer"
      - "/app/model/"