version: '3.8'

services:
  llamactl-llamacpp:
    build:
      context: ..
      dockerfile: docker/Dockerfile.llamacpp
    image: llamactl:llamacpp-cuda
    container_name: llamactl-llamacpp
    ports:
      - "8080:8080"
    volumes:
      - ./data/llamacpp:/data
      - ./models:/models  # Mount models directory
      - ~/.cache/llama.cpp:/root/.cache/llama.cpp  # Llama.cpp cache
    environment:
      # Set data directory for persistence
      - LLAMACTL_DATA_DIR=/data
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_LLAMACPP_DOCKER_ENABLED=false
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped

  llamactl-vllm:
    build:
      context: ..
      dockerfile: docker/Dockerfile.vllm
    image: llamactl:vllm-cuda
    container_name: llamactl-vllm
    ports:
      - "8081:8080"  # Use different host port to avoid conflicts
    volumes:
      - ./data/vllm:/data
      - ./models:/models  # Mount models directory
      - ~/.cache/huggingface:/root/.cache/huggingface  # HuggingFace cache
    environment:
      # Set data directory for persistence
      - LLAMACTL_DATA_DIR=/data
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_VLLM_DOCKER_ENABLED=false
      # vLLM specific environment variables
      - CUDA_VISIBLE_DEVICES=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
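
# Usage sketch (assumes this compose file lives in the repository's docker/
# directory, which the ".." build context and "docker/Dockerfile.*" paths imply):
#
#   cd docker
#   docker compose up -d llamactl-llamacpp   # start only the llama.cpp backend
#   docker compose up -d llamactl-vllm       # start only the vLLM backend
#   docker compose up -d                     # start both services
#
# The GPU reservations above require the NVIDIA Container Toolkit on the host.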