version: '3.8'

services:
  llamactl-llamacpp:
    build:
      context: .
      dockerfile: Dockerfile.llamacpp
    image: llamactl:llamacpp-cuda
    container_name: llamactl-llamacpp
    ports:
      - "8080:8080"
    volumes:
      - ./data/llamacpp:/data
      - ./models:/models  # Mount models directory
    environment:
      # Configure llamactl to use llama-server from the base image
      - LLAMACTL_LLAMACPP_COMMAND=llama-server
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_LLAMACPP_DOCKER_ENABLED=false
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped

  llamactl-vllm:
    build:
      context: .
      dockerfile: Dockerfile.vllm
    image: llamactl:vllm-cuda
    container_name: llamactl-vllm
    ports:
      - "8081:8080"  # Use a different host port to avoid conflicts
    volumes:
      - ./data/vllm:/data
      - ./models:/models  # Mount models directory
      - ~/.cache/huggingface:/root/.cache/huggingface  # HuggingFace cache
    environment:
      # Configure llamactl to use vllm from the base image
      - LLAMACTL_VLLM_COMMAND=vllm
      - LLAMACTL_VLLM_ARGS=serve
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_VLLM_DOCKER_ENABLED=false
      # vLLM-specific environment variables
      - CUDA_VISIBLE_DEVICES=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped

networks:
  default:
    name: llamactl-network
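
# Usage sketch (an assumption, not part of the original file): with Docker
# Compose v2 and the NVIDIA Container Toolkit installed, the services above
# can be built and started per backend, for example:
#
#   docker compose build llamactl-llamacpp llamactl-vllm
#   docker compose up -d llamactl-llamacpp
#   docker compose up -d llamactl-vllm
#
# With the port mappings defined above, the llama.cpp-backed instance is then
# reachable on http://localhost:8080 and the vLLM-backed instance on
# http://localhost:8081. Adjust service names, ports, and GPU reservations to
# match your environment.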