version: '3.8'

services:
  llamactl-llamacpp:
    build:
      context: .
      dockerfile: Dockerfile.llamacpp
    image: llamactl:llamacpp-cuda
    container_name: llamactl-llamacpp
    ports:
      - "8080:8080"
    volumes:
      - ./data/llamacpp:/data
      - ./models:/models  # Mount models directory
    environment:
      # Configure llamactl to use llama-server from the base image
      - LLAMACTL_LLAMACPP_COMMAND=llama-server
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_LLAMACPP_DOCKER_ENABLED=false
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped

  llamactl-vllm:
    build:
      context: .
      dockerfile: Dockerfile.vllm
    image: llamactl:vllm-cuda
    container_name: llamactl-vllm
    ports:
      - "8081:8080"  # Use a different host port to avoid conflicts
    volumes:
      - ./data/vllm:/data
      - ./models:/models  # Mount models directory
      - ~/.cache/huggingface:/root/.cache/huggingface  # HuggingFace cache
    environment:
      # Configure llamactl to use vllm from the base image
      - LLAMACTL_VLLM_COMMAND=vllm
      - LLAMACTL_VLLM_ARGS=serve
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_VLLM_DOCKER_ENABLED=false
      # vLLM-specific environment variables
      - CUDA_VISIBLE_DEVICES=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped

networks:
  default:
    name: llamactl-network
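
# Usage sketch (an assumption, not part of the original file): with Docker
# Compose v2 and the NVIDIA Container Toolkit installed, the services above
# can be built and started per backend, for example:
#
#   docker compose build llamactl-llamacpp llamactl-vllm
#   docker compose up -d llamactl-llamacpp
#   docker compose up -d llamactl-vllm
#
# With the port mappings defined above, the llama.cpp-backed instance is then
# reachable on http://localhost:8080 and the vLLM-backed instance on
# http://localhost:8081. Adjust service names, ports, and GPU reservations to
# match your environment.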