diff --git a/dev/__pycache__/readme_sync.cpython-311.pyc b/dev/__pycache__/readme_sync.cpython-311.pyc index c6516aa..7302af9 100644 Binary files a/dev/__pycache__/readme_sync.cpython-311.pyc and b/dev/__pycache__/readme_sync.cpython-311.pyc differ diff --git a/dev/getting-started/configuration/index.html b/dev/getting-started/configuration/index.html index 9487c8a..8885de0 100644 --- a/dev/getting-started/configuration/index.html +++ b/dev/getting-started/configuration/index.html @@ -848,58 +848,59 @@ host: "0.0.0.0" # Server host to bind to port: 8080 # Server port to bind to allowed_origins: ["*"] # Allowed CORS origins (default: all) - enable_swagger: false # Enable Swagger UI for API docs - -backends: - llama-cpp: - command: "llama-server" - args: [] - environment: {} # Environment variables for the backend process - docker: - enabled: false - image: "ghcr.io/ggml-org/llama.cpp:server" - args: ["run", "--rm", "--network", "host", "--gpus", "all"] - environment: {} - response_headers: {} # Additional response headers to send with responses - - vllm: - command: "vllm" - args: ["serve"] - environment: {} # Environment variables for the backend process - docker: - enabled: false - image: "vllm/vllm-openai:latest" - args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"] - environment: {} - response_headers: {} # Additional response headers to send with responses - - mlx: - command: "mlx_lm.server" - args: [] - environment: {} # Environment variables for the backend process - response_headers: {} # Additional response headers to send with responses - -instances: - port_range: [8000, 9000] # Port range for instances - data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below) - configs_dir: ~/.local/share/llamactl/instances # Instance configs directory - logs_dir: ~/.local/share/llamactl/logs # Logs directory - auto_create_dirs: true # Auto-create data/config/logs dirs if missing - max_instances: -1 # Max instances (-1 
= unlimited) - max_running_instances: -1 # Max running instances (-1 = unlimited) - enable_lru_eviction: true # Enable LRU eviction for idle instances - default_auto_restart: true # Auto-restart new instances by default - default_max_restarts: 3 # Max restarts for new instances - default_restart_delay: 5 # Restart delay (seconds) for new instances - default_on_demand_start: true # Default on-demand start setting - on_demand_start_timeout: 120 # Default on-demand start timeout in seconds - timeout_check_interval: 5 # Idle instance timeout check in minutes - -auth: - require_inference_auth: true # Require auth for inference endpoints - inference_keys: [] # Keys for inference endpoints - require_management_auth: true # Require auth for management endpoints - management_keys: [] # Keys for management endpoints + allowed_headers: ["*"] # Allowed CORS headers (default: all) + enable_swagger: false # Enable Swagger UI for API docs + +backends: + llama-cpp: + command: "llama-server" + args: [] + environment: {} # Environment variables for the backend process + docker: + enabled: false + image: "ghcr.io/ggml-org/llama.cpp:server" + args: ["run", "--rm", "--network", "host", "--gpus", "all"] + environment: {} + response_headers: {} # Additional response headers to send with responses + + vllm: + command: "vllm" + args: ["serve"] + environment: {} # Environment variables for the backend process + docker: + enabled: false + image: "vllm/vllm-openai:latest" + args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"] + environment: {} + response_headers: {} # Additional response headers to send with responses + + mlx: + command: "mlx_lm.server" + args: [] + environment: {} # Environment variables for the backend process + response_headers: {} # Additional response headers to send with responses + +instances: + port_range: [8000, 9000] # Port range for instances + data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below) + 
configs_dir: ~/.local/share/llamactl/instances # Instance configs directory + logs_dir: ~/.local/share/llamactl/logs # Logs directory + auto_create_dirs: true # Auto-create data/config/logs dirs if missing + max_instances: -1 # Max instances (-1 = unlimited) + max_running_instances: -1 # Max running instances (-1 = unlimited) + enable_lru_eviction: true # Enable LRU eviction for idle instances + default_auto_restart: true # Auto-restart new instances by default + default_max_restarts: 3 # Max restarts for new instances + default_restart_delay: 5 # Restart delay (seconds) for new instances + default_on_demand_start: true # Default on-demand start setting + on_demand_start_timeout: 120 # Default on-demand start timeout in seconds + timeout_check_interval: 5 # Idle instance timeout check in minutes + +auth: + require_inference_auth: true # Require auth for inference endpoints + inference_keys: [] # Keys for inference endpoints + require_management_auth: true # Require auth for management endpoints + management_keys: [] # Keys for management endpoints

Configuration Files

Configuration File Locations

@@ -924,7 +925,8 @@ host: "0.0.0.0" # Server host to bind to (default: "0.0.0.0") port: 8080 # Server port to bind to (default: 8080) allowed_origins: ["*"] # CORS allowed origins (default: ["*"]) - enable_swagger: false # Enable Swagger UI (default: false) + allowed_headers: ["*"] # CORS allowed headers (default: ["*"]) + enable_swagger: false # Enable Swagger UI (default: false)

Environment Variables: - LLAMACTL_HOST - Server host @@ -1068,7 +1070,7 @@ - September 29, 2025 + October 4, 2025 diff --git a/dev/search/search_index.json b/dev/search/search_index.json index cca106d..d81091f 100644 --- a/dev/search/search_index.json +++ b/dev/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"

Welcome to the Llamactl documentation!

"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"

Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.

"},{"location":"#features","title":"Features","text":""},{"location":"#easy-model-management","title":"\ud83d\ude80 Easy Model Management","text":""},{"location":"#universal-compatibility","title":"\ud83d\udd17 Universal Compatibility","text":""},{"location":"#user-friendly-interface","title":"\ud83c\udf10 User-Friendly Interface","text":""},{"location":"#smart-operations","title":"\u26a1 Smart Operations","text":""},{"location":"#quick-links","title":"Quick Links","text":""},{"location":"#getting-help","title":"Getting Help","text":"

If you need help or have questions:

"},{"location":"#license","title":"License","text":"

MIT License - see the LICENSE file.

"},{"location":"getting-started/configuration/","title":"Configuration","text":"

llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

Defaults < Configuration file < Environment variables\n

llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

"},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"

Here's the default configuration with all available options:

server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    response_headers: {}         # Additional response headers to send with responses\n\ninstances:\n  port_range: [8000, 9000]       # Port range for instances\n  data_dir: ~/.local/share/llamactl         # Data directory (platform-specific, see below)\n  configs_dir: ~/.local/share/llamactl/instances  # Instance configs directory\n  logs_dir: ~/.local/share/llamactl/logs    # Logs directory\n  auto_create_dirs: true         # Auto-create data/config/logs dirs if missing\n  max_instances: -1              # Max instances (-1 = unlimited)\n  max_running_instances: -1      # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true      # Enable LRU eviction for idle instances\n  default_auto_restart: true     # 
Auto-restart new instances by default\n  default_max_restarts: 3        # Max restarts for new instances\n  default_restart_delay: 5       # Restart delay (seconds) for new instances\n  default_on_demand_start: true  # Default on-demand start setting\n  on_demand_start_timeout: 120   # Default on-demand start timeout in seconds\n  timeout_check_interval: 5      # Idle instance timeout check in minutes\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  inference_keys: []             # Keys for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n
"},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"

Configuration files are searched in the following locations (in order of precedence):

Linux: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/.config/llamactl/config.yaml - /etc/llamactl/config.yaml

macOS: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/Library/Application Support/llamactl/config.yaml - /Library/Application Support/llamactl/config.yaml

Windows: - ./llamactl.yaml or ./config.yaml (current directory) - %APPDATA%\\llamactl\\config.yaml - %USERPROFILE%\\llamactl\\config.yaml - %PROGRAMDATA%\\llamactl\\config.yaml

You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.

"},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"
server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n

Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

"},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"
backends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    # MLX does not support Docker\n    response_headers: {}         # Additional response headers to send with responses\n

Backend Configuration Fields: - command: Executable name/path for the backend - args: Default arguments prepended to all instances - environment: Environment variables for the backend process (optional) - response_headers: Additional response headers to send with responses (optional) - docker: Docker-specific configuration (optional) - enabled: Boolean flag to enable Docker runtime - image: Docker image to use - args: Additional arguments passed to docker run - environment: Environment variables for the container (optional)

If llamactl is behind an NGINX proxy, the X-Accel-Buffering: no response header may be required for NGINX to stream responses properly without buffering.

Environment Variables:

LlamaCpp Backend: - LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command - LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments - LLAMACTL_LLAMACPP_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use - LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

VLLM Backend: - LLAMACTL_VLLM_COMMAND - VLLM executable command - LLAMACTL_VLLM_ARGS - Space-separated default arguments - LLAMACTL_VLLM_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use - LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

MLX Backend: - LLAMACTL_MLX_COMMAND - MLX executable command - LLAMACTL_MLX_ARGS - Space-separated default arguments - LLAMACTL_MLX_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

"},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"
instances:\n  port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])\n  data_dir: \"~/.local/share/llamactl\"               # Directory for all llamactl data (default varies by OS)\n  configs_dir: \"~/.local/share/llamactl/instances\"  # Directory for instance configs (default: data_dir/instances)\n  logs_dir: \"~/.local/share/llamactl/logs\"          # Directory for instance logs (default: data_dir/logs)\n  auto_create_dirs: true                            # Automatically create data/config/logs directories (default: true)\n  max_instances: -1                                 # Maximum instances (-1 = unlimited)\n  max_running_instances: -1                         # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true                         # Enable LRU eviction for idle instances\n  default_auto_restart: true                        # Default auto-restart setting\n  default_max_restarts: 3                           # Default maximum restart attempts\n  default_restart_delay: 5                          # Default restart delay in seconds\n  default_on_demand_start: true                     # Default on-demand start setting\n  on_demand_start_timeout: 120                      # Default on-demand start timeout in seconds\n  timeout_check_interval: 5                         # Default instance timeout check interval in minutes\n

Environment Variables: - LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\") - LLAMACTL_DATA_DIRECTORY - Data directory path - LLAMACTL_INSTANCES_DIR - Instance configs directory path - LLAMACTL_LOGS_DIR - Log directory path - LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false) - LLAMACTL_MAX_INSTANCES - Maximum number of instances - LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances - LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances - LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false) - LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts - LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds - LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false) - LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds - LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes

"},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"
auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  inference_keys: []                     # List of valid inference API keys\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n

Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys

"},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"

View all available command line options:

llamactl --help\n

You can also override configuration using command line flags when starting llamactl.

"},{"location":"getting-started/installation/","title":"Installation","text":"

This guide will walk you through installing Llamactl on your system.

"},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"

llamactl supports multiple backends. Install at least one:

For llama.cpp backend (all platforms):

You need llama-server from llama.cpp installed:

# Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n

Or build from source - see llama.cpp docs

For MLX backend (macOS only):

MLX provides optimized inference on Apple Silicon. Install MLX-LM:

# Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n

Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

For vLLM backend:

vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

# Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
"},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"

Download the latest release from the GitHub releases page:

# Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
"},{"location":"getting-started/installation/#option-2-docker","title":"Option 2: Docker","text":"

llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.

Available Dockerfiles (CUDA): - llamactl with llama.cpp CUDA: docker/Dockerfile.llamacpp (based on ghcr.io/ggml-org/llama.cpp:server-cuda) - llamactl with vLLM CUDA: docker/Dockerfile.vllm (based on vllm/vllm-openai:latest) - llamactl built from source: docker/Dockerfile.source (multi-stage build with webui)

Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.

"},{"location":"getting-started/installation/#using-docker-compose","title":"Using Docker Compose","text":"
# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Create directories for data and models\nmkdir -p data/llamacpp data/vllm models\n\n# Start llamactl with llama.cpp backend\ndocker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d\n\n# Or start llamactl with vLLM backend\ndocker-compose -f docker/docker-compose.yml up llamactl-vllm -d\n

Access the dashboard at: - llamactl with llama.cpp: http://localhost:8080 - llamactl with vLLM: http://localhost:8081

"},{"location":"getting-started/installation/#using-docker-build-and-run","title":"Using Docker Build and Run","text":"

llamactl with llama.cpp CUDA:

docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .\ndocker run -d \\\n  --name llamactl-llamacpp \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \\\n  llamactl:llamacpp-cuda\n

llamactl with vLLM CUDA:

docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .\ndocker run -d \\\n  --name llamactl-vllm \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/huggingface:/root/.cache/huggingface \\\n  llamactl:vllm-cuda\n

llamactl built from source:

docker build -f docker/Dockerfile.source -t llamactl:source .\ndocker run -d \\\n  --name llamactl \\\n  -p 8080:8080 \\\n  llamactl:source\n

"},{"location":"getting-started/installation/#option-3-build-from-source","title":"Option 3: Build from Source","text":"

Requirements: - Go 1.24 or later - Node.js 22 or later - Git

If you prefer to build from source:

# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
"},{"location":"getting-started/installation/#verification","title":"Verification","text":"

Verify your installation by checking the version:

llamactl --version\n
"},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"

Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

"},{"location":"getting-started/quick-start/","title":"Quick Start","text":"

This guide will help you get Llamactl up and running in just a few minutes.

"},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"

Start the Llamactl server:

llamactl\n

By default, Llamactl will start on http://localhost:8080.

"},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"

Open your web browser and navigate to:

http://localhost:8080\n

Log in with the management API key. By default, a key is generated during server startup; copy it from the terminal output.

You should see the Llamactl web interface.

"},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":"
  1. Click the \"Add Instance\" button
  2. Fill in the instance configuration:
     - Name: Give your instance a descriptive name
     - Backend Type: Choose from llama.cpp, MLX, or vLLM
     - Model: Model path or identifier for your chosen backend
     - Additional Options: Backend-specific parameters

  3. Click \"Create Instance\"

"},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"

Once created, you can:

"},{"location":"getting-started/quick-start/#example-configurations","title":"Example Configurations","text":"

Here are basic example configurations for each backend:

llama.cpp backend:

{\n  \"name\": \"llama2-7b\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n    \"threads\": 4,\n    \"ctx_size\": 2048,\n    \"gpu_layers\": 32\n  }\n}\n

MLX backend (macOS only):

{\n  \"name\": \"mistral-mlx\",\n  \"backend_type\": \"mlx_lm\",\n  \"backend_options\": {\n    \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n    \"temp\": 0.7,\n    \"max_tokens\": 2048\n  }\n}\n

vLLM backend:

{\n  \"name\": \"dialogpt-vllm\",\n  \"backend_type\": \"vllm\",\n  \"backend_options\": {\n    \"model\": \"microsoft/DialoGPT-medium\",\n    \"tensor_parallel_size\": 2,\n    \"gpu_memory_utilization\": 0.9\n  }\n}\n

"},{"location":"getting-started/quick-start/#docker-support","title":"Docker Support","text":"

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:

backends:\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    docker:\n      enabled: true\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n
"},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"

You can also manage instances via the REST API:

# List all instances\ncurl http://localhost:8080/api/v1/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\"\n    }\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/start\n
"},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

"},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n
"},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"

You can also use the official OpenAI Python client:

from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"not-needed\"  # Llamactl doesn't require API keys by default\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
"},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"

Get a list of running instances (models) in OpenAI-compatible format:

curl http://localhost:8080/v1/models\n
"},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":""},{"location":"user-guide/api-reference/","title":"API Reference","text":"

Complete reference for the Llamactl REST API.

"},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"

All API endpoints are relative to the base URL:

http://localhost:8080/api/v1\n
"},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"

Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

curl -H \"Authorization: Bearer <your-api-key>\" \\\n  http://localhost:8080/api/v1/instances\n

The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints

"},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"

Get the version information of the llamactl server.

GET /api/v1/version\n

Response:

Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n

"},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"

Get help text for the llama-server command.

GET /api/v1/server/help\n

Response: Plain text help output from llama-server --help

"},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"

Get version information of the llama-server binary.

GET /api/v1/server/version\n

Response: Plain text version output from llama-server --version

"},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"

List available devices for llama-server.

GET /api/v1/server/devices\n

Response: Plain text device list from llama-server --list-devices

"},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"

Get a list of all instances.

GET /api/v1/instances\n

Response:

[\n  {\n    \"name\": \"llama2-7b\",\n    \"status\": \"running\",\n    \"created\": 1705312200\n  }\n]\n

"},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"

Get detailed information about a specific instance.

GET /api/v1/instances/{name}\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"

Create and start a new instance.

POST /api/v1/instances/{name}\n

Request Body: JSON object with instance configuration. Common fields include:

See Managing Instances for complete configuration options.

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"

Update an existing instance configuration. See Managing Instances for available configuration options.

PUT /api/v1/instances/{name}\n

Request Body: JSON object with configuration fields to update.

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"

Stop and remove an instance.

DELETE /api/v1/instances/{name}\n

Response: 204 No Content

"},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"

Start a stopped instance.

POST /api/v1/instances/{name}/start\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

Error Responses: - 409 Conflict: Maximum number of running instances reached - 500 Internal Server Error: Failed to start instance

"},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"

Stop a running instance.

POST /api/v1/instances/{name}/stop\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"stopped\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"

Restart an instance (stop then start).

POST /api/v1/instances/{name}/restart\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"

Retrieve instance logs.

GET /api/v1/instances/{name}/logs\n

Query Parameters: - lines: Number of lines to return (default: all lines; pass -1 to request all lines explicitly)

Response: Plain text log output

Example:

curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n

"},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"

Proxy HTTP requests directly to the llama-server instance.

GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n

This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

Example - Check Instance Health:

curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/proxy/health\n

This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

Error Responses: - 503 Service Unavailable: Instance is not running

"},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"

Llamactl provides OpenAI-compatible endpoints for inference operations.

"},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"

List all instances in OpenAI-compatible format.

GET /v1/models\n

Response:

{\n  \"object\": \"list\",\n  \"data\": [\n    {\n      \"id\": \"llama2-7b\",\n      \"object\": \"model\",\n      \"created\": 1705312200,\n      \"owned_by\": \"llamactl\"\n    }\n  ]\n}\n

"},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"

All OpenAI-compatible inference endpoints are available:

POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n

Request Body: Standard OpenAI format with model field specifying the instance name

Example:

{\n  \"model\": \"llama2-7b\",\n  \"messages\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Hello, how are you?\"\n    }\n  ]\n}\n

The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

Error Responses: - 400 Bad Request: Invalid request body or missing instance name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit

"},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"

Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed

"},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"

All endpoints may return error responses in the following format:

{\n  \"error\": \"Error message description\"\n}\n
"},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":""},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"
# Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-2-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0\",\n      \"OMP_NUM_THREADS\": \"8\"\n    }\n  }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello!\"}\n    ],\n    \"max_tokens\": 100\n  }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n
"},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"

You can also directly proxy requests to the llama-server instance:

# Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"prompt\": \"Hello, world!\",\n    \"n_predict\": 50\n  }'\n
"},{"location":"user-guide/api-reference/#backend-specific-endpoints","title":"Backend-Specific Endpoints","text":""},{"location":"user-guide/api-reference/#parse-commands","title":"Parse Commands","text":"

Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

"},{"location":"user-guide/api-reference/#parse-llamacpp-command","title":"Parse Llama.cpp Command","text":"

Parse a llama-server command string into instance options.

POST /api/v1/backends/llama-cpp/parse-command\n

Request Body:

{\n  \"command\": \"llama-server -m /path/to/model.gguf -c 2048 --port 8080\"\n}\n

Response:

{\n  \"backend_type\": \"llama_cpp\",\n  \"llama_server_options\": {\n    \"model\": \"/path/to/model.gguf\",\n    \"ctx_size\": 2048,\n    \"port\": 8080\n  }\n}\n

"},{"location":"user-guide/api-reference/#parse-mlx-lm-command","title":"Parse MLX-LM Command","text":"

Parse an MLX-LM server command string into instance options.

POST /api/v1/backends/mlx/parse-command\n

Request Body:

{\n  \"command\": \"mlx_lm.server --model /path/to/model --port 8080\"\n}\n

Response:

{\n  \"backend_type\": \"mlx_lm\",\n  \"mlx_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

"},{"location":"user-guide/api-reference/#parse-vllm-command","title":"Parse vLLM Command","text":"

Parse a vLLM serve command string into instance options.

POST /api/v1/backends/vllm/parse-command\n

Request Body:

{\n  \"command\": \"vllm serve /path/to/model --port 8080\"\n}\n

Response:

{\n  \"backend_type\": \"vllm\",\n  \"vllm_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

Error Responses for Parse Commands: - 400 Bad Request: Invalid request body, empty command, or parse error - 500 Internal Server Error: Encoding error

"},{"location":"user-guide/api-reference/#auto-generated-documentation","title":"Auto-Generated Documentation","text":"

The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

  1. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
  2. Generate docs: swag init -g cmd/server/main.go -o apidocs
"},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"

If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

http://localhost:8080/swagger/\n

This provides a complete interactive interface for testing all API endpoints.

"},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"

Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

"},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"

Llamactl provides two ways to manage instances:

"},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"

If authentication is enabled: 1. Navigate to the web UI 2. Enter your credentials 3. Bearer token is stored for the session

"},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":""},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"

Each instance is displayed as a card showing:

"},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
  1. Click the \"Create Instance\" button on the dashboard
  2. Enter a unique Name for your instance (only required field)
  3. Choose Backend Type:
  4. Configure model source:
  5. Configure optional instance management settings:
  6. Configure backend-specific options:
  7. Click \"Create\" to save the instance
"},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"
# Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"top_p\": 0.9,\n      \"max_tokens\": 2048\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3\n  }'\n\n# Create vLLM instance\ncurl -X POST http://localhost:8080/api/instances/my-vllm-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"vllm\",\n    \"backend_options\": {\n      \"model\": \"microsoft/DialoGPT-medium\",\n      \"tensor_parallel_size\": 2,\n      \"gpu_memory_utilization\": 0.9\n    },\n    \"auto_restart\": true,\n    \"on_demand_start\": true,\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0,1\",\n      \"NCCL_DEBUG\": \"INFO\",\n      \"PYTHONPATH\": \"/custom/path\"\n    }\n  }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/instances/gemma-3-27b \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n      \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n      \"gpu_layers\": 32\n    }\n  }'\n
"},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
  1. Click the \"Start\" button on an instance card
  2. Watch the status change to \"Unknown\"
  3. Monitor progress in the logs
  4. Instance status changes to \"Ready\" once the instance is ready to accept requests
"},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"
curl -X POST http://localhost:8080/api/instances/{name}/start\n
"},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
  1. Click the \"Stop\" button on an instance card
  2. Instance gracefully shuts down
"},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"
curl -X POST http://localhost:8080/api/instances/{name}/stop\n
"},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
  1. Click the \"Edit\" button on an instance card
  2. Modify settings in the configuration dialog
  3. Changes require instance restart to take effect
  4. Click \"Update & Restart\" to apply changes
"},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"

Modify instance settings:

curl -X PUT http://localhost:8080/api/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"ctx_size\": 4096\n    }\n  }'\n

Note

Configuration changes require restarting the instance to take effect.

"},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
  1. Click the \"Logs\" button on any instance card
  2. Real-time log viewer opens
"},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"

Retrieve the logs of an instance:

# Get instance logs\ncurl http://localhost:8080/api/instances/{name}/logs\n
"},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
  1. Click the \"Delete\" button on an instance card
  2. Only stopped instances can be deleted
  3. Confirm deletion in the dialog
"},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"
curl -X DELETE http://localhost:8080/api/instances/{name}\n
"},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"

Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

# Proxy a request to the underlying backend instance\ncurl http://localhost:8080/api/instances/{name}/proxy/\n

All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs

"},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
  1. The health status badge is displayed on each instance card
"},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"

Check the health status of your instances:

curl http://localhost:8080/api/instances/{name}/proxy/health\n
"},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"

Issues specific to Llamactl deployment and operation.

"},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"

Problem: Invalid configuration preventing startup

Solutions: 1. Use minimal configuration:

server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n

  1. Check data directory permissions:
    # Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
"},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"

Problem: Instance fails to start with model loading errors

Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files

"},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"

Problem: Out of memory errors or system becomes unresponsive

Solutions: 1. Reduce context size:

{\n  \"n_ctx\": 1024\n}\n

  1. Use quantized models:
  2. Try Q4_K_M instead of higher precision models
  3. Use smaller model variants (7B instead of 13B)
"},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"

Problem: GPU not being used effectively

Solutions: 1. Configure GPU layers:

{\n  \"n_gpu_layers\": 35\n}\n

"},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"

Problem: Complex model loading, performance, or compatibility issues

Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

Resources: - llama.cpp Documentation: https://github.com/ggml-org/llama.cpp - llama.cpp Issues: https://github.com/ggml-org/llama.cpp/issues - llama.cpp Discussions: https://github.com/ggml-org/llama.cpp/discussions

Testing directly with llama-server:

# Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n

This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.

"},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"

Problem: Web UI shows CORS errors in browser console

Solutions: 1. Configure allowed origins:

server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n

"},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"

Problem: API requests failing with authentication errors

Solutions: 1. Disable authentication temporarily:

auth:\n  require_management_auth: false\n  require_inference_auth: false\n

  1. Configure API keys:

    auth:\n  management_keys:\n    - \"your-management-key\"\n  inference_keys:\n    - \"your-inference-key\"\n

  2. Use correct Authorization header:

    curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n

"},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
# Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
"},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"
export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
"},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"

When reporting issues, include:

  1. System information:

    llamactl --version\n

  2. Configuration file (remove sensitive keys)

  3. Relevant log output

  4. Steps to reproduce the issue

"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"

Welcome to the Llamactl documentation!

"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"

Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.

"},{"location":"#features","title":"Features","text":""},{"location":"#easy-model-management","title":"\ud83d\ude80 Easy Model Management","text":""},{"location":"#universal-compatibility","title":"\ud83d\udd17 Universal Compatibility","text":""},{"location":"#user-friendly-interface","title":"\ud83c\udf10 User-Friendly Interface","text":""},{"location":"#smart-operations","title":"\u26a1 Smart Operations","text":""},{"location":"#quick-links","title":"Quick Links","text":""},{"location":"#getting-help","title":"Getting Help","text":"

If you need help or have questions:

"},{"location":"#license","title":"License","text":"

MIT License - see the LICENSE file.

"},{"location":"getting-started/configuration/","title":"Configuration","text":"

llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

Defaults < Configuration file < Environment variables\n

llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

"},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"

Here's the default configuration with all available options:

server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  allowed_headers: [\"*\"]         # Allowed CORS headers (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    response_headers: {}         # Additional response headers to send with responses\n\ninstances:\n  port_range: [8000, 9000]       # Port range for instances\n  data_dir: ~/.local/share/llamactl         # Data directory (platform-specific, see below)\n  configs_dir: ~/.local/share/llamactl/instances  # Instance configs directory\n  logs_dir: ~/.local/share/llamactl/logs    # Logs directory\n  auto_create_dirs: true         # Auto-create data/config/logs dirs if missing\n  max_instances: -1              # Max instances (-1 = unlimited)\n  max_running_instances: -1      # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true      # 
Enable LRU eviction for idle instances\n  default_auto_restart: true     # Auto-restart new instances by default\n  default_max_restarts: 3        # Max restarts for new instances\n  default_restart_delay: 5       # Restart delay (seconds) for new instances\n  default_on_demand_start: true  # Default on-demand start setting\n  on_demand_start_timeout: 120   # Default on-demand start timeout in seconds\n  timeout_check_interval: 5      # Idle instance timeout check in minutes\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  inference_keys: []             # Keys for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n
"},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"

Configuration files are searched in the following locations (in order of precedence):

Linux: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/.config/llamactl/config.yaml - /etc/llamactl/config.yaml

macOS: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/Library/Application Support/llamactl/config.yaml - /Library/Application Support/llamactl/config.yaml

Windows: - ./llamactl.yaml or ./config.yaml (current directory) - %APPDATA%\\llamactl\\config.yaml - %USERPROFILE%\\llamactl\\config.yaml - %PROGRAMDATA%\\llamactl\\config.yaml

You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.

"},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"
server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  allowed_headers: [\"*\"]  # CORS allowed headers (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n

Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

"},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"
backends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    # MLX does not support Docker\n    response_headers: {}         # Additional response headers to send with responses\n

Backend Configuration Fields: - command: Executable name/path for the backend - args: Default arguments prepended to all instances - environment: Environment variables for the backend process (optional) - response_headers: Additional response headers to send with responses (optional) - docker: Docker-specific configuration (optional) - enabled: Boolean flag to enable Docker runtime - image: Docker image to use - args: Additional arguments passed to docker run - environment: Environment variables for the container (optional)

If llamactl is behind an NGINX proxy, the X-Accel-Buffering: no response header may be required for NGINX to properly stream the responses without buffering.

Environment Variables:

LlamaCpp Backend: - LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command - LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments - LLAMACTL_LLAMACPP_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use - LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

VLLM Backend: - LLAMACTL_VLLM_COMMAND - VLLM executable command - LLAMACTL_VLLM_ARGS - Space-separated default arguments - LLAMACTL_VLLM_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use - LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

MLX Backend: - LLAMACTL_MLX_COMMAND - MLX executable command - LLAMACTL_MLX_ARGS - Space-separated default arguments - LLAMACTL_MLX_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

"},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"
instances:\n  port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])\n  data_dir: \"~/.local/share/llamactl\"               # Directory for all llamactl data (default varies by OS)\n  configs_dir: \"~/.local/share/llamactl/instances\"  # Directory for instance configs (default: data_dir/instances)\n  logs_dir: \"~/.local/share/llamactl/logs\"          # Directory for instance logs (default: data_dir/logs)\n  auto_create_dirs: true                            # Automatically create data/config/logs directories (default: true)\n  max_instances: -1                                 # Maximum instances (-1 = unlimited)\n  max_running_instances: -1                         # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true                         # Enable LRU eviction for idle instances\n  default_auto_restart: true                        # Default auto-restart setting\n  default_max_restarts: 3                           # Default maximum restart attempts\n  default_restart_delay: 5                          # Default restart delay in seconds\n  default_on_demand_start: true                     # Default on-demand start setting\n  on_demand_start_timeout: 120                      # Default on-demand start timeout in seconds\n  timeout_check_interval: 5                         # Default instance timeout check interval in minutes\n

Environment Variables: - LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\") - LLAMACTL_DATA_DIRECTORY - Data directory path - LLAMACTL_INSTANCES_DIR - Instance configs directory path - LLAMACTL_LOGS_DIR - Log directory path - LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false) - LLAMACTL_MAX_INSTANCES - Maximum number of instances - LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances - LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances - LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false) - LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts - LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds - LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false) - LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds - LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes

"},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"
auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  inference_keys: []                     # List of valid inference API keys\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n

Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys

"},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"

View all available command line options:

llamactl --help\n

You can also override configuration using command line flags when starting llamactl.

"},{"location":"getting-started/installation/","title":"Installation","text":"

This guide will walk you through installing Llamactl on your system.

"},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"

llamactl supports multiple backends. Install at least one:

For llama.cpp backend (all platforms):

You need llama-server from llama.cpp installed:

# Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n

Or build from source - see llama.cpp docs

For MLX backend (macOS only):

MLX provides optimized inference on Apple Silicon. Install MLX-LM:

# Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n

Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

For vLLM backend:

vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

# Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
"},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"

Download the latest release from the GitHub releases page:

# Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
"},{"location":"getting-started/installation/#option-2-docker","title":"Option 2: Docker","text":"

llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.

Available Dockerfiles (CUDA): - llamactl with llama.cpp CUDA: docker/Dockerfile.llamacpp (based on ghcr.io/ggml-org/llama.cpp:server-cuda) - llamactl with vLLM CUDA: docker/Dockerfile.vllm (based on vllm/vllm-openai:latest) - llamactl built from source: docker/Dockerfile.source (multi-stage build with webui)

Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.

"},{"location":"getting-started/installation/#using-docker-compose","title":"Using Docker Compose","text":"
# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Create directories for data and models\nmkdir -p data/llamacpp data/vllm models\n\n# Start llamactl with llama.cpp backend\ndocker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d\n\n# Or start llamactl with vLLM backend\ndocker-compose -f docker/docker-compose.yml up llamactl-vllm -d\n

Access the dashboard at: - llamactl with llama.cpp: http://localhost:8080 - llamactl with vLLM: http://localhost:8081

"},{"location":"getting-started/installation/#using-docker-build-and-run","title":"Using Docker Build and Run","text":"

llamactl with llama.cpp CUDA:

docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .\ndocker run -d \\\n  --name llamactl-llamacpp \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \\\n  llamactl:llamacpp-cuda\n

llamactl with vLLM CUDA:

docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .\ndocker run -d \\\n  --name llamactl-vllm \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/huggingface:/root/.cache/huggingface \\\n  llamactl:vllm-cuda\n

llamactl built from source:

docker build -f docker/Dockerfile.source -t llamactl:source .\ndocker run -d \\\n  --name llamactl \\\n  -p 8080:8080 \\\n  llamactl:source\n

"},{"location":"getting-started/installation/#option-3-build-from-source","title":"Option 3: Build from Source","text":"

Requirements: - Go 1.24 or later - Node.js 22 or later - Git

If you prefer to build from source:

# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
"},{"location":"getting-started/installation/#verification","title":"Verification","text":"

Verify your installation by checking the version:

llamactl --version\n
"},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"

Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

"},{"location":"getting-started/quick-start/","title":"Quick Start","text":"

This guide will help you get Llamactl up and running in just a few minutes.

"},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"

Start the Llamactl server:

llamactl\n

By default, Llamactl will start on http://localhost:8080.

"},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"

Open your web browser and navigate to:

http://localhost:8080\n

Log in with the management API key. By default, it is generated during server startup; copy it from the terminal output.

You should see the Llamactl web interface.

"},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":"
  1. Click the \"Add Instance\" button
  2. Fill in the instance configuration:
     - Name: Give your instance a descriptive name
     - Backend Type: Choose from llama.cpp, MLX, or vLLM
     - Model: Model path or identifier for your chosen backend
     - Additional Options: Backend-specific parameters

  3. Click \"Create Instance\"

"},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"

Once created, you can:

"},{"location":"getting-started/quick-start/#example-configurations","title":"Example Configurations","text":"

Here are basic example configurations for each backend:

llama.cpp backend:

{\n  \"name\": \"llama2-7b\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n    \"threads\": 4,\n    \"ctx_size\": 2048,\n    \"gpu_layers\": 32\n  }\n}\n

MLX backend (macOS only):

{\n  \"name\": \"mistral-mlx\",\n  \"backend_type\": \"mlx_lm\",\n  \"backend_options\": {\n    \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n    \"temp\": 0.7,\n    \"max_tokens\": 2048\n  }\n}\n

vLLM backend:

{\n  \"name\": \"dialogpt-vllm\",\n  \"backend_type\": \"vllm\",\n  \"backend_options\": {\n    \"model\": \"microsoft/DialoGPT-medium\",\n    \"tensor_parallel_size\": 2,\n    \"gpu_memory_utilization\": 0.9\n  }\n}\n

"},{"location":"getting-started/quick-start/#docker-support","title":"Docker Support","text":"

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:

backends:\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    docker:\n      enabled: true\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n
"},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"

You can also manage instances via the REST API:

# List all instances\ncurl http://localhost:8080/api/v1/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\"\n    }\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/start\n
"},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

"},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n
"},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"

You can also use the official OpenAI Python client:

from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"your-inference-api-key\"  # Inference auth is enabled by default (require_inference_auth: true)\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
"},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"

Get a list of running instances (models) in OpenAI-compatible format:

curl http://localhost:8080/v1/models\n
"},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":""},{"location":"user-guide/api-reference/","title":"API Reference","text":"

Complete reference for the Llamactl REST API.

"},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"

All API endpoints are relative to the base URL:

http://localhost:8080/api/v1\n
"},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"

Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

curl -H \"Authorization: Bearer <your-api-key>\" \\\n  http://localhost:8080/api/v1/instances\n

The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints

"},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"

Get the version information of the llamactl server.

GET /api/v1/version\n

Response:

Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n

"},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"

Get help text for the llama-server command.

GET /api/v1/server/help\n

Response: Plain text help output from llama-server --help

"},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"

Get version information of the llama-server binary.

GET /api/v1/server/version\n

Response: Plain text version output from llama-server --version

"},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"

List available devices for llama-server.

GET /api/v1/server/devices\n

Response: Plain text device list from llama-server --list-devices

"},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"

Get a list of all instances.

GET /api/v1/instances\n

Response:

[\n  {\n    \"name\": \"llama2-7b\",\n    \"status\": \"running\",\n    \"created\": 1705312200\n  }\n]\n

"},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"

Get detailed information about a specific instance.

GET /api/v1/instances/{name}\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"

Create and start a new instance.

POST /api/v1/instances/{name}\n

Request Body: JSON object with instance configuration. Common fields include:

See Managing Instances for complete configuration options.

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"

Update an existing instance configuration. See Managing Instances for available configuration options.

PUT /api/v1/instances/{name}\n

Request Body: JSON object with configuration fields to update.

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"

Stop and remove an instance.

DELETE /api/v1/instances/{name}\n

Response: 204 No Content

"},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"

Start a stopped instance.

POST /api/v1/instances/{name}/start\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

Error Responses: - 409 Conflict: Maximum number of running instances reached - 500 Internal Server Error: Failed to start instance

"},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"

Stop a running instance.

POST /api/v1/instances/{name}/stop\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"stopped\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"

Restart an instance (stop then start).

POST /api/v1/instances/{name}/restart\n

Response:

{\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

"},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"

Retrieve instance logs.

GET /api/v1/instances/{name}/logs\n

Query Parameters: - lines: Number of lines to return (default: all lines, use -1 for all)

Response: Plain text log output

Example:

curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n

"},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"

Proxy HTTP requests directly to the llama-server instance.

GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n

This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

Example - Check Instance Health:

curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/proxy/health\n

This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

Error Responses: - 503 Service Unavailable: Instance is not running

"},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"

Llamactl provides OpenAI-compatible endpoints for inference operations.

"},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"

List all instances in OpenAI-compatible format.

GET /v1/models\n

Response:

{\n  \"object\": \"list\",\n  \"data\": [\n    {\n      \"id\": \"llama2-7b\",\n      \"object\": \"model\",\n      \"created\": 1705312200,\n      \"owned_by\": \"llamactl\"\n    }\n  ]\n}\n

"},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"

All OpenAI-compatible inference endpoints are available:

POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n

Request Body: Standard OpenAI format with model field specifying the instance name

Example:

{\n  \"model\": \"llama2-7b\",\n  \"messages\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Hello, how are you?\"\n    }\n  ]\n}\n

The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

Error Responses: - 400 Bad Request: Invalid request body or missing instance name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit

"},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"

Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed

"},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"

All endpoints may return error responses in the following format:

{\n  \"error\": \"Error message description\"\n}\n
"},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":""},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"
# Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-2-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0\",\n      \"OMP_NUM_THREADS\": \"8\"\n    }\n  }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello!\"}\n    ],\n    \"max_tokens\": 100\n  }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n
"},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"

You can also directly proxy requests to the llama-server instance:

# Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"prompt\": \"Hello, world!\",\n    \"n_predict\": 50\n  }'\n
"},{"location":"user-guide/api-reference/#backend-specific-endpoints","title":"Backend-Specific Endpoints","text":""},{"location":"user-guide/api-reference/#parse-commands","title":"Parse Commands","text":"

Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

"},{"location":"user-guide/api-reference/#parse-llamacpp-command","title":"Parse Llama.cpp Command","text":"

Parse a llama-server command string into instance options.

POST /api/v1/backends/llama-cpp/parse-command\n

Request Body:

{\n  \"command\": \"llama-server -m /path/to/model.gguf -c 2048 --port 8080\"\n}\n

Response:

{\n  \"backend_type\": \"llama_cpp\",\n  \"llama_server_options\": {\n    \"model\": \"/path/to/model.gguf\",\n    \"ctx_size\": 2048,\n    \"port\": 8080\n  }\n}\n

"},{"location":"user-guide/api-reference/#parse-mlx-lm-command","title":"Parse MLX-LM Command","text":"

Parse an MLX-LM server command string into instance options.

POST /api/v1/backends/mlx/parse-command\n

Request Body:

{\n  \"command\": \"mlx_lm.server --model /path/to/model --port 8080\"\n}\n

Response:

{\n  \"backend_type\": \"mlx_lm\",\n  \"mlx_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

"},{"location":"user-guide/api-reference/#parse-vllm-command","title":"Parse vLLM Command","text":"

Parse a vLLM serve command string into instance options.

POST /api/v1/backends/vllm/parse-command\n

Request Body:

{\n  \"command\": \"vllm serve /path/to/model --port 8080\"\n}\n

Response:

{\n  \"backend_type\": \"vllm\",\n  \"vllm_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

Error Responses for Parse Commands: - 400 Bad Request: Invalid request body, empty command, or parse error - 500 Internal Server Error: Encoding error

"},{"location":"user-guide/api-reference/#auto-generated-documentation","title":"Auto-Generated Documentation","text":"

The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

  1. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
  2. Generate docs: swag init -g cmd/server/main.go -o apidocs
"},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"

If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

http://localhost:8080/swagger/\n

This provides a complete interactive interface for testing all API endpoints.

"},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"

Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

"},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"

Llamactl provides two ways to manage instances:

"},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"

If authentication is enabled: 1. Navigate to the web UI 2. Enter your credentials 3. Bearer token is stored for the session

"},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":""},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"

Each instance is displayed as a card showing:

"},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
  1. Click the \"Create Instance\" button on the dashboard
  2. Enter a unique Name for your instance (only required field)
  3. Choose Backend Type:
  4. Configure model source:
  5. Configure optional instance management settings:
  6. Configure backend-specific options:
  7. Click \"Create\" to save the instance
"},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"
# Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/v1/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/v1/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"top_p\": 0.9,\n      \"max_tokens\": 2048\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3\n  }'\n\n# Create vLLM instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-vllm-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"vllm\",\n    \"backend_options\": {\n      \"model\": \"microsoft/DialoGPT-medium\",\n      \"tensor_parallel_size\": 2,\n      \"gpu_memory_utilization\": 0.9\n    },\n    \"auto_restart\": true,\n    \"on_demand_start\": true,\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0,1\",\n      \"NCCL_DEBUG\": \"INFO\",\n      \"PYTHONPATH\": \"/custom/path\"\n    }\n  }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/v1/instances/gemma-3-27b \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n      \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n      \"gpu_layers\": 32\n    }\n  }'\n
"},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
  1. Click the \"Start\" button on an instance card
  2. Watch the status change to \"Unknown\"
  3. Monitor progress in the logs
  4. Instance status changes to \"Ready\" when ready
"},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"
curl -X POST http://localhost:8080/api/v1/instances/{name}/start\n
"},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
  1. Click the \"Stop\" button on an instance card
  2. Instance gracefully shuts down
"},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"
curl -X POST http://localhost:8080/api/v1/instances/{name}/stop\n
"},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
  1. Click the \"Edit\" button on an instance card
  2. Modify settings in the configuration dialog
  3. Changes require instance restart to take effect
  4. Click \"Update & Restart\" to apply changes
"},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"

Modify instance settings:

curl -X PUT http://localhost:8080/api/v1/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"ctx_size\": 4096\n    }\n  }'\n

Note

Configuration changes require restarting the instance to take effect.

"},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
  1. Click the \"Logs\" button on any instance card
  2. Real-time log viewer opens
"},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"

Retrieve the instance logs:

# Get instance logs\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n
"},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
  1. Click the \"Delete\" button on an instance card
  2. Only stopped instances can be deleted
  3. Confirm deletion in the dialog
"},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"
curl -X DELETE http://localhost:8080/api/v1/instances/{name}\n
"},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"

Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

# Proxy a request to the underlying backend instance\ncurl http://localhost:8080/api/v1/instances/{name}/proxy/\n

All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs

"},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
  1. The health status badge is displayed on each instance card
"},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"

Check the health status of your instances:

curl http://localhost:8080/api/v1/instances/{name}/proxy/health\n
"},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"

Issues specific to Llamactl deployment and operation.

"},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"

Problem: Invalid configuration preventing startup

Solutions: 1. Use minimal configuration:

server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n

  2. Check data directory permissions:
    # Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
"},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"

Problem: Instance fails to start with model loading errors

Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files

"},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"

Problem: Out of memory errors or system becomes unresponsive

Solutions: 1. Reduce context size:

{\n  \"n_ctx\": 1024\n}\n

  2. Use quantized models:
     - Try Q4_K_M instead of higher precision models
     - Use smaller model variants (7B instead of 13B)
"},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"

Problem: GPU not being used effectively

Solutions: 1. Configure GPU layers:

{\n  \"n_gpu_layers\": 35\n}\n

"},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"

Problem: Complex model loading, performance, or compatibility issues

Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

Resources: - llama.cpp Documentation: https://github.com/ggml-org/llama.cpp - llama.cpp Issues: https://github.com/ggml-org/llama.cpp/issues - llama.cpp Discussions: https://github.com/ggml-org/llama.cpp/discussions

Testing directly with llama-server:

# Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n

This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.

"},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"

Problem: Web UI shows CORS errors in browser console

Solutions: 1. Configure allowed origins:

server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n

"},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"

Problem: API requests failing with authentication errors

Solutions: 1. Disable authentication temporarily:

auth:\n  require_management_auth: false\n  require_inference_auth: false\n

  2. Configure API keys:

    auth:\n  management_keys:\n    - \"your-management-key\"\n  inference_keys:\n    - \"your-inference-key\"\n

  3. Use the correct Authorization header:

    curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n

"},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
# Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
"},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"
export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
"},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"

When reporting issues, include:

  1. System information:

    llamactl --version\n

  2. Configuration file (remove sensitive keys)

  3. Relevant log output

  4. Steps to reproduce the issue

"}]} \ No newline at end of file diff --git a/dev/sitemap.xml b/dev/sitemap.xml index fddac5e..0038974 100644 --- a/dev/sitemap.xml +++ b/dev/sitemap.xml @@ -2,37 +2,37 @@ https://llamactl.org/dev/ - 2025-09-29 + 2025-10-04 daily https://llamactl.org/dev/getting-started/configuration/ - 2025-09-29 + 2025-10-04 daily https://llamactl.org/dev/getting-started/installation/ - 2025-09-29 + 2025-10-04 daily https://llamactl.org/dev/getting-started/quick-start/ - 2025-09-29 + 2025-10-04 daily https://llamactl.org/dev/user-guide/api-reference/ - 2025-09-29 + 2025-10-04 daily https://llamactl.org/dev/user-guide/managing-instances/ - 2025-09-29 + 2025-10-04 daily https://llamactl.org/dev/user-guide/troubleshooting/ - 2025-09-29 + 2025-10-04 daily \ No newline at end of file diff --git a/dev/sitemap.xml.gz b/dev/sitemap.xml.gz index f17f5b1..ba6f7c1 100644 Binary files a/dev/sitemap.xml.gz and b/dev/sitemap.xml.gz differ