diff --git a/dev/__pycache__/readme_sync.cpython-311.pyc b/dev/__pycache__/readme_sync.cpython-311.pyc index 33a8a20..59fcd5b 100644 Binary files a/dev/__pycache__/readme_sync.cpython-311.pyc and b/dev/__pycache__/readme_sync.cpython-311.pyc differ diff --git a/dev/getting-started/configuration/index.html b/dev/getting-started/configuration/index.html index 02ee7c6..9487c8a 100644 --- a/dev/getting-started/configuration/index.html +++ b/dev/getting-started/configuration/index.html @@ -854,49 +854,52 @@ llama-cpp: command: "llama-server" args: [] - environment: {} # Environment variables for the backend process + environment: {} # Environment variables for the backend process docker: enabled: false image: "ghcr.io/ggml-org/llama.cpp:server" args: ["run", "--rm", "--network", "host", "--gpus", "all"] environment: {} - - vllm: - command: "vllm" - args: ["serve"] - environment: {} # Environment variables for the backend process - docker: - enabled: false - image: "vllm/vllm-openai:latest" - args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"] - environment: {} - - mlx: - command: "mlx_lm.server" - args: [] - environment: {} # Environment variables for the backend process - -instances: - port_range: [8000, 9000] # Port range for instances - data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below) - configs_dir: ~/.local/share/llamactl/instances # Instance configs directory - logs_dir: ~/.local/share/llamactl/logs # Logs directory - auto_create_dirs: true # Auto-create data/config/logs dirs if missing - max_instances: -1 # Max instances (-1 = unlimited) - max_running_instances: -1 # Max running instances (-1 = unlimited) - enable_lru_eviction: true # Enable LRU eviction for idle instances - default_auto_restart: true # Auto-restart new instances by default - default_max_restarts: 3 # Max restarts for new instances - default_restart_delay: 5 # Restart delay (seconds) for new instances - default_on_demand_start: true # 
Default on-demand start setting - on_demand_start_timeout: 120 # Default on-demand start timeout in seconds - timeout_check_interval: 5 # Idle instance timeout check in minutes - -auth: - require_inference_auth: true # Require auth for inference endpoints - inference_keys: [] # Keys for inference endpoints - require_management_auth: true # Require auth for management endpoints - management_keys: [] # Keys for management endpoints + response_headers: {} # Additional response headers to send with responses + + vllm: + command: "vllm" + args: ["serve"] + environment: {} # Environment variables for the backend process + docker: + enabled: false + image: "vllm/vllm-openai:latest" + args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"] + environment: {} + response_headers: {} # Additional response headers to send with responses + + mlx: + command: "mlx_lm.server" + args: [] + environment: {} # Environment variables for the backend process + response_headers: {} # Additional response headers to send with responses + +instances: + port_range: [8000, 9000] # Port range for instances + data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below) + configs_dir: ~/.local/share/llamactl/instances # Instance configs directory + logs_dir: ~/.local/share/llamactl/logs # Logs directory + auto_create_dirs: true # Auto-create data/config/logs dirs if missing + max_instances: -1 # Max instances (-1 = unlimited) + max_running_instances: -1 # Max running instances (-1 = unlimited) + enable_lru_eviction: true # Enable LRU eviction for idle instances + default_auto_restart: true # Auto-restart new instances by default + default_max_restarts: 3 # Max restarts for new instances + default_restart_delay: 5 # Restart delay (seconds) for new instances + default_on_demand_start: true # Default on-demand start setting + on_demand_start_timeout: 120 # Default on-demand start timeout in seconds + timeout_check_interval: 5 # Idle instance timeout check 
in minutes + +auth: + require_inference_auth: true # Require auth for inference endpoints + inference_keys: [] # Keys for inference endpoints + require_management_auth: true # Require auth for management endpoints + management_keys: [] # Keys for management endpoints
Backend Configuration Fields:
- command: Executable name/path for the backend
- args: Default arguments prepended to all instances
- environment: Environment variables for the backend process (optional)
+- response_headers: Additional response headers to send with responses (optional)
- docker: Docker-specific configuration (optional)
- enabled: Boolean flag to enable Docker runtime
- image: Docker image to use
- args: Additional arguments passed to docker run
- environment: Environment variables for the container (optional)
+If llamactl is behind an NGINX proxy,
+the "X-Accel-Buffering: no" response header may be required for NGINX to properly stream the responses without buffering.
Environment Variables:
LlamaCpp Backend:
- LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command
@@ -973,7 +983,8 @@
- LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false)
- LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use
- LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments
-- LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2"
LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"
VLLM Backend:
- LLAMACTL_VLLM_COMMAND - VLLM executable command
- LLAMACTL_VLLM_ARGS - Space-separated default arguments
@@ -981,11 +992,13 @@
- LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false)
- LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use
- LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments
-- LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2"
LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"
MLX Backend:
- LLAMACTL_MLX_COMMAND - MLX executable command
- LLAMACTL_MLX_ARGS - Space-separated default arguments
-- LLAMACTL_MLX_ENV - Environment variables in format "KEY1=value1,KEY2=value2"
LLAMACTL_MLX_ENV - Environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"
instances:
port_range: [8000, 9000] # Port range for instances (default: [8000, 9000])
@@ -1055,7 +1068,7 @@
- September 28, 2025
+ September 29, 2025
diff --git a/dev/search/search_index.json b/dev/search/search_index.json
index a71adec..9f8cfc2 100644
--- a/dev/search/search_index.json
+++ b/dev/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"Welcome to the Llamactl documentation!
"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.
"},{"location":"#features","title":"Features","text":""},{"location":"#easy-model-management","title":"\ud83d\ude80 Easy Model Management","text":" - Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality)
- On-Demand Instance Start: Automatically launch instances upon receiving API requests
- State Persistence: Ensure instances remain intact across server restarts
"},{"location":"#universal-compatibility","title":"\ud83d\udd17 Universal Compatibility","text":" - OpenAI API Compatible: Drop-in replacement - route requests by instance name
- Multi-Backend Support: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
- Docker Support: Run backends in containers
"},{"location":"#user-friendly-interface","title":"\ud83c\udf10 User-Friendly Interface","text":" - Web Dashboard: Modern React UI for visual management (unlike CLI-only tools)
- API Key Authentication: Separate keys for management vs inference access
"},{"location":"#smart-operations","title":"\u26a1 Smart Operations","text":" - Instance Monitoring: Health checks, auto-restart, log management
- Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits
- Environment Variables: Set custom environment variables per instance for advanced configuration
"},{"location":"#quick-links","title":"Quick Links","text":" - Installation Guide - Get Llamactl up and running
- Configuration Guide - Detailed configuration options
- Quick Start - Your first steps with Llamactl
- Managing Instances - Instance lifecycle management
- API Reference - Complete API documentation
"},{"location":"#getting-help","title":"Getting Help","text":"If you need help or have questions:
- Check the Troubleshooting guide
- Visit the GitHub repository
- Review the Configuration Guide for advanced settings
"},{"location":"#license","title":"License","text":"MIT License - see the LICENSE file.
"},{"location":"getting-started/configuration/","title":"Configuration","text":"llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:
Defaults < Configuration file < Environment variables\n
llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.
"},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"Here's the default configuration with all available options:
server:\n host: \"0.0.0.0\" # Server host to bind to\n port: 8080 # Server port to bind to\n allowed_origins: [\"*\"] # Allowed CORS origins (default: all)\n enable_swagger: false # Enable Swagger UI for API docs\n\nbackends:\n llama-cpp:\n command: \"llama-server\"\n args: []\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false\n image: \"ghcr.io/ggml-org/llama.cpp:server\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n environment: {}\n\n vllm:\n command: \"vllm\"\n args: [\"serve\"]\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false\n image: \"vllm/vllm-openai:latest\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n environment: {}\n\n mlx:\n command: \"mlx_lm.server\"\n args: []\n environment: {} # Environment variables for the backend process\n\ninstances:\n port_range: [8000, 9000] # Port range for instances\n data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below)\n configs_dir: ~/.local/share/llamactl/instances # Instance configs directory\n logs_dir: ~/.local/share/llamactl/logs # Logs directory\n auto_create_dirs: true # Auto-create data/config/logs dirs if missing\n max_instances: -1 # Max instances (-1 = unlimited)\n max_running_instances: -1 # Max running instances (-1 = unlimited)\n enable_lru_eviction: true # Enable LRU eviction for idle instances\n default_auto_restart: true # Auto-restart new instances by default\n default_max_restarts: 3 # Max restarts for new instances\n default_restart_delay: 5 # Restart delay (seconds) for new instances\n default_on_demand_start: true # Default on-demand start setting\n on_demand_start_timeout: 120 # Default on-demand start timeout in seconds\n timeout_check_interval: 5 # Idle instance timeout check in minutes\n\nauth:\n require_inference_auth: true # Require auth for inference endpoints\n inference_keys: [] # Keys for 
inference endpoints\n require_management_auth: true # Require auth for management endpoints\n management_keys: [] # Keys for management endpoints\n
"},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"Configuration files are searched in the following locations (in order of precedence):
Linux: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/.config/llamactl/config.yaml - /etc/llamactl/config.yaml
macOS: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/Library/Application Support/llamactl/config.yaml - /Library/Application Support/llamactl/config.yaml
Windows: - ./llamactl.yaml or ./config.yaml (current directory) - %APPDATA%\\llamactl\\config.yaml - %USERPROFILE%\\llamactl\\config.yaml - %PROGRAMDATA%\\llamactl\\config.yaml
You can specify the path to config file with LLAMACTL_CONFIG_PATH environment variable.
"},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"server:\n host: \"0.0.0.0\" # Server host to bind to (default: \"0.0.0.0\")\n port: 8080 # Server port to bind to (default: 8080)\n allowed_origins: [\"*\"] # CORS allowed origins (default: [\"*\"])\n enable_swagger: false # Enable Swagger UI (default: false)\n
Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)
"},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"backends:\n llama-cpp:\n command: \"llama-server\"\n args: []\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false # Enable Docker runtime (default: false)\n image: \"ghcr.io/ggml-org/llama.cpp:server\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n environment: {}\n\n vllm:\n command: \"vllm\"\n args: [\"serve\"]\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false\n image: \"vllm/vllm-openai:latest\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n environment: {}\n\n mlx:\n command: \"mlx_lm.server\"\n args: []\n environment: {} # Environment variables for the backend process\n # MLX does not support Docker\n
Backend Configuration Fields: - command: Executable name/path for the backend - args: Default arguments prepended to all instances - environment: Environment variables for the backend process (optional) - docker: Docker-specific configuration (optional) - enabled: Boolean flag to enable Docker runtime - image: Docker image to use - args: Additional arguments passed to docker run - environment: Environment variables for the container (optional)
Environment Variables:
LlamaCpp Backend: - LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command - LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments - LLAMACTL_LLAMACPP_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use - LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\"
VLLM Backend: - LLAMACTL_VLLM_COMMAND - VLLM executable command - LLAMACTL_VLLM_ARGS - Space-separated default arguments - LLAMACTL_VLLM_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use - LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\"
MLX Backend: - LLAMACTL_MLX_COMMAND - MLX executable command - LLAMACTL_MLX_ARGS - Space-separated default arguments - LLAMACTL_MLX_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\"
"},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"instances:\n port_range: [8000, 9000] # Port range for instances (default: [8000, 9000])\n data_dir: \"~/.local/share/llamactl\" # Directory for all llamactl data (default varies by OS)\n configs_dir: \"~/.local/share/llamactl/instances\" # Directory for instance configs (default: data_dir/instances)\n logs_dir: \"~/.local/share/llamactl/logs\" # Directory for instance logs (default: data_dir/logs)\n auto_create_dirs: true # Automatically create data/config/logs directories (default: true)\n max_instances: -1 # Maximum instances (-1 = unlimited)\n max_running_instances: -1 # Maximum running instances (-1 = unlimited)\n enable_lru_eviction: true # Enable LRU eviction for idle instances\n default_auto_restart: true # Default auto-restart setting\n default_max_restarts: 3 # Default maximum restart attempts\n default_restart_delay: 5 # Default restart delay in seconds\n default_on_demand_start: true # Default on-demand start setting\n on_demand_start_timeout: 120 # Default on-demand start timeout in seconds\n timeout_check_interval: 5 # Default instance timeout check interval in minutes\n
Environment Variables: - LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\") - LLAMACTL_DATA_DIRECTORY - Data directory path - LLAMACTL_INSTANCES_DIR - Instance configs directory path - LLAMACTL_LOGS_DIR - Log directory path - LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false) - LLAMACTL_MAX_INSTANCES - Maximum number of instances - LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances - LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances - LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false) - LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts - LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds - LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false) - LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds - LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes
"},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"auth:\n require_inference_auth: true # Require API key for OpenAI endpoints (default: true)\n inference_keys: [] # List of valid inference API keys\n require_management_auth: true # Require API key for management endpoints (default: true)\n management_keys: [] # List of valid management API keys\n
Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys
"},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"View all available command line options:
llamactl --help\n
You can also override configuration using command line flags when starting llamactl.
"},{"location":"getting-started/installation/","title":"Installation","text":"This guide will walk you through installing Llamactl on your system.
"},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"llamactl supports multiple backends. Install at least one:
For llama.cpp backend (all platforms):
You need llama-server from llama.cpp installed:
# Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n
Or build from source - see llama.cpp docs
For MLX backend (macOS only):
MLX provides optimized inference on Apple Silicon. Install MLX-LM:
# Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n
Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)
For vLLM backend:
vLLM provides high-throughput distributed serving for LLMs. Install vLLM:
# Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
"},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"Download the latest release from the GitHub releases page:
# Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
"},{"location":"getting-started/installation/#option-2-build-from-source","title":"Option 2: Build from Source","text":"Requirements: - Go 1.24 or later - Node.js 22 or later - Git
If you prefer to build from source:
# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
"},{"location":"getting-started/installation/#verification","title":"Verification","text":"Verify your installation by checking the version:
llamactl --version\n
"},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!
"},{"location":"getting-started/quick-start/","title":"Quick Start","text":"This guide will help you get Llamactl up and running in just a few minutes.
"},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"Start the Llamactl server:
llamactl\n
By default, Llamactl will start on http://localhost:8080.
"},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"Open your web browser and navigate to:
http://localhost:8080\n
Login with the management API key. By default it is generated during server startup. Copy it from the terminal output.
You should see the Llamactl web interface.
"},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":" - Click the \"Add Instance\" button
- Fill in the instance configuration:
- Name: Give your instance a descriptive name
- Backend Type: Choose from llama.cpp, MLX, or vLLM
- Model: Model path or identifier for your chosen backend
-
Additional Options: Backend-specific parameters
-
Click \"Create Instance\"
"},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"Once created, you can:
- Start the instance by clicking the start button
- Monitor its status in real-time
- View logs by clicking the logs button
- Stop the instance when needed
"},{"location":"getting-started/quick-start/#example-configurations","title":"Example Configurations","text":"Here are basic example configurations for each backend:
llama.cpp backend:
{\n \"name\": \"llama2-7b\",\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n \"threads\": 4,\n \"ctx_size\": 2048,\n \"gpu_layers\": 32\n }\n}\n
MLX backend (macOS only):
{\n \"name\": \"mistral-mlx\",\n \"backend_type\": \"mlx_lm\",\n \"backend_options\": {\n \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n \"temp\": 0.7,\n \"max_tokens\": 2048\n }\n}\n
vLLM backend:
{\n \"name\": \"dialogpt-vllm\",\n \"backend_type\": \"vllm\",\n \"backend_options\": {\n \"model\": \"microsoft/DialoGPT-medium\",\n \"tensor_parallel_size\": 2,\n \"gpu_memory_utilization\": 0.9\n }\n}\n
"},{"location":"getting-started/quick-start/#docker-support","title":"Docker Support","text":"Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:
backends:\n vllm:\n command: \"vllm\"\n args: [\"serve\"]\n docker:\n enabled: true\n image: \"vllm/vllm-openai:latest\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n
"},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"You can also manage instances via the REST API:
# List all instances\ncurl http://localhost:8080/api/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/instances/my-model \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/path/to/model.gguf\"\n }\n }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/instances/my-model/start\n
"},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.
"},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:
curl -X POST http://localhost:8080/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"my-model\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello! Can you help me write a Python function?\"\n }\n ],\n \"max_tokens\": 150,\n \"temperature\": 0.7\n }'\n
"},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"You can also use the official OpenAI Python client:
from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n base_url=\"http://localhost:8080/v1\",\n api_key=\"not-needed\" # Llamactl doesn't require API keys by default\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n model=\"my-model\", # Use the name of your instance\n messages=[\n {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n ],\n max_tokens=200,\n temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
"},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"Get a list of running instances (models) in OpenAI-compatible format:
curl http://localhost:8080/v1/models\n
"},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":" - Manage instances Managing Instances
- Explore the API Reference
- Configure advanced settings in the Configuration guide
"},{"location":"user-guide/api-reference/","title":"API Reference","text":"Complete reference for the Llamactl REST API.
"},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"All API endpoints are relative to the base URL:
http://localhost:8080/api/v1\n
"},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:
curl -H \"Authorization: Bearer <your-api-key>\" \\\n http://localhost:8080/api/v1/instances\n
The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints
"},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"Get the version information of the llamactl server.
GET /api/v1/version\n
Response:
Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n
"},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"Get help text for the llama-server command.
GET /api/v1/server/help\n
Response: Plain text help output from llama-server --help
"},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"Get version information of the llama-server binary.
GET /api/v1/server/version\n
Response: Plain text version output from llama-server --version
"},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"List available devices for llama-server.
GET /api/v1/server/devices\n
Response: Plain text device list from llama-server --list-devices
"},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"Get a list of all instances.
GET /api/v1/instances\n
Response:
[\n {\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n }\n]\n
"},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"Get detailed information about a specific instance.
GET /api/v1/instances/{name}\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"Create and start a new instance.
POST /api/v1/instances/{name}\n
Request Body: JSON object with instance configuration. Common fields include:
- backend_type: Backend type (llama_cpp, mlx_lm, or vllm) - backend_options: Backend-specific configuration - auto_restart: Enable automatic restart on failure - max_restarts: Maximum restart attempts - restart_delay: Delay between restarts in seconds - on_demand_start: Start instance when receiving requests - idle_timeout: Idle timeout in minutes - environment: Environment variables as key-value pairs
See Managing Instances for complete configuration options.
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"Update an existing instance configuration. See Managing Instances for available configuration options.
PUT /api/v1/instances/{name}\n
Request Body: JSON object with configuration fields to update.
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"Stop and remove an instance.
DELETE /api/v1/instances/{name}\n
Response: 204 No Content
"},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"Start a stopped instance.
POST /api/v1/instances/{name}/start\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
Error Responses: - 409 Conflict: Maximum number of running instances reached - 500 Internal Server Error: Failed to start instance
"},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"Stop a running instance.
POST /api/v1/instances/{name}/stop\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"stopped\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"Restart an instance (stop then start).
POST /api/v1/instances/{name}/restart\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"Retrieve instance logs.
GET /api/v1/instances/{name}/logs\n
Query Parameters: - lines: Number of lines to return (default: all lines, use -1 for all)
Response: Plain text log output
Example:
curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n
"},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"Proxy HTTP requests directly to the llama-server instance.
GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n
This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.
Example - Check Instance Health:
curl -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model/proxy/health\n
This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.
Error Responses: - 503 Service Unavailable: Instance is not running
"},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"Llamactl provides OpenAI-compatible endpoints for inference operations.
"},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"List all instances in OpenAI-compatible format.
GET /v1/models\n
Response:
{\n \"object\": \"list\",\n \"data\": [\n {\n \"id\": \"llama2-7b\",\n \"object\": \"model\",\n \"created\": 1705312200,\n \"owned_by\": \"llamactl\"\n }\n ]\n}\n
"},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"All OpenAI-compatible inference endpoints are available:
POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n
Request Body: Standard OpenAI format with model field specifying the instance name
Example:
{\n \"model\": \"llama2-7b\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello, how are you?\"\n }\n ]\n}\n
The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.
Error Responses: - 400 Bad Request: Invalid request body or missing instance name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit
"},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed
"},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"All endpoints may return error responses in the following format:
{\n \"error\": \"Error message description\"\n}\n
"},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":" 200: Success 201: Created 204: No Content (successful deletion) 400: Bad Request (invalid parameters or request body) 401: Unauthorized (missing or invalid API key) 403: Forbidden (insufficient permissions) 404: Not Found (instance not found) 409: Conflict (instance already exists, max instances reached) 500: Internal Server Error 503: Service Unavailable (instance not running)
"},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"# Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n -H \"Content-Type: application/json\" \\\n -H \"Authorization: Bearer your-api-key\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/models/llama-2-7b.gguf\",\n \"gpu_layers\": 32\n },\n \"environment\": {\n \"CUDA_VISIBLE_DEVICES\": \"0\",\n \"OMP_NUM_THREADS\": \"8\"\n }\n }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -H \"Authorization: Bearer your-inference-api-key\" \\\n -d '{\n \"model\": \"my-model\",\n \"messages\": [\n {\"role\": \"user\", \"content\": \"Hello!\"}\n ],\n \"max_tokens\": 100\n }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model\n
"},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"You can also directly proxy requests to the llama-server instance:
# Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n -H \"Content-Type: application/json\" \\\n -H \"Authorization: Bearer your-api-key\" \\\n -d '{\n \"prompt\": \"Hello, world!\",\n \"n_predict\": 50\n }'\n
"},{"location":"user-guide/api-reference/#backend-specific-endpoints","title":"Backend-Specific Endpoints","text":""},{"location":"user-guide/api-reference/#parse-commands","title":"Parse Commands","text":"Llamactl provides endpoints to parse command strings from different backends into instance configuration options.
"},{"location":"user-guide/api-reference/#parse-llamacpp-command","title":"Parse Llama.cpp Command","text":"Parse a llama-server command string into instance options.
POST /api/v1/backends/llama-cpp/parse-command\n
Request Body:
{\n \"command\": \"llama-server -m /path/to/model.gguf -c 2048 --port 8080\"\n}\n
Response:
{\n \"backend_type\": \"llama_cpp\",\n \"llama_server_options\": {\n \"model\": \"/path/to/model.gguf\",\n \"ctx_size\": 2048,\n \"port\": 8080\n }\n}\n
"},{"location":"user-guide/api-reference/#parse-mlx-lm-command","title":"Parse MLX-LM Command","text":"Parse an MLX-LM server command string into instance options.
POST /api/v1/backends/mlx/parse-command\n
Request Body:
{\n \"command\": \"mlx_lm.server --model /path/to/model --port 8080\"\n}\n
Response:
{\n \"backend_type\": \"mlx_lm\",\n \"mlx_server_options\": {\n \"model\": \"/path/to/model\",\n \"port\": 8080\n }\n}\n
"},{"location":"user-guide/api-reference/#parse-vllm-command","title":"Parse vLLM Command","text":"Parse a vLLM serve command string into instance options.
POST /api/v1/backends/vllm/parse-command\n
Request Body:
{\n \"command\": \"vllm serve /path/to/model --port 8080\"\n}\n
Response:
{\n \"backend_type\": \"vllm\",\n \"vllm_server_options\": {\n \"model\": \"/path/to/model\",\n \"port\": 8080\n }\n}\n
Error Responses for Parse Commands: - 400 Bad Request: Invalid request body, empty command, or parse error - 500 Internal Server Error: Encoding error
"},{"location":"user-guide/api-reference/#auto-generated-documentation","title":"Auto-Generated Documentation","text":"The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:
- Install the swag tool:
go install github.com/swaggo/swag/cmd/swag@latest - Generate docs:
swag init -g cmd/server/main.go -o apidocs
"},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:
http://localhost:8080/swagger/\n
This provides a complete interactive interface for testing all API endpoints.
"},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.
"},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"Llamactl provides two ways to manage instances:
- Web UI: Accessible at
http://localhost:8080 with an intuitive dashboard - REST API: Programmatic access for automation and integration
"},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"If authentication is enabled: 1. Navigate to the web UI 2. Enter your credentials 3. Bearer token is stored for the session
"},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":" - Switch between light and dark themes
- Setting is remembered across sessions
"},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"Each instance is displayed as a card showing:
- Instance name
- Health status badge (unknown, ready, error, failed)
- Action buttons (start, stop, edit, logs, delete)
"},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":" - Click the \"Create Instance\" button on the dashboard
- Enter a unique Name for your instance (only required field)
- Choose Backend Type:
- llama.cpp: For GGUF models using llama-server
- MLX: For MLX-optimized models (macOS only)
- vLLM: For distributed serving and high-throughput inference
- Configure model source:
- For llama.cpp: GGUF model path or HuggingFace repo
- For MLX: MLX model path or identifier (e.g.,
mlx-community/Mistral-7B-Instruct-v0.3-4bit) - For vLLM: HuggingFace model identifier (e.g.,
microsoft/DialoGPT-medium)
- Configure optional instance management settings:
- Auto Restart: Automatically restart instance on failure
- Max Restarts: Maximum number of restart attempts
- Restart Delay: Delay in seconds between restart attempts
- On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
- Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
- Environment Variables: Set custom environment variables for the instance process
- Configure backend-specific options:
- llama.cpp: Threads, context size, GPU layers, port, etc.
- MLX: Temperature, top-p, adapter path, Python environment, etc.
- vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
- Click \"Create\" to save the instance
"},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"# Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/instances/my-llama-instance \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/path/to/model.gguf\",\n \"threads\": 8,\n \"ctx_size\": 4096,\n \"gpu_layers\": 32\n }\n }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/instances/my-mlx-instance \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"mlx_lm\",\n \"backend_options\": {\n \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n \"temp\": 0.7,\n \"top_p\": 0.9,\n \"max_tokens\": 2048\n },\n \"auto_restart\": true,\n \"max_restarts\": 3\n }'\n\n# Create vLLM instance\ncurl -X POST http://localhost:8080/api/instances/my-vllm-instance \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"vllm\",\n \"backend_options\": {\n \"model\": \"microsoft/DialoGPT-medium\",\n \"tensor_parallel_size\": 2,\n \"gpu_memory_utilization\": 0.9\n },\n \"auto_restart\": true,\n \"on_demand_start\": true,\n \"environment\": {\n \"CUDA_VISIBLE_DEVICES\": \"0,1\",\n \"NCCL_DEBUG\": \"INFO\",\n \"PYTHONPATH\": \"/custom/path\"\n }\n }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/instances/gemma-3-27b \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n \"gpu_layers\": 32\n }\n }'\n
"},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":" - Click the \"Start\" button on an instance card
- Watch the status change to \"Unknown\"
- Monitor progress in the logs
- Instance status changes to \"Ready\" when ready
"},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"curl -X POST http://localhost:8080/api/instances/{name}/start\n
"},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":" - Click the \"Stop\" button on an instance card
- Instance gracefully shuts down
"},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"curl -X POST http://localhost:8080/api/instances/{name}/stop\n
"},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":" - Click the \"Edit\" button on an instance card
- Modify settings in the configuration dialog
- Changes require instance restart to take effect
- Click \"Update & Restart\" to apply changes
"},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"Modify instance settings:
curl -X PUT http://localhost:8080/api/v1/instances/{name} \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_options\": {\n \"threads\": 8,\n \"ctx_size\": 4096\n }\n }'\n
Note
Configuration changes require restarting the instance to take effect.
"},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":" - Click the \"Logs\" button on any instance card
- Real-time log viewer opens
"},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"Check instance status in real-time:
# Get instance logs\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n
"},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":" - Click the \"Delete\" button on an instance card
- Only stopped instances can be deleted
- Confirm deletion in the dialog
"},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"curl -X DELETE http://localhost:8080/api/instances/{name}\n
"},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).
# Send a request to the backend through the instance proxy\ncurl http://localhost:8080/api/v1/instances/{name}/proxy/\n
All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs
"},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":" - The health status badge is displayed on each instance card
"},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"Check the health status of your instances:
curl http://localhost:8080/api/v1/instances/{name}/proxy/health\n
"},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"Issues specific to Llamactl deployment and operation.
"},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"Problem: Invalid configuration preventing startup
Solutions: 1. Use minimal configuration:
server:\n host: \"0.0.0.0\"\n port: 8080\ninstances:\n port_range: [8000, 9000]\n
- Check data directory permissions:
# Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
"},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"Problem: Instance fails to start with model loading errors
Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files
"},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"Problem: Out of memory errors or system becomes unresponsive
Solutions: 1. Reduce context size:
{\n \"n_ctx\": 1024\n}\n
- Use quantized models:
- Try Q4_K_M instead of higher precision models
- Use smaller model variants (7B instead of 13B)
"},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"Problem: GPU not being used effectively
Solutions: 1. Configure GPU layers:
{\n \"n_gpu_layers\": 35\n}\n
"},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"Problem: Complex model loading, performance, or compatibility issues
Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:
Resources: - llama.cpp Documentation: https://github.com/ggml-org/llama.cpp - llama.cpp Issues: https://github.com/ggml-org/llama.cpp/issues - llama.cpp Discussions: https://github.com/ggml-org/llama.cpp/discussions
Testing directly with llama-server:
# Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n
This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.
"},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"Problem: Web UI shows CORS errors in browser console
Solutions: 1. Configure allowed origins:
server:\n allowed_origins:\n - \"http://localhost:3000\"\n - \"https://yourdomain.com\"\n
"},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"Problem: API requests failing with authentication errors
Solutions: 1. Disable authentication temporarily:
auth:\n require_management_auth: false\n require_inference_auth: false\n
-
Configure API keys:
auth:\n management_keys:\n - \"your-management-key\"\n inference_keys:\n - \"your-inference-key\"\n
-
Use correct Authorization header:
curl -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances\n
"},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"# Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
"},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
"},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"When reporting issues, include:
-
System information:
llamactl --version\n
-
Configuration file (remove sensitive keys)
-
Relevant log output
-
Steps to reproduce the issue
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"Welcome to the Llamactl documentation!
"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.
"},{"location":"#features","title":"Features","text":""},{"location":"#easy-model-management","title":"\ud83d\ude80 Easy Model Management","text":" - Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality)
- On-Demand Instance Start: Automatically launch instances upon receiving API requests
- State Persistence: Ensure instances remain intact across server restarts
"},{"location":"#universal-compatibility","title":"\ud83d\udd17 Universal Compatibility","text":" - OpenAI API Compatible: Drop-in replacement - route requests by instance name
- Multi-Backend Support: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
- Docker Support: Run backends in containers
"},{"location":"#user-friendly-interface","title":"\ud83c\udf10 User-Friendly Interface","text":" - Web Dashboard: Modern React UI for visual management (unlike CLI-only tools)
- API Key Authentication: Separate keys for management vs inference access
"},{"location":"#smart-operations","title":"\u26a1 Smart Operations","text":" - Instance Monitoring: Health checks, auto-restart, log management
- Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits
- Environment Variables: Set custom environment variables per instance for advanced configuration
"},{"location":"#quick-links","title":"Quick Links","text":" - Installation Guide - Get Llamactl up and running
- Configuration Guide - Detailed configuration options
- Quick Start - Your first steps with Llamactl
- Managing Instances - Instance lifecycle management
- API Reference - Complete API documentation
"},{"location":"#getting-help","title":"Getting Help","text":"If you need help or have questions:
- Check the Troubleshooting guide
- Visit the GitHub repository
- Review the Configuration Guide for advanced settings
"},{"location":"#license","title":"License","text":"MIT License - see the LICENSE file.
"},{"location":"getting-started/configuration/","title":"Configuration","text":"llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:
Defaults < Configuration file < Environment variables\n
llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.
"},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"Here's the default configuration with all available options:
server:\n host: \"0.0.0.0\" # Server host to bind to\n port: 8080 # Server port to bind to\n allowed_origins: [\"*\"] # Allowed CORS origins (default: all)\n enable_swagger: false # Enable Swagger UI for API docs\n\nbackends:\n llama-cpp:\n command: \"llama-server\"\n args: []\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false\n image: \"ghcr.io/ggml-org/llama.cpp:server\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n environment: {}\n response_headers: {} # Additional response headers to send with responses\n\n vllm:\n command: \"vllm\"\n args: [\"serve\"]\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false\n image: \"vllm/vllm-openai:latest\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n environment: {}\n response_headers: {} # Additional response headers to send with responses\n\n mlx:\n command: \"mlx_lm.server\"\n args: []\n environment: {} # Environment variables for the backend process\n response_headers: {} # Additional response headers to send with responses\n\ninstances:\n port_range: [8000, 9000] # Port range for instances\n data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below)\n configs_dir: ~/.local/share/llamactl/instances # Instance configs directory\n logs_dir: ~/.local/share/llamactl/logs # Logs directory\n auto_create_dirs: true # Auto-create data/config/logs dirs if missing\n max_instances: -1 # Max instances (-1 = unlimited)\n max_running_instances: -1 # Max running instances (-1 = unlimited)\n enable_lru_eviction: true # Enable LRU eviction for idle instances\n default_auto_restart: true # Auto-restart new instances by default\n default_max_restarts: 3 # Max restarts for new instances\n default_restart_delay: 5 # Restart delay (seconds) for new instances\n default_on_demand_start: true # Default on-demand start setting\n on_demand_start_timeout: 120 # 
Default on-demand start timeout in seconds\n timeout_check_interval: 5 # Idle instance timeout check in minutes\n\nauth:\n require_inference_auth: true # Require auth for inference endpoints\n inference_keys: [] # Keys for inference endpoints\n require_management_auth: true # Require auth for management endpoints\n management_keys: [] # Keys for management endpoints\n
"},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"Configuration files are searched in the following locations (in order of precedence):
Linux: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/.config/llamactl/config.yaml - /etc/llamactl/config.yaml
macOS: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/Library/Application Support/llamactl/config.yaml - /Library/Application Support/llamactl/config.yaml
Windows: - ./llamactl.yaml or ./config.yaml (current directory) - %APPDATA%\\llamactl\\config.yaml - %USERPROFILE%\\llamactl\\config.yaml - %PROGRAMDATA%\\llamactl\\config.yaml
You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.
"},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"server:\n host: \"0.0.0.0\" # Server host to bind to (default: \"0.0.0.0\")\n port: 8080 # Server port to bind to (default: 8080)\n allowed_origins: [\"*\"] # CORS allowed origins (default: [\"*\"])\n enable_swagger: false # Enable Swagger UI (default: false)\n
Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)
"},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"backends:\n llama-cpp:\n command: \"llama-server\"\n args: []\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false # Enable Docker runtime (default: false)\n image: \"ghcr.io/ggml-org/llama.cpp:server\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n environment: {}\n response_headers: {} # Additional response headers to send with responses\n\n vllm:\n command: \"vllm\"\n args: [\"serve\"]\n environment: {} # Environment variables for the backend process\n docker:\n enabled: false # Enable Docker runtime (default: false)\n image: \"vllm/vllm-openai:latest\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n environment: {}\n response_headers: {} # Additional response headers to send with responses\n\n mlx:\n command: \"mlx_lm.server\"\n args: []\n environment: {} # Environment variables for the backend process\n # MLX does not support Docker\n response_headers: {} # Additional response headers to send with responses\n
Backend Configuration Fields: - command: Executable name/path for the backend - args: Default arguments prepended to all instances - environment: Environment variables for the backend process (optional) - response_headers: Additional response headers to send with responses (optional) - docker: Docker-specific configuration (optional) - enabled: Boolean flag to enable Docker runtime - image: Docker image to use - args: Additional arguments passed to docker run - environment: Environment variables for the container (optional)
If llamactl is behind an NGINX proxy, the X-Accel-Buffering: no response header may be required so that NGINX streams responses without buffering them.
Environment Variables:
LlamaCpp Backend: - LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command - LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments - LLAMACTL_LLAMACPP_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use - LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"
VLLM Backend: - LLAMACTL_VLLM_COMMAND - VLLM executable command - LLAMACTL_VLLM_ARGS - Space-separated default arguments - LLAMACTL_VLLM_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use - LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"
MLX Backend: - LLAMACTL_MLX_COMMAND - MLX executable command - LLAMACTL_MLX_ARGS - Space-separated default arguments - LLAMACTL_MLX_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"
"},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"instances:\n port_range: [8000, 9000] # Port range for instances (default: [8000, 9000])\n data_dir: \"~/.local/share/llamactl\" # Directory for all llamactl data (default varies by OS)\n configs_dir: \"~/.local/share/llamactl/instances\" # Directory for instance configs (default: data_dir/instances)\n logs_dir: \"~/.local/share/llamactl/logs\" # Directory for instance logs (default: data_dir/logs)\n auto_create_dirs: true # Automatically create data/config/logs directories (default: true)\n max_instances: -1 # Maximum instances (-1 = unlimited)\n max_running_instances: -1 # Maximum running instances (-1 = unlimited)\n enable_lru_eviction: true # Enable LRU eviction for idle instances\n default_auto_restart: true # Default auto-restart setting\n default_max_restarts: 3 # Default maximum restart attempts\n default_restart_delay: 5 # Default restart delay in seconds\n default_on_demand_start: true # Default on-demand start setting\n on_demand_start_timeout: 120 # Default on-demand start timeout in seconds\n timeout_check_interval: 5 # Default instance timeout check interval in minutes\n
Environment Variables: - LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\") - LLAMACTL_DATA_DIRECTORY - Data directory path - LLAMACTL_INSTANCES_DIR - Instance configs directory path - LLAMACTL_LOGS_DIR - Log directory path - LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false) - LLAMACTL_MAX_INSTANCES - Maximum number of instances - LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances - LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances - LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false) - LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts - LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds - LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false) - LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds - LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes
"},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"auth:\n require_inference_auth: true # Require API key for OpenAI endpoints (default: true)\n inference_keys: [] # List of valid inference API keys\n require_management_auth: true # Require API key for management endpoints (default: true)\n management_keys: [] # List of valid management API keys\n
Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys
"},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"View all available command line options:
llamactl --help\n
You can also override configuration using command line flags when starting llamactl.
"},{"location":"getting-started/installation/","title":"Installation","text":"This guide will walk you through installing Llamactl on your system.
"},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"llamactl supports multiple backends. Install at least one:
For llama.cpp backend (all platforms):
You need llama-server from llama.cpp installed:
# Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n
Or build from source - see llama.cpp docs
For MLX backend (macOS only):
MLX provides optimized inference on Apple Silicon. Install MLX-LM:
# Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n
Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)
For vLLM backend:
vLLM provides high-throughput distributed serving for LLMs. Install vLLM:
# Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
"},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"Download the latest release from the GitHub releases page:
# Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
"},{"location":"getting-started/installation/#option-2-build-from-source","title":"Option 2: Build from Source","text":"Requirements: - Go 1.24 or later - Node.js 22 or later - Git
If you prefer to build from source:
# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
"},{"location":"getting-started/installation/#verification","title":"Verification","text":"Verify your installation by checking the version:
llamactl --version\n
"},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!
"},{"location":"getting-started/quick-start/","title":"Quick Start","text":"This guide will help you get Llamactl up and running in just a few minutes.
"},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"Start the Llamactl server:
llamactl\n
By default, Llamactl will start on http://localhost:8080.
"},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"Open your web browser and navigate to:
http://localhost:8080\n
Log in with the management API key. By default, it is generated during server startup; copy it from the terminal output.
You should see the Llamactl web interface.
"},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":" - Click the \"Add Instance\" button
- Fill in the instance configuration:
- Name: Give your instance a descriptive name
- Backend Type: Choose from llama.cpp, MLX, or vLLM
- Model: Model path or identifier for your chosen backend
-
Additional Options: Backend-specific parameters
-
Click \"Create Instance\"
"},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"Once created, you can:
- Start the instance by clicking the start button
- Monitor its status in real-time
- View logs by clicking the logs button
- Stop the instance when needed
"},{"location":"getting-started/quick-start/#example-configurations","title":"Example Configurations","text":"Here are basic example configurations for each backend:
llama.cpp backend:
{\n \"name\": \"llama2-7b\",\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n \"threads\": 4,\n \"ctx_size\": 2048,\n \"gpu_layers\": 32\n }\n}\n
MLX backend (macOS only):
{\n \"name\": \"mistral-mlx\",\n \"backend_type\": \"mlx_lm\",\n \"backend_options\": {\n \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n \"temp\": 0.7,\n \"max_tokens\": 2048\n }\n}\n
vLLM backend:
{\n \"name\": \"dialogpt-vllm\",\n \"backend_type\": \"vllm\",\n \"backend_options\": {\n \"model\": \"microsoft/DialoGPT-medium\",\n \"tensor_parallel_size\": 2,\n \"gpu_memory_utilization\": 0.9\n }\n}\n
"},{"location":"getting-started/quick-start/#docker-support","title":"Docker Support","text":"Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:
backends:\n vllm:\n command: \"vllm\"\n args: [\"serve\"]\n docker:\n enabled: true\n image: \"vllm/vllm-openai:latest\"\n args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n
"},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"You can also manage instances via the REST API:
# List all instances\ncurl http://localhost:8080/api/v1/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/path/to/model.gguf\"\n }\n }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/start\n
"},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.
"},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:
curl -X POST http://localhost:8080/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"my-model\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello! Can you help me write a Python function?\"\n }\n ],\n \"max_tokens\": 150,\n \"temperature\": 0.7\n }'\n
"},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"You can also use the official OpenAI Python client:
from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n base_url=\"http://localhost:8080/v1\",\n api_key=\"your-inference-api-key\" # Inference auth is enabled by default; use one of your inference API keys\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n model=\"my-model\", # Use the name of your instance\n messages=[\n {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n ],\n max_tokens=200,\n temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
"},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"Get a list of running instances (models) in OpenAI-compatible format:
curl http://localhost:8080/v1/models\n
"},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":" - Manage instances Managing Instances
- Explore the API Reference
- Configure advanced settings in the Configuration guide
"},{"location":"user-guide/api-reference/","title":"API Reference","text":"Complete reference for the Llamactl REST API.
"},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"All API endpoints are relative to the base URL:
http://localhost:8080/api/v1\n
"},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:
curl -H \"Authorization: Bearer <your-api-key>\" \\\n http://localhost:8080/api/v1/instances\n
The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints
"},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"Get the version information of the llamactl server.
GET /api/v1/version\n
Response:
Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n
"},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"Get help text for the llama-server command.
GET /api/v1/server/help\n
Response: Plain text help output from llama-server --help
"},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"Get version information of the llama-server binary.
GET /api/v1/server/version\n
Response: Plain text version output from llama-server --version
"},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"List available devices for llama-server.
GET /api/v1/server/devices\n
Response: Plain text device list from llama-server --list-devices
"},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"Get a list of all instances.
GET /api/v1/instances\n
Response:
[\n {\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n }\n]\n
"},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"Get detailed information about a specific instance.
GET /api/v1/instances/{name}\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"Create and start a new instance.
POST /api/v1/instances/{name}\n
Request Body: JSON object with instance configuration. Common fields include:
backend_type: Backend type (llama_cpp, mlx_lm, or vllm) backend_options: Backend-specific configuration auto_restart: Enable automatic restart on failure max_restarts: Maximum restart attempts restart_delay: Delay between restarts in seconds on_demand_start: Start instance when receiving requests idle_timeout: Idle timeout in minutes environment: Environment variables as key-value pairs
See Managing Instances for complete configuration options.
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"Update an existing instance configuration. See Managing Instances for available configuration options.
PUT /api/v1/instances/{name}\n
Request Body: JSON object with configuration fields to update.
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"Stop and remove an instance.
DELETE /api/v1/instances/{name}\n
Response: 204 No Content
"},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"Start a stopped instance.
POST /api/v1/instances/{name}/start\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
Error Responses: - 409 Conflict: Maximum number of running instances reached - 500 Internal Server Error: Failed to start instance
"},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"Stop a running instance.
POST /api/v1/instances/{name}/stop\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"stopped\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"Restart an instance (stop then start).
POST /api/v1/instances/{name}/restart\n
Response:
{\n \"name\": \"llama2-7b\",\n \"status\": \"running\",\n \"created\": 1705312200\n}\n
"},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"Retrieve instance logs.
GET /api/v1/instances/{name}/logs\n
Query Parameters: - lines: Number of lines to return (default: all lines, use -1 for all)
Response: Plain text log output
Example:
curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n
"},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"Proxy HTTP requests directly to the llama-server instance.
GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n
This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.
Example - Check Instance Health:
curl -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model/proxy/health\n
This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.
Error Responses: - 503 Service Unavailable: Instance is not running
"},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"Llamactl provides OpenAI-compatible endpoints for inference operations.
"},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"List all instances in OpenAI-compatible format.
GET /v1/models\n
Response:
{\n \"object\": \"list\",\n \"data\": [\n {\n \"id\": \"llama2-7b\",\n \"object\": \"model\",\n \"created\": 1705312200,\n \"owned_by\": \"llamactl\"\n }\n ]\n}\n
"},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"All OpenAI-compatible inference endpoints are available:
POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n
Request Body: Standard OpenAI format with model field specifying the instance name
Example:
{\n \"model\": \"llama2-7b\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello, how are you?\"\n }\n ]\n}\n
The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.
Error Responses: - 400 Bad Request: Invalid request body or missing instance name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit
"},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed
"},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"All endpoints may return error responses in the following format:
{\n \"error\": \"Error message description\"\n}\n
"},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":" 200: Success 201: Created 204: No Content (successful deletion) 400: Bad Request (invalid parameters or request body) 401: Unauthorized (missing or invalid API key) 403: Forbidden (insufficient permissions) 404: Not Found (instance not found) 409: Conflict (instance already exists, max instances reached) 500: Internal Server Error 503: Service Unavailable (instance not running)
"},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"# Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n -H \"Content-Type: application/json\" \\\n -H \"Authorization: Bearer your-api-key\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/models/llama-2-7b.gguf\",\n \"gpu_layers\": 32\n },\n \"environment\": {\n \"CUDA_VISIBLE_DEVICES\": \"0\",\n \"OMP_NUM_THREADS\": \"8\"\n }\n }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -H \"Authorization: Bearer your-inference-api-key\" \\\n -d '{\n \"model\": \"my-model\",\n \"messages\": [\n {\"role\": \"user\", \"content\": \"Hello!\"}\n ],\n \"max_tokens\": 100\n }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances/my-model\n
"},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"You can also directly proxy requests to the llama-server instance:
# Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n -H \"Content-Type: application/json\" \\\n -H \"Authorization: Bearer your-api-key\" \\\n -d '{\n \"prompt\": \"Hello, world!\",\n \"n_predict\": 50\n }'\n
"},{"location":"user-guide/api-reference/#backend-specific-endpoints","title":"Backend-Specific Endpoints","text":""},{"location":"user-guide/api-reference/#parse-commands","title":"Parse Commands","text":"Llamactl provides endpoints to parse command strings from different backends into instance configuration options.
"},{"location":"user-guide/api-reference/#parse-llamacpp-command","title":"Parse Llama.cpp Command","text":"Parse a llama-server command string into instance options.
POST /api/v1/backends/llama-cpp/parse-command\n
Request Body:
{\n \"command\": \"llama-server -m /path/to/model.gguf -c 2048 --port 8080\"\n}\n
Response:
{\n \"backend_type\": \"llama_cpp\",\n \"llama_server_options\": {\n \"model\": \"/path/to/model.gguf\",\n \"ctx_size\": 2048,\n \"port\": 8080\n }\n}\n
"},{"location":"user-guide/api-reference/#parse-mlx-lm-command","title":"Parse MLX-LM Command","text":"Parse an MLX-LM server command string into instance options.
POST /api/v1/backends/mlx/parse-command\n
Request Body:
{\n \"command\": \"mlx_lm.server --model /path/to/model --port 8080\"\n}\n
Response:
{\n \"backend_type\": \"mlx_lm\",\n \"mlx_server_options\": {\n \"model\": \"/path/to/model\",\n \"port\": 8080\n }\n}\n
"},{"location":"user-guide/api-reference/#parse-vllm-command","title":"Parse vLLM Command","text":"Parse a vLLM serve command string into instance options.
POST /api/v1/backends/vllm/parse-command\n
Request Body:
{\n \"command\": \"vllm serve /path/to/model --port 8080\"\n}\n
Response:
{\n \"backend_type\": \"vllm\",\n \"vllm_server_options\": {\n \"model\": \"/path/to/model\",\n \"port\": 8080\n }\n}\n
Error Responses for Parse Commands: - 400 Bad Request: Invalid request body, empty command, or parse error - 500 Internal Server Error: Encoding error
"},{"location":"user-guide/api-reference/#auto-generated-documentation","title":"Auto-Generated Documentation","text":"The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:
- Install the swag tool:
go install github.com/swaggo/swag/cmd/swag@latest - Generate docs:
swag init -g cmd/server/main.go -o apidocs
"},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:
http://localhost:8080/swagger/\n
This provides a complete interactive interface for testing all API endpoints.
"},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.
"},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"Llamactl provides two ways to manage instances:
- Web UI: Accessible at
http://localhost:8080 with an intuitive dashboard - REST API: Programmatic access for automation and integration
"},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"If authentication is enabled: 1. Navigate to the web UI 2. Enter your credentials 3. Bearer token is stored for the session
"},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":" - Switch between light and dark themes
- Setting is remembered across sessions
"},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"Each instance is displayed as a card showing:
- Instance name
- Health status badge (unknown, ready, error, failed)
- Action buttons (start, stop, edit, logs, delete)
"},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":" - Click the \"Create Instance\" button on the dashboard
- Enter a unique Name for your instance (only required field)
- Choose Backend Type:
- llama.cpp: For GGUF models using llama-server
- MLX: For MLX-optimized models (macOS only)
- vLLM: For distributed serving and high-throughput inference
- Configure model source:
- For llama.cpp: GGUF model path or HuggingFace repo
- For MLX: MLX model path or identifier (e.g.,
mlx-community/Mistral-7B-Instruct-v0.3-4bit) - For vLLM: HuggingFace model identifier (e.g.,
microsoft/DialoGPT-medium)
- Configure optional instance management settings:
- Auto Restart: Automatically restart instance on failure
- Max Restarts: Maximum number of restart attempts
- Restart Delay: Delay in seconds between restart attempts
- On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
- Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
- Environment Variables: Set custom environment variables for the instance process
- Configure backend-specific options:
- llama.cpp: Threads, context size, GPU layers, port, etc.
- MLX: Temperature, top-p, adapter path, Python environment, etc.
- vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
- Click \"Create\" to save the instance
"},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"# Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/instances/my-llama-instance \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/path/to/model.gguf\",\n \"threads\": 8,\n \"ctx_size\": 4096,\n \"gpu_layers\": 32\n }\n }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/instances/my-mlx-instance \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"mlx_lm\",\n \"backend_options\": {\n \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n \"temp\": 0.7,\n \"top_p\": 0.9,\n \"max_tokens\": 2048\n },\n \"auto_restart\": true,\n \"max_restarts\": 3\n }'\n\n# Create vLLM instance\ncurl -X POST http://localhost:8080/api/instances/my-vllm-instance \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"vllm\",\n \"backend_options\": {\n \"model\": \"microsoft/DialoGPT-medium\",\n \"tensor_parallel_size\": 2,\n \"gpu_memory_utilization\": 0.9\n },\n \"auto_restart\": true,\n \"on_demand_start\": true,\n \"environment\": {\n \"CUDA_VISIBLE_DEVICES\": \"0,1\",\n \"NCCL_DEBUG\": \"INFO\",\n \"PYTHONPATH\": \"/custom/path\"\n }\n }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/instances/gemma-3-27b \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n \"gpu_layers\": 32\n }\n }'\n
"},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":" - Click the \"Start\" button on an instance card
- Watch the status change to \"Unknown\"
- Monitor progress in the logs
- Instance status changes to \"Ready\" when ready
"},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"curl -X POST http://localhost:8080/api/instances/{name}/start\n
"},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":" - Click the \"Stop\" button on an instance card
- Instance gracefully shuts down
"},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"curl -X POST http://localhost:8080/api/instances/{name}/stop\n
"},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":" - Click the \"Edit\" button on an instance card
- Modify settings in the configuration dialog
- Changes require instance restart to take effect
- Click \"Update & Restart\" to apply changes
"},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"Modify instance settings:
curl -X PUT http://localhost:8080/api/v1/instances/{name} \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_options\": {\n \"threads\": 8,\n \"ctx_size\": 4096\n }\n }'\n
Note
Configuration changes require restarting the instance to take effect.
"},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":" - Click the \"Logs\" button on any instance card
- Real-time log viewer opens
"},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"Check instance status in real-time:
# Get instance logs\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n
"},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":" - Click the \"Delete\" button on an instance card
- Only stopped instances can be deleted
- Confirm deletion in the dialog
"},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"curl -X DELETE http://localhost:8080/api/instances/{name}\n
"},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).
# Proxy a request to the instance\ncurl http://localhost:8080/api/v1/instances/{name}/proxy/\n
All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs
"},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":" - The health status badge is displayed on each instance card
"},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"Check the health status of your instances:
curl http://localhost:8080/api/v1/instances/{name}/proxy/health\n
"},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"Issues specific to Llamactl deployment and operation.
"},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"Problem: Invalid configuration preventing startup
Solutions: 1. Use minimal configuration:
server:\n host: \"0.0.0.0\"\n port: 8080\ninstances:\n port_range: [8000, 9000]\n
- Check data directory permissions:
# Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
"},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"Problem: Instance fails to start with model loading errors
Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files
"},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"Problem: Out of memory errors or system becomes unresponsive
Solutions: 1. Reduce context size:
{\n \"n_ctx\": 1024\n}\n
- Use quantized models:
- Try Q4_K_M instead of higher precision models
- Use smaller model variants (7B instead of 13B)
"},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"Problem: GPU not being used effectively
Solutions: 1. Configure GPU layers:
{\n \"n_gpu_layers\": 35\n}\n
"},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"Problem: Complex model loading, performance, or compatibility issues
Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:
Resources: - llama.cpp Documentation: https://github.com/ggml-org/llama.cpp - llama.cpp Issues: https://github.com/ggml-org/llama.cpp/issues - llama.cpp Discussions: https://github.com/ggml-org/llama.cpp/discussions
Testing directly with llama-server:
# Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n
This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.
"},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"Problem: Web UI shows CORS errors in browser console
Solutions: 1. Configure allowed origins:
server:\n allowed_origins:\n - \"http://localhost:3000\"\n - \"https://yourdomain.com\"\n
"},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"Problem: API requests failing with authentication errors
Solutions: 1. Disable authentication temporarily:
auth:\n require_management_auth: false\n require_inference_auth: false\n
-
Configure API keys:
auth:\n management_keys:\n - \"your-management-key\"\n inference_keys:\n - \"your-inference-key\"\n
-
Use correct Authorization header:
curl -H \"Authorization: Bearer your-api-key\" \\\n http://localhost:8080/api/v1/instances\n
"},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"# Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
"},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
"},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"When reporting issues, include:
-
System information:
llamactl --version\n
-
Configuration file (remove sensitive keys)
-
Relevant log output
-
Steps to reproduce the issue
"}]}
\ No newline at end of file
diff --git a/dev/sitemap.xml b/dev/sitemap.xml
index 585b951..fddac5e 100644
--- a/dev/sitemap.xml
+++ b/dev/sitemap.xml
@@ -2,37 +2,37 @@
https://llamactl.org/dev/
- 2025-09-28
+ 2025-09-29
daily
https://llamactl.org/dev/getting-started/configuration/
- 2025-09-28
+ 2025-09-29
daily
https://llamactl.org/dev/getting-started/installation/
- 2025-09-28
+ 2025-09-29
daily
https://llamactl.org/dev/getting-started/quick-start/
- 2025-09-28
+ 2025-09-29
daily
https://llamactl.org/dev/user-guide/api-reference/
- 2025-09-28
+ 2025-09-29
daily
https://llamactl.org/dev/user-guide/managing-instances/
- 2025-09-28
+ 2025-09-29
daily
https://llamactl.org/dev/user-guide/troubleshooting/
- 2025-09-28
+ 2025-09-29
daily
\ No newline at end of file
diff --git a/dev/sitemap.xml.gz b/dev/sitemap.xml.gz
index 77276e2..423e59f 100644
Binary files a/dev/sitemap.xml.gz and b/dev/sitemap.xml.gz differ