From a3dc1ca05a0b12087b25e3b09471ff26733036ca Mon Sep 17 00:00:00 2001
From: lordmathis
Date: Mon, 22 Sep 2025 19:58:42 +0000
Subject: [PATCH] Deployed ebc82c3 to dev with MkDocs 1.5.3 and mike 2.0.0
---
 dev/getting-started/configuration/index.html |  51 ++---
 dev/getting-started/installation/index.html  |  52 +++--
 dev/getting-started/quick-start/index.html   | 152 +++++++------
 dev/search/search_index.json                 |   2 +-
 dev/sitemap.xml                              |  14 +-
 dev/sitemap.xml.gz                           | Bin 292 -> 291 bytes
 dev/user-guide/api-reference/index.html      | 215 ++++++++++++++++++-
 dev/user-guide/managing-instances/index.html |  44 ++--
 8 files changed, 395 insertions(+), 135 deletions(-)

diff --git a/dev/getting-started/configuration/index.html b/dev/getting-started/configuration/index.html
index ec3a523..1eb3a74 100644
--- a/dev/getting-started/configuration/index.html
+++ b/dev/getting-started/configuration/index.html
@@ -853,28 +853,29 @@
 backends:
   llama_executable: llama-server   # Path to llama-server executable
   mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable
+  vllm_executable: vllm            # Path to vllm executable

 instances:
   port_range: [8000, 9000]       # Port range for instances
   data_dir: ~/.local/share/llamactl               # Data directory (platform-specific, see below)
   configs_dir: ~/.local/share/llamactl/instances  # Instance configs directory
   logs_dir: ~/.local/share/llamactl/logs          # Logs directory
   auto_create_dirs: true         # Auto-create data/config/logs dirs if missing
   max_instances: -1              # Max instances (-1 = unlimited)
   max_running_instances: -1      # Max running instances (-1 = unlimited)
   enable_lru_eviction: true      # Enable LRU eviction for idle instances
   default_auto_restart: true     # Auto-restart new instances by default
   default_max_restarts: 3        # Max restarts for new instances
   default_restart_delay: 5       # Restart delay (seconds) for new instances
   default_on_demand_start: true  # Default on-demand start setting
   on_demand_start_timeout: 120   # Default on-demand start timeout in seconds
   timeout_check_interval: 5      # Idle instance timeout check in minutes

 auth:
   require_inference_auth: true   # Require auth for inference endpoints
   inference_keys: []             # Keys for inference endpoints
   require_management_auth: true  # Require auth for management endpoints
   management_keys: []            # Keys for management endpoints

Configuration Files

Configuration File Locations

@@ -910,10 +911,12 @@
backends:
   llama_executable: "llama-server"     # Path to llama-server executable (default: "llama-server")
   mlx_lm_executable: "mlx_lm.server"   # Path to mlx_lm.server executable (default: "mlx_lm.server")
+  vllm_executable: "vllm"              # Path to vllm executable (default: "vllm")
 

 Environment Variables:
 - LLAMACTL_LLAMA_EXECUTABLE - Path to llama-server executable
 - LLAMACTL_MLX_LM_EXECUTABLE - Path to mlx_lm.server executable
+- LLAMACTL_VLLM_EXECUTABLE - Path to vllm executable

Instance Configuration

instances:
   port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])
@@ -983,7 +986,7 @@
     
       
     
-    September 18, 2025
+    September 21, 2025
   
 
     
diff --git a/dev/getting-started/installation/index.html b/dev/getting-started/installation/index.html
index 0cced7b..4e5053e 100644
--- a/dev/getting-started/installation/index.html
+++ b/dev/getting-started/installation/index.html
@@ -825,18 +825,30 @@
 pip install mlx-lm
 

Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

+

For vLLM backend:

+

vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

+
# Install via pip (requires Python 3.8+, GPU required)
+pip install vllm
+
+# Or in a virtual environment (recommended)
+python -m venv vllm-env
+source vllm-env/bin/activate
+pip install vllm
+
+# For production deployments, consider container-based installation
+

Installation Methods

Download the latest release from the GitHub releases page:

# Linux/macOS - Get latest version and download
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
sudo mv llamactl /usr/local/bin/

# Or download manually from:
# https://github.com/lordmathis/llamactl/releases/latest

# Windows - Download from releases page
 

Option 2: Build from Source

Requirements:
@@ -844,19 +856,19 @@
- Node.js 22 or later
- Git

If you prefer to build from source:

# Clone the repository
git clone https://github.com/lordmathis/llamactl.git
cd llamactl

# Build the web UI
cd webui && npm ci && npm run build && cd ..

# Build the application
go build -o llamactl ./cmd/server
 

Verification

Verify your installation by checking the version:

llamactl --version
 

Next Steps

Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

@@ -880,7 +892,7 @@
-    September 18, 2025
+    September 21, 2025

diff --git a/dev/getting-started/quick-start/index.html b/dev/getting-started/quick-start/index.html
index 204e8f1..9318927 100644
--- a/dev/getting-started/quick-start/index.html
+++ b/dev/getting-started/quick-start/index.html
@@ -495,9 +495,9 @@
-  Example Configuration
+  Example Configurations
@@ -775,9 +775,9 @@
-  Example Configuration
+  Example Configurations
@@ -879,9 +879,10 @@
  • Click the "Add Instance" button
  • Fill in the instance configuration:
  • Name: Give your instance a descriptive name
- • Model Path: Path to your Llama.cpp model file
+ • Backend Type: Choose from llama.cpp, MLX, or vLLM
+ • Model: Model path or identifier for your chosen backend
- • Additional Options: Any extra Llama.cpp parameters
+ • Additional Options: Backend-specific parameters
  • Click "Create Instance"

    @@ -895,76 +896,103 @@
  • View logs by clicking the logs button
  • Stop the instance when needed

-    Example Configuration
-
-    Here's a basic example configuration for a Llama 2 model:
-
-    {
-      "name": "llama2-7b",
-      "model_path": "/path/to/llama-2-7b-chat.gguf",
-      "options": {
-        "threads": 4,
-        "context_size": 2048
-      }
-    }
+    Example Configurations
+
+    Here are basic example configurations for each backend:
+
+    llama.cpp backend:
+
+    {
+      "name": "llama2-7b",
+      "backend_type": "llama_cpp",
+      "backend_options": {
+        "model": "/path/to/llama-2-7b-chat.gguf",
+        "threads": 4,
+        "ctx_size": 2048,
+        "gpu_layers": 32
+      }
+    }
+
+    MLX backend (macOS only):
+
+    {
+      "name": "mistral-mlx",
+      "backend_type": "mlx_lm",
+      "backend_options": {
+        "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
+        "temp": 0.7,
+        "max_tokens": 2048
+      }
+    }
+
+    vLLM backend:
+
+    {
+      "name": "dialogpt-vllm",
+      "backend_type": "vllm",
+      "backend_options": {
+        "model": "microsoft/DialoGPT-medium",
+        "tensor_parallel_size": 2,
+        "gpu_memory_utilization": 0.9
+      }
+    }

    Using the API

    You can also manage instances via the REST API:

-    # List all instances
-    curl http://localhost:8080/api/instances
-
-    # Create a new instance
-    curl -X POST http://localhost:8080/api/instances \
-      -H "Content-Type: application/json" \
-      -d '{
-        "name": "my-model",
-        "model_path": "/path/to/model.gguf",
-      }'
-
-    # Start an instance
-    curl -X POST http://localhost:8080/api/instances/my-model/start
+    # List all instances
+    curl http://localhost:8080/api/instances
+
+    # Create a new llama.cpp instance
+    curl -X POST http://localhost:8080/api/instances/my-model \
+      -H "Content-Type: application/json" \
+      -d '{
+        "backend_type": "llama_cpp",
+        "backend_options": {
+          "model": "/path/to/model.gguf"
+        }
+      }'
+
+    # Start an instance
+    curl -X POST http://localhost:8080/api/instances/my-model/start
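
    The same flow can be scripted from Python. Below is a minimal sketch using the requests library, assuming the default address above and that management authentication is disabled; the instance name and model path are placeholders:

    import requests

    BASE = "http://localhost:8080/api/instances"  # default llamactl address; adjust as needed

    # Create a llama.cpp instance named "my-model" (model path is a placeholder)
    payload = {
        "backend_type": "llama_cpp",
        "backend_options": {"model": "/path/to/model.gguf"},
    }
    requests.post(f"{BASE}/my-model", json=payload).raise_for_status()

    # Start the instance, then list all instances
    requests.post(f"{BASE}/my-model/start").raise_for_status()
    print(requests.get(BASE).json())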
     

    OpenAI Compatible API

    Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

    Chat Completions

    Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

    curl -X POST http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{
        "model": "my-model",
        "messages": [
          {
            "role": "user",
            "content": "Hello! Can you help me write a Python function?"
          }
        ],
        "max_tokens": 150,
        "temperature": 0.7
      }'
     

    Using with Python OpenAI Client

    You can also use the official OpenAI Python client:

    from openai import OpenAI

    # Point the client to your Llamactl server
    client = OpenAI(
        base_url="http://localhost:8080/v1",
        api_key="not-needed"  # Llamactl doesn't require API keys by default
    )

    # Create a chat completion
    response = client.chat.completions.create(
        model="my-model",  # Use the name of your instance
        messages=[
            {"role": "user", "content": "Explain quantum computing in simple terms"}
        ],
        max_tokens=200,
        temperature=0.7
    )

    print(response.choices[0].message.content)
     

    List Available Models

    Get a list of running instances (models) in OpenAI-compatible format:

    curl http://localhost:8080/v1/models
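
    The same listing is available from the OpenAI Python client shown earlier; a short sketch, with the api_key value a placeholder since no key is configured here:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

    # Each running llamactl instance appears as a model entry
    for model in client.models.list().data:
        print(model.id)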
     

    Next Steps

      @@ -992,7 +1020,7 @@
-      September 3, 2025
+      September 21, 2025

diff --git a/dev/search/search_index.json b/dev/search/search_index.json
index 4b42522..33d6a9b 100644
--- a/dev/search/search_index.json
+++ b/dev/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"

      Welcome to the Llamactl documentation! Management server and proxy for multiple llama.cpp and MLX instances with OpenAI-compatible API routing.

      "},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"

      Llamactl is designed to simplify the deployment and management of llama-server and MLX instances. It provides a modern solution for running multiple large language models with centralized management and multi-backend support.

      "},{"location":"#features","title":"Features","text":"

      \ud83d\ude80 Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality) \ud83d\udd17 OpenAI API Compatible: Drop-in replacement - route requests by model name \ud83c\udf4e Multi-Backend Support: Native support for both llama.cpp and MLX (Apple Silicon optimized) \ud83c\udf10 Web Dashboard: Modern React UI for visual management (unlike CLI-only tools) \ud83d\udd10 API Key Authentication: Separate keys for management vs inference access \ud83d\udcca Instance Monitoring: Health checks, auto-restart, log management \u26a1 Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits \ud83d\udca1 On-Demand Instance Start: Automatically launch instances upon receiving OpenAI-compatible API requests \ud83d\udcbe State Persistence: Ensure instances remain intact across server restarts

      "},{"location":"#quick-links","title":"Quick Links","text":"
      • Installation Guide - Get Llamactl up and running
      • Configuration Guide - Detailed configuration options
      • Quick Start - Your first steps with Llamactl
      • Managing Instances - Instance lifecycle management
      • API Reference - Complete API documentation
      "},{"location":"#getting-help","title":"Getting Help","text":"

      If you need help or have questions:

      • Check the Troubleshooting guide
      • Visit the GitHub repository
      • Review the Configuration Guide for advanced settings
      "},{"location":"#license","title":"License","text":"

      MIT License - see the LICENSE file.

      "},{"location":"getting-started/configuration/","title":"Configuration","text":"

      llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

      Defaults < Configuration file < Environment variables\n

      llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

      "},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"

      Here's the default configuration with all available options:

      server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama_executable: llama-server # Path to llama-server executable\n  mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable\n\ninstances:\n  port_range: [8000, 9000]       # Port range for instances\n  data_dir: ~/.local/share/llamactl         # Data directory (platform-specific, see below)\n  configs_dir: ~/.local/share/llamactl/instances  # Instance configs directory\n  logs_dir: ~/.local/share/llamactl/logs    # Logs directory\n  auto_create_dirs: true         # Auto-create data/config/logs dirs if missing\n  max_instances: -1              # Max instances (-1 = unlimited)\n  max_running_instances: -1      # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true      # Enable LRU eviction for idle instances\n  default_auto_restart: true     # Auto-restart new instances by default\n  default_max_restarts: 3        # Max restarts for new instances\n  default_restart_delay: 5       # Restart delay (seconds) for new instances\n  default_on_demand_start: true  # Default on-demand start setting\n  on_demand_start_timeout: 120   # Default on-demand start timeout in seconds\n  timeout_check_interval: 5      # Idle instance timeout check in minutes\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  inference_keys: []             # Keys for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n
      "},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"

      Configuration files are searched in the following locations (in order of precedence):

      Linux: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/.config/llamactl/config.yaml - /etc/llamactl/config.yaml

      macOS: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/Library/Application Support/llamactl/config.yaml - /Library/Application Support/llamactl/config.yaml

      Windows: - ./llamactl.yaml or ./config.yaml (current directory) - %APPDATA%\\llamactl\\config.yaml - %USERPROFILE%\\llamactl\\config.yaml - %PROGRAMDATA%\\llamactl\\config.yaml

      You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.
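
      For example, a small Python sketch (assuming llamactl is on your PATH) that launches the server against one of the documented locations via this variable:

      import os
      import subprocess

      # Point llamactl at an explicit config file instead of the default search locations
      env = dict(os.environ, LLAMACTL_CONFIG_PATH="/etc/llamactl/config.yaml")
      subprocess.run(["llamactl"], env=env, check=True)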

      "},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"
      server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n

      Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

      "},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"
      backends:\n  llama_executable: \"llama-server\"     # Path to llama-server executable (default: \"llama-server\")\n  mlx_lm_executable: \"mlx_lm.server\"   # Path to mlx_lm.server executable (default: \"mlx_lm.server\")\n

      Environment Variables: - LLAMACTL_LLAMA_EXECUTABLE - Path to llama-server executable - LLAMACTL_MLX_LM_EXECUTABLE - Path to mlx_lm.server executable

      "},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"
      instances:\n  port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])\n  data_dir: \"~/.local/share/llamactl\"               # Directory for all llamactl data (default varies by OS)\n  configs_dir: \"~/.local/share/llamactl/instances\"  # Directory for instance configs (default: data_dir/instances)\n  logs_dir: \"~/.local/share/llamactl/logs\"          # Directory for instance logs (default: data_dir/logs)\n  auto_create_dirs: true                            # Automatically create data/config/logs directories (default: true)\n  max_instances: -1                                 # Maximum instances (-1 = unlimited)\n  max_running_instances: -1                         # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true                         # Enable LRU eviction for idle instances\n  default_auto_restart: true                        # Default auto-restart setting\n  default_max_restarts: 3                           # Default maximum restart attempts\n  default_restart_delay: 5                          # Default restart delay in seconds\n  default_on_demand_start: true                     # Default on-demand start setting\n  on_demand_start_timeout: 120                      # Default on-demand start timeout in seconds\n  timeout_check_interval: 5                         # Default instance timeout check interval in minutes\n

      Environment Variables: - LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\") - LLAMACTL_DATA_DIRECTORY - Data directory path - LLAMACTL_INSTANCES_DIR - Instance configs directory path - LLAMACTL_LOGS_DIR - Log directory path - LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false) - LLAMACTL_MAX_INSTANCES - Maximum number of instances - LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances - LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances - LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false) - LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts - LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds - LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false) - LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds - LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes

      "},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"
      auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  inference_keys: []                     # List of valid inference API keys\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n

      Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys

      "},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"

      View all available command line options:

      llamactl --help\n

      You can also override configuration using command line flags when starting llamactl.

      "},{"location":"getting-started/installation/","title":"Installation","text":"

      This guide will walk you through installing Llamactl on your system.

      "},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"

      llamactl supports multiple backends. Install at least one:

      For llama.cpp backend (all platforms):

      You need llama-server from llama.cpp installed:

      # Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n

      Or build from source - see llama.cpp docs

      For MLX backend (macOS only):

      MLX provides optimized inference on Apple Silicon. Install MLX-LM:

      # Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n

      Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

      "},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"

      Download the latest release from the GitHub releases page:

      # Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
      "},{"location":"getting-started/installation/#option-2-build-from-source","title":"Option 2: Build from Source","text":"

      Requirements: - Go 1.24 or later - Node.js 22 or later - Git

      If you prefer to build from source:

      # Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
      "},{"location":"getting-started/installation/#verification","title":"Verification","text":"

      Verify your installation by checking the version:

      llamactl --version\n
      "},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"

      Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

      "},{"location":"getting-started/quick-start/","title":"Quick Start","text":"

      This guide will help you get Llamactl up and running in just a few minutes.

      "},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"

      Start the Llamactl server:

      llamactl\n

      By default, Llamactl will start on http://localhost:8080.

      "},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"

      Open your web browser and navigate to:

      http://localhost:8080\n

      Log in with the management API key. By default it is generated during server startup. Copy it from the terminal output.

      You should see the Llamactl web interface.

      "},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":"
      1. Click the \"Add Instance\" button
      2. Fill in the instance configuration:
      3. Name: Give your instance a descriptive name
      4. Model Path: Path to your Llama.cpp model file
      5. Additional Options: Any extra Llama.cpp parameters

      6. Click \"Create Instance\"

      "},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"

      Once created, you can:

      • Start the instance by clicking the start button
      • Monitor its status in real-time
      • View logs by clicking the logs button
      • Stop the instance when needed
      "},{"location":"getting-started/quick-start/#example-configuration","title":"Example Configuration","text":"

      Here's a basic example configuration for a Llama 2 model:

      {\n  \"name\": \"llama2-7b\",\n  \"model_path\": \"/path/to/llama-2-7b-chat.gguf\",\n  \"options\": {\n    \"threads\": 4,\n    \"context_size\": 2048\n  }\n}\n
      "},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"

      You can also manage instances via the REST API:

      # List all instances\ncurl http://localhost:8080/api/instances\n\n# Create a new instance\ncurl -X POST http://localhost:8080/api/instances \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"name\": \"my-model\",\n    \"model_path\": \"/path/to/model.gguf\",\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/instances/my-model/start\n
      "},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"

      Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

      "},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"

      Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

      curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n
      "},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"

      You can also use the official OpenAI Python client:

      from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"not-needed\"  # Llamactl doesn't require API keys by default\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
      "},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"

      Get a list of running instances (models) in OpenAI-compatible format:

      curl http://localhost:8080/v1/models\n
      "},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":"
      • Manage instances Managing Instances
      • Explore the API Reference
      • Configure advanced settings in the Configuration guide
      "},{"location":"user-guide/api-reference/","title":"API Reference","text":"

      Complete reference for the Llamactl REST API.

      "},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"

      All API endpoints are relative to the base URL:

      http://localhost:8080/api/v1\n
      "},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"

      Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

      curl -H \"Authorization: Bearer <your-api-key>\" \\\n  http://localhost:8080/api/v1/instances\n

      The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints
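
      A minimal Python sketch of an authenticated management call using the requests library; how you store the key is up to you, and the environment variable name below is only an illustration:

      import os
      import requests

      # Management key supplied by the operator; the variable name here is only an illustration
      headers = {"Authorization": f"Bearer {os.environ['LLAMACTL_MANAGEMENT_KEY']}"}

      resp = requests.get("http://localhost:8080/api/v1/instances", headers=headers)
      resp.raise_for_status()
      print(resp.json())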

      "},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"

      Get the version information of the llamactl server.

      GET /api/v1/version\n

      Response:

      Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n

      "},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"

      Get help text for the llama-server command.

      GET /api/v1/server/help\n

      Response: Plain text help output from llama-server --help

      "},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"

      Get version information of the llama-server binary.

      GET /api/v1/server/version\n

      Response: Plain text version output from llama-server --version

      "},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"

      List available devices for llama-server.

      GET /api/v1/server/devices\n

      Response: Plain text device list from llama-server --list-devices

      "},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"

      Get a list of all instances.

      GET /api/v1/instances\n

      Response:

      [\n  {\n    \"name\": \"llama2-7b\",\n    \"status\": \"running\",\n    \"created\": 1705312200\n  }\n]\n

      "},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"

      Get detailed information about a specific instance.

      GET /api/v1/instances/{name}\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"

      Create and start a new instance.

      POST /api/v1/instances/{name}\n

      Request Body: JSON object with instance configuration. See Managing Instances for available configuration options.

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"

      Update an existing instance configuration. See Managing Instances for available configuration options.

      PUT /api/v1/instances/{name}\n

      Request Body: JSON object with configuration fields to update.

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"

      Stop and remove an instance.

      DELETE /api/v1/instances/{name}\n

      Response: 204 No Content

      "},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"

      Start a stopped instance.

      POST /api/v1/instances/{name}/start\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"starting\",\n  \"created\": 1705312200\n}\n

      Error Responses: - 409 Conflict: Maximum number of running instances reached - 500 Internal Server Error: Failed to start instance

      "},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"

      Stop a running instance.

      POST /api/v1/instances/{name}/stop\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"stopping\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"

      Restart an instance (stop then start).

      POST /api/v1/instances/{name}/restart\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"restarting\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"

      Retrieve instance logs.

      GET /api/v1/instances/{name}/logs\n

      Query Parameters: - lines: Number of lines to return (defaults to all lines; -1 also returns all lines)

      Response: Plain text log output

      Example:

      curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n
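
      The same request from Python using the requests library, passing lines as a query parameter; the instance name is a placeholder:

      import requests

      # Fetch the last 100 log lines for the instance "my-instance" (placeholder name)
      resp = requests.get(
          "http://localhost:8080/api/v1/instances/my-instance/logs",
          params={"lines": 100},
      )
      print(resp.text)  # logs are returned as plain text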

      "},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"

      Proxy HTTP requests directly to the llama-server instance.

      GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n

      This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

      Example - Check Instance Health:

      curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/proxy/health\n

      This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

      Error Responses: - 503 Service Unavailable: Instance is not running

      "},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"

      Llamactl provides OpenAI-compatible endpoints for inference operations.

      "},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"

      List all instances in OpenAI-compatible format.

      GET /v1/models\n

      Response:

      {\n  \"object\": \"list\",\n  \"data\": [\n    {\n      \"id\": \"llama2-7b\",\n      \"object\": \"model\",\n      \"created\": 1705312200,\n      \"owned_by\": \"llamactl\"\n    }\n  ]\n}\n

      "},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"

      All OpenAI-compatible inference endpoints are available:

      POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n

      Request Body: Standard OpenAI format with model field specifying the instance name

      Example:

      {\n  \"model\": \"llama2-7b\",\n  \"messages\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Hello, how are you?\"\n    }\n  ]\n}\n

      The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

      Error Responses: - 400 Bad Request: Invalid request body or missing model name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit
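
      A minimal Python sketch of this routing using the requests library, assuming an inference key is configured; the instance name and key are placeholders:

      import requests

      # "model" selects the llamactl instance; on-demand start may launch it if it is idle
      resp = requests.post(
          "http://localhost:8080/v1/chat/completions",
          headers={"Authorization": "Bearer your-inference-api-key"},
          json={
              "model": "llama2-7b",
              "messages": [{"role": "user", "content": "Hello, how are you?"}],
          },
      )
      resp.raise_for_status()
      print(resp.json()["choices"][0]["message"]["content"])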

      "},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"

      Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed
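
      A simple polling sketch built on the instance details endpoint, assuming management authentication is disabled; the instance name and interval are arbitrary:

      import time
      import requests

      # Wait until the instance reports status "running"
      url = "http://localhost:8080/api/v1/instances/llama2-7b"
      while requests.get(url).json().get("status") != "running":
          time.sleep(5)
      print("instance is running")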

      "},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"

      All endpoints may return error responses in the following format:

      {\n  \"error\": \"Error message description\"\n}\n
      "},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":"
      • 200: Success
      • 201: Created
      • 204: No Content (successful deletion)
      • 400: Bad Request (invalid parameters or request body)
      • 401: Unauthorized (missing or invalid API key)
      • 403: Forbidden (insufficient permissions)
      • 404: Not Found (instance not found)
      • 409: Conflict (instance already exists, max instances reached)
      • 500: Internal Server Error
      • 503: Service Unavailable (instance not running)
      "},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"
      # Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"model\": \"/models/llama-2-7b.gguf\"\n  }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello!\"}\n    ],\n    \"max_tokens\": 100\n  }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n
      "},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"

      You can also directly proxy requests to the llama-server instance:

      # Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"prompt\": \"Hello, world!\",\n    \"n_predict\": 50\n  }'\n
      "},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"

      If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

      http://localhost:8080/swagger/\n

      This provides a complete interactive interface for testing all API endpoints.

      "},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"

      Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API.

      "},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"

      Llamactl provides two ways to manage instances:

      • Web UI: Accessible at http://localhost:8080 with an intuitive dashboard
      • REST API: Programmatic access for automation and integration

      "},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"

      If authentication is enabled: 1. Navigate to the web UI 2. Enter your credentials 3. Bearer token is stored for the session

      "},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":"
      • Switch between light and dark themes
      • Setting is remembered across sessions
      "},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"

      Each instance is displayed as a card showing:

      • Instance name
      • Health status badge (unknown, ready, error, failed)
      • Action buttons (start, stop, edit, logs, delete)
      "},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
      1. Click the \"Create Instance\" button on the dashboard
      2. Enter a unique Name for your instance (only required field)
      3. Choose Backend Type:
        • llama.cpp: For GGUF models using llama-server
        • MLX: For MLX-optimized models (macOS only)
      4. Configure model source:
        • For llama.cpp: GGUF model path or HuggingFace repo
        • For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
      5. Configure optional instance management settings:
        • Auto Restart: Automatically restart instance on failure
        • Max Restarts: Maximum number of restart attempts
        • Restart Delay: Delay in seconds between restart attempts
        • On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
        • Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
      6. Configure backend-specific options:
        • llama.cpp: Threads, context size, GPU layers, port, etc.
        • MLX: Temperature, top-p, adapter path, Python environment, etc.
      7. Click \"Create\" to save the instance
      "},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"
      # Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"top_p\": 0.9,\n      \"max_tokens\": 2048\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3\n  }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/instances/gemma-3-27b \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n      \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n      \"gpu_layers\": 32\n    }\n  }'\n
      "},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
      1. Click the \"Start\" button on an instance card
      2. Watch the status change to \"Unknown\"
      3. Monitor progress in the logs
      4. Instance status changes to \"Ready\" when ready
      "},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"
      curl -X POST http://localhost:8080/api/instances/{name}/start\n
      "},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
      1. Click the \"Stop\" button on an instance card
      2. Instance gracefully shuts down
      "},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"
      curl -X POST http://localhost:8080/api/instances/{name}/stop\n
      "},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
      1. Click the \"Edit\" button on an instance card
      2. Modify settings in the configuration dialog
      3. Changes require instance restart to take effect
      4. Click \"Update & Restart\" to apply changes
      "},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"

      Modify instance settings:

      curl -X PUT http://localhost:8080/api/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"context_size\": 4096\n    }\n  }'\n

      Note

      Configuration changes require restarting the instance to take effect.

      "},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
      1. Click the \"Logs\" button on any instance card
      2. Real-time log viewer opens
      "},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"

      Check instance status in real-time:

      # Get instance details\ncurl http://localhost:8080/api/instances/{name}/logs\n
      "},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
      1. Click the \"Delete\" button on an instance card
      2. Only stopped instances can be deleted
      3. Confirm deletion in the dialog
      "},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"
      curl -X DELETE http://localhost:8080/api/instances/{name}\n
      "},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"

      Llamactl proxies all requests to the underlying backend instances (llama-server or MLX).

      # Get instance details\ncurl http://localhost:8080/api/instances/{name}/proxy/\n

      Both backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs

      "},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
      1. The health status badge is displayed on each instance card
      "},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"

      Check the health status of your instances:

      curl http://localhost:8080/api/instances/{name}/proxy/health\n
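
      A small readiness-polling sketch built on this endpoint, assuming management authentication is disabled; the instance name, timeout, and retry interval are arbitrary choices:

      import time
      import requests

      # Poll the proxied /health endpoint until the instance responds successfully
      url = "http://localhost:8080/api/instances/my-model/proxy/health"
      for _ in range(30):
          try:
              if requests.get(url, timeout=5).status_code == 200:
                  print("instance is healthy")
                  break
          except requests.RequestException:
              pass  # instance may still be starting
          time.sleep(10)
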
      "},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"

      Issues specific to Llamactl deployment and operation.

      "},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"

      Problem: Invalid configuration preventing startup

      Solutions: 1. Use minimal configuration:

      server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n

      1. Check data directory permissions:
        # Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
      "},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"

      Problem: Instance fails to start with model loading errors

      Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files

      "},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"

      Problem: Out of memory errors or system becomes unresponsive

      Solutions: 1. Reduce context size:

      {\n  \"n_ctx\": 1024\n}\n

      1. Use quantized models:
      2. Try Q4_K_M instead of higher precision models
      3. Use smaller model variants (7B instead of 13B)
      "},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"

      Problem: GPU not being used effectively

      Solutions: 1. Configure GPU layers:

      {\n  \"n_gpu_layers\": 35\n}\n

      "},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"

      Problem: Complex model loading, performance, or compatibility issues

      Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

      Resources: - llama.cpp Documentation: https://github.com/ggml/llama.cpp - llama.cpp Issues: https://github.com/ggml/llama.cpp/issues - llama.cpp Discussions: https://github.com/ggml/llama.cpp/discussions

      Testing directly with llama-server:

      # Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n

      This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.

      "},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"

      Problem: Web UI shows CORS errors in browser console

      Solutions: 1. Configure allowed origins:

      server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n

      "},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"

      Problem: API requests failing with authentication errors

      Solutions: 1. Disable authentication temporarily:

      auth:\n  require_management_auth: false\n  require_inference_auth: false\n

      1. Configure API keys:

        auth:\n  management_keys:\n    - \"your-management-key\"\n  inference_keys:\n    - \"your-inference-key\"\n

      2. Use correct Authorization header:

        curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n

      "},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
      # Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
      "},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"
      export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
      "},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"

      When reporting issues, include:

      1. System information:

        llamactl --version\n

      2. Configuration file (remove sensitive keys)

      3. Relevant log output

      4. Steps to reproduce the issue

      "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"

      Welcome to the Llamactl documentation! Management server and proxy for multiple llama.cpp and MLX instances with OpenAI-compatible API routing.

      "},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"

      Llamactl is designed to simplify the deployment and management of llama-server and MLX instances. It provides a modern solution for running multiple large language models with centralized management and multi-backend support.

      "},{"location":"#features","title":"Features","text":"

      \ud83d\ude80 Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality) \ud83d\udd17 OpenAI API Compatible: Drop-in replacement - route requests by model name \ud83c\udf4e Multi-Backend Support: Native support for both llama.cpp and MLX (Apple Silicon optimized) \ud83c\udf10 Web Dashboard: Modern React UI for visual management (unlike CLI-only tools) \ud83d\udd10 API Key Authentication: Separate keys for management vs inference access \ud83d\udcca Instance Monitoring: Health checks, auto-restart, log management \u26a1 Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits \ud83d\udca1 On-Demand Instance Start: Automatically launch instances upon receiving OpenAI-compatible API requests \ud83d\udcbe State Persistence: Ensure instances remain intact across server restarts

      "},{"location":"#quick-links","title":"Quick Links","text":"
      • Installation Guide - Get Llamactl up and running
      • Configuration Guide - Detailed configuration options
      • Quick Start - Your first steps with Llamactl
      • Managing Instances - Instance lifecycle management
      • API Reference - Complete API documentation
      "},{"location":"#getting-help","title":"Getting Help","text":"

      If you need help or have questions:

      • Check the Troubleshooting guide
      • Visit the GitHub repository
      • Review the Configuration Guide for advanced settings
      "},{"location":"#license","title":"License","text":"

      MIT License - see the LICENSE file.

      "},{"location":"getting-started/configuration/","title":"Configuration","text":"

      llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

      Defaults < Configuration file < Environment variables\n

      llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

      "},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"

      Here's the default configuration with all available options:

      server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama_executable: llama-server # Path to llama-server executable\n  mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable\n  vllm_executable: vllm # Path to vllm executable\n\ninstances:\n  port_range: [8000, 9000]       # Port range for instances\n  data_dir: ~/.local/share/llamactl         # Data directory (platform-specific, see below)\n  configs_dir: ~/.local/share/llamactl/instances  # Instance configs directory\n  logs_dir: ~/.local/share/llamactl/logs    # Logs directory\n  auto_create_dirs: true         # Auto-create data/config/logs dirs if missing\n  max_instances: -1              # Max instances (-1 = unlimited)\n  max_running_instances: -1      # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true      # Enable LRU eviction for idle instances\n  default_auto_restart: true     # Auto-restart new instances by default\n  default_max_restarts: 3        # Max restarts for new instances\n  default_restart_delay: 5       # Restart delay (seconds) for new instances\n  default_on_demand_start: true  # Default on-demand start setting\n  on_demand_start_timeout: 120   # Default on-demand start timeout in seconds\n  timeout_check_interval: 5      # Idle instance timeout check in minutes\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  inference_keys: []             # Keys for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n
      "},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"

      Configuration files are searched in the following locations (in order of precedence):

Linux:
• ./llamactl.yaml or ./config.yaml (current directory)
• $HOME/.config/llamactl/config.yaml
• /etc/llamactl/config.yaml

macOS:
• ./llamactl.yaml or ./config.yaml (current directory)
• $HOME/Library/Application Support/llamactl/config.yaml
• /Library/Application Support/llamactl/config.yaml

Windows:
• ./llamactl.yaml or ./config.yaml (current directory)
• %APPDATA%\\llamactl\\config.yaml
• %USERPROFILE%\\llamactl\\config.yaml
• %PROGRAMDATA%\\llamactl\\config.yaml

You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.
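
For example (the path shown is hypothetical):

LLAMACTL_CONFIG_PATH=/opt/llamactl/config.yaml llamactl\n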

      "},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"
      server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n

Environment Variables:
• LLAMACTL_HOST - Server host
• LLAMACTL_PORT - Server port
• LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins
• LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

      "},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"
      backends:\n  llama_executable: \"llama-server\"     # Path to llama-server executable (default: \"llama-server\")\n  mlx_lm_executable: \"mlx_lm.server\"   # Path to mlx_lm.server executable (default: \"mlx_lm.server\")\n  vllm_executable: \"vllm\"              # Path to vllm executable (default: \"vllm\")\n

Environment Variables:
• LLAMACTL_LLAMA_EXECUTABLE - Path to llama-server executable
• LLAMACTL_MLX_LM_EXECUTABLE - Path to mlx_lm.server executable
• LLAMACTL_VLLM_EXECUTABLE - Path to vllm executable

      "},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"
      instances:\n  port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])\n  data_dir: \"~/.local/share/llamactl\"               # Directory for all llamactl data (default varies by OS)\n  configs_dir: \"~/.local/share/llamactl/instances\"  # Directory for instance configs (default: data_dir/instances)\n  logs_dir: \"~/.local/share/llamactl/logs\"          # Directory for instance logs (default: data_dir/logs)\n  auto_create_dirs: true                            # Automatically create data/config/logs directories (default: true)\n  max_instances: -1                                 # Maximum instances (-1 = unlimited)\n  max_running_instances: -1                         # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true                         # Enable LRU eviction for idle instances\n  default_auto_restart: true                        # Default auto-restart setting\n  default_max_restarts: 3                           # Default maximum restart attempts\n  default_restart_delay: 5                          # Default restart delay in seconds\n  default_on_demand_start: true                     # Default on-demand start setting\n  on_demand_start_timeout: 120                      # Default on-demand start timeout in seconds\n  timeout_check_interval: 5                         # Default instance timeout check interval in minutes\n

Environment Variables:
• LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\")
• LLAMACTL_DATA_DIRECTORY - Data directory path
• LLAMACTL_INSTANCES_DIR - Instance configs directory path
• LLAMACTL_LOGS_DIR - Log directory path
• LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false)
• LLAMACTL_MAX_INSTANCES - Maximum number of instances
• LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances
• LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances
• LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false)
• LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts
• LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds
• LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false)
• LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds
• LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes
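
For example, to cap concurrently running instances and let idle ones be evicted via LRU (a sketch; the values are illustrative):

# Allow at most two running instances; idle instances become eviction candidates\nexport LLAMACTL_MAX_RUNNING_INSTANCES=2\nexport LLAMACTL_ENABLE_LRU_EVICTION=true\nllamactl\n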

      "},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"
      auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  inference_keys: []                     # List of valid inference API keys\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n

Environment Variables:
• LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false)
• LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys
• LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false)
• LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys
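
For example, to supply your own keys at startup (the key values below are placeholders):

# Supply API keys via environment variables; multiple keys are comma-separated\nexport LLAMACTL_MANAGEMENT_KEYS=\"sk-management-example\"\nexport LLAMACTL_INFERENCE_KEYS=\"sk-inference-1,sk-inference-2\"\nllamactl\n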

      "},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"

      View all available command line options:

      llamactl --help\n

      You can also override configuration using command line flags when starting llamactl.

      "},{"location":"getting-started/installation/","title":"Installation","text":"

      This guide will walk you through installing Llamactl on your system.

      "},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"

      llamactl supports multiple backends. Install at least one:

      For llama.cpp backend (all platforms):

      You need llama-server from llama.cpp installed:

      # Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n

      Or build from source - see llama.cpp docs

      For MLX backend (macOS only):

      MLX provides optimized inference on Apple Silicon. Install MLX-LM:

      # Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n

      Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

      For vLLM backend:

      vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

      # Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
      "},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"

      Download the latest release from the GitHub releases page:

      # Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
      "},{"location":"getting-started/installation/#option-2-build-from-source","title":"Option 2: Build from Source","text":"

Requirements:
• Go 1.24 or later
• Node.js 22 or later
• Git

      If you prefer to build from source:

      # Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
      "},{"location":"getting-started/installation/#verification","title":"Verification","text":"

      Verify your installation by checking the version:

      llamactl --version\n
      "},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"

      Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

      "},{"location":"getting-started/quick-start/","title":"Quick Start","text":"

      This guide will help you get Llamactl up and running in just a few minutes.

      "},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"

      Start the Llamactl server:

      llamactl\n

      By default, Llamactl will start on http://localhost:8080.

      "},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"

      Open your web browser and navigate to:

      http://localhost:8080\n

Log in with the management API key. By default, a key is generated during server startup; copy it from the terminal output.

      You should see the Llamactl web interface.
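
You can also verify the key from the command line against the management API (a sketch using the base path from the API Reference; the key is a placeholder):

curl -H \"Authorization: Bearer <your-management-key>\" \\\n  http://localhost:8080/api/v1/instances\n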

      "},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":"
      1. Click the \"Add Instance\" button
      2. Fill in the instance configuration:
      3. Name: Give your instance a descriptive name
      4. Backend Type: Choose from llama.cpp, MLX, or vLLM
      5. Model: Model path or identifier for your chosen backend
      6. Additional Options: Backend-specific parameters

      7. Click \"Create Instance\"

      "},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"

      Once created, you can:

      • Start the instance by clicking the start button
      • Monitor its status in real-time
      • View logs by clicking the logs button
      • Stop the instance when needed
      "},{"location":"getting-started/quick-start/#example-configurations","title":"Example Configurations","text":"

      Here are basic example configurations for each backend:

      llama.cpp backend:

      {\n  \"name\": \"llama2-7b\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n    \"threads\": 4,\n    \"ctx_size\": 2048,\n    \"gpu_layers\": 32\n  }\n}\n

      MLX backend (macOS only):

      {\n  \"name\": \"mistral-mlx\",\n  \"backend_type\": \"mlx_lm\",\n  \"backend_options\": {\n    \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n    \"temp\": 0.7,\n    \"max_tokens\": 2048\n  }\n}\n

      vLLM backend:

      {\n  \"name\": \"dialogpt-vllm\",\n  \"backend_type\": \"vllm\",\n  \"backend_options\": {\n    \"model\": \"microsoft/DialoGPT-medium\",\n    \"tensor_parallel_size\": 2,\n    \"gpu_memory_utilization\": 0.9\n  }\n}\n

      "},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"

      You can also manage instances via the REST API:

      # List all instances\ncurl http://localhost:8080/api/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\"\n    }\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/instances/my-model/start\n
      "},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"

      Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

      "},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"

      Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

      curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n
      "},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"

      You can also use the official OpenAI Python client:

from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"not-needed\"  # Replace with an inference API key if inference auth is enabled (the default)\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
      "},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"

      Get a list of running instances (models) in OpenAI-compatible format:

      curl http://localhost:8080/v1/models\n
      "},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":"
• Manage instances: see the Managing Instances guide
      • Explore the API Reference
      • Configure advanced settings in the Configuration guide
      "},{"location":"user-guide/api-reference/","title":"API Reference","text":"

      Complete reference for the Llamactl REST API.

      "},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"

      All API endpoints are relative to the base URL:

      http://localhost:8080/api/v1\n
      "},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"

      Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

      curl -H \"Authorization: Bearer <your-api-key>\" \\\n  http://localhost:8080/api/v1/instances\n

The server supports two types of API keys:
• Management API Keys: Required for instance management operations (CRUD operations on instances)
• Inference API Keys: Required for OpenAI-compatible inference endpoints

      "},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"

      Get the version information of the llamactl server.

      GET /api/v1/version\n

      Response:

      Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n

      "},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"

      Get help text for the llama-server command.

      GET /api/v1/server/help\n

      Response: Plain text help output from llama-server --help

      "},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"

      Get version information of the llama-server binary.

      GET /api/v1/server/version\n

      Response: Plain text version output from llama-server --version

      "},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"

      List available devices for llama-server.

      GET /api/v1/server/devices\n

      Response: Plain text device list from llama-server --list-devices

      "},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"

      Get a list of all instances.

      GET /api/v1/instances\n

      Response:

      [\n  {\n    \"name\": \"llama2-7b\",\n    \"status\": \"running\",\n    \"created\": 1705312200\n  }\n]\n

      "},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"

      Get detailed information about a specific instance.

      GET /api/v1/instances/{name}\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"

      Create and start a new instance.

      POST /api/v1/instances/{name}\n

      Request Body: JSON object with instance configuration. See Managing Instances for available configuration options.

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"

      Update an existing instance configuration. See Managing Instances for available configuration options.

      PUT /api/v1/instances/{name}\n

      Request Body: JSON object with configuration fields to update.

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"

      Stop and remove an instance.

      DELETE /api/v1/instances/{name}\n

      Response: 204 No Content

      "},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"

      Start a stopped instance.

      POST /api/v1/instances/{name}/start\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

Error Responses:
• 409 Conflict: Maximum number of running instances reached
• 500 Internal Server Error: Failed to start instance

      "},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"

      Stop a running instance.

      POST /api/v1/instances/{name}/stop\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"stopped\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"

      Restart an instance (stop then start).

      POST /api/v1/instances/{name}/restart\n

      Response:

      {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

      "},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"

      Retrieve instance logs.

      GET /api/v1/instances/{name}/logs\n

Query Parameters: - lines: Number of lines to return (defaults to all lines; use -1 to explicitly request all lines)

      Response: Plain text log output

      Example:

      curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n

      "},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"

      Proxy HTTP requests directly to the llama-server instance.

      GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n

      This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

      Example - Check Instance Health:

      curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/proxy/health\n

      This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

      Error Responses: - 503 Service Unavailable: Instance is not running

      "},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"

      Llamactl provides OpenAI-compatible endpoints for inference operations.

      "},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"

      List all instances in OpenAI-compatible format.

      GET /v1/models\n

      Response:

      {\n  \"object\": \"list\",\n  \"data\": [\n    {\n      \"id\": \"llama2-7b\",\n      \"object\": \"model\",\n      \"created\": 1705312200,\n      \"owned_by\": \"llamactl\"\n    }\n  ]\n}\n

      "},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"

      All OpenAI-compatible inference endpoints are available:

      POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n

      Request Body: Standard OpenAI format with model field specifying the instance name

      Example:

      {\n  \"model\": \"llama2-7b\",\n  \"messages\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Hello, how are you?\"\n    }\n  ]\n}\n

      The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

Error Responses:
• 400 Bad Request: Invalid request body or missing model name
• 503 Service Unavailable: Instance is not running and on-demand start is disabled
• 409 Conflict: Cannot start instance due to maximum instances limit

      "},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"

Instances can have the following status values:
• stopped: Instance is not running
• running: Instance is running and ready to accept requests
• failed: Instance failed to start or crashed

      "},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"

      All endpoints may return error responses in the following format:

      {\n  \"error\": \"Error message description\"\n}\n
      "},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":"
      • 200: Success
      • 201: Created
      • 204: No Content (successful deletion)
      • 400: Bad Request (invalid parameters or request body)
      • 401: Unauthorized (missing or invalid API key)
      • 403: Forbidden (insufficient permissions)
      • 404: Not Found (instance not found)
      • 409: Conflict (instance already exists, max instances reached)
      • 500: Internal Server Error
      • 503: Service Unavailable (instance not running)
      "},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"
      # Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"model\": \"/models/llama-2-7b.gguf\"\n  }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello!\"}\n    ],\n    \"max_tokens\": 100\n  }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n
      "},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"

      You can also directly proxy requests to the llama-server instance:

      # Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"prompt\": \"Hello, world!\",\n    \"n_predict\": 50\n  }'\n
      "},{"location":"user-guide/api-reference/#backend-specific-endpoints","title":"Backend-Specific Endpoints","text":""},{"location":"user-guide/api-reference/#parse-commands","title":"Parse Commands","text":"

      Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

      "},{"location":"user-guide/api-reference/#parse-llamacpp-command","title":"Parse Llama.cpp Command","text":"

      Parse a llama-server command string into instance options.

      POST /api/v1/backends/llama-cpp/parse-command\n

      Request Body:

      {\n  \"command\": \"llama-server -m /path/to/model.gguf -c 2048 --port 8080\"\n}\n

      Response:

      {\n  \"backend_type\": \"llama_cpp\",\n  \"llama_server_options\": {\n    \"model\": \"/path/to/model.gguf\",\n    \"ctx_size\": 2048,\n    \"port\": 8080\n  }\n}\n

      "},{"location":"user-guide/api-reference/#parse-mlx-lm-command","title":"Parse MLX-LM Command","text":"

      Parse an MLX-LM server command string into instance options.

      POST /api/v1/backends/mlx/parse-command\n

      Request Body:

      {\n  \"command\": \"mlx_lm.server --model /path/to/model --port 8080\"\n}\n

      Response:

      {\n  \"backend_type\": \"mlx_lm\",\n  \"mlx_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

      "},{"location":"user-guide/api-reference/#parse-vllm-command","title":"Parse vLLM Command","text":"

      Parse a vLLM serve command string into instance options.

      POST /api/v1/backends/vllm/parse-command\n

      Request Body:

      {\n  \"command\": \"vllm serve /path/to/model --port 8080\"\n}\n

      Response:

      {\n  \"backend_type\": \"vllm\",\n  \"vllm_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

Error Responses for Parse Commands:
• 400 Bad Request: Invalid request body, empty command, or parse error
• 500 Internal Server Error: Encoding error

      "},{"location":"user-guide/api-reference/#auto-generated-documentation","title":"Auto-Generated Documentation","text":"

      The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

      1. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
      2. Generate docs: swag init -g cmd/server/main.go -o apidocs
      "},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"

      If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

      http://localhost:8080/swagger/\n

      This provides a complete interactive interface for testing all API endpoints.

      "},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"

      Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

      "},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"

      Llamactl provides two ways to manage instances:

      • Web UI: Accessible at http://localhost:8080 with an intuitive dashboard
      • REST API: Programmatic access for automation and integration

      "},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"

If authentication is enabled:

1. Navigate to the web UI
2. Enter your credentials
3. The Bearer token is stored for the session

      "},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":"
      • Switch between light and dark themes
      • Setting is remembered across sessions
      "},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"

      Each instance is displayed as a card showing:

      • Instance name
      • Health status badge (unknown, ready, error, failed)
      • Action buttons (start, stop, edit, logs, delete)
      "},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
      1. Click the \"Create Instance\" button on the dashboard
      2. Enter a unique Name for your instance (only required field)
      3. Choose Backend Type:
        • llama.cpp: For GGUF models using llama-server
        • MLX: For MLX-optimized models (macOS only)
        • vLLM: For distributed serving and high-throughput inference
      4. Configure model source:
        • For llama.cpp: GGUF model path or HuggingFace repo
        • For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
        • For vLLM: HuggingFace model identifier (e.g., microsoft/DialoGPT-medium)
      5. Configure optional instance management settings:
        • Auto Restart: Automatically restart instance on failure
        • Max Restarts: Maximum number of restart attempts
        • Restart Delay: Delay in seconds between restart attempts
        • On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
        • Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
      6. Configure backend-specific options:
        • llama.cpp: Threads, context size, GPU layers, port, etc.
        • MLX: Temperature, top-p, adapter path, Python environment, etc.
        • vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
      7. Click \"Create\" to save the instance
      "},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"
      # Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"top_p\": 0.9,\n      \"max_tokens\": 2048\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3\n  }'\n\n# Create vLLM instance\ncurl -X POST http://localhost:8080/api/instances/my-vllm-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"vllm\",\n    \"backend_options\": {\n      \"model\": \"microsoft/DialoGPT-medium\",\n      \"tensor_parallel_size\": 2,\n      \"gpu_memory_utilization\": 0.9\n    },\n    \"auto_restart\": true,\n    \"on_demand_start\": true\n  }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/instances/gemma-3-27b \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n      \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n      \"gpu_layers\": 32\n    }\n  }'\n
      "},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
      1. Click the \"Start\" button on an instance card
      2. Watch the status change to \"Unknown\"
      3. Monitor progress in the logs
      4. Instance status changes to \"Ready\" when ready
      "},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"
      curl -X POST http://localhost:8080/api/instances/{name}/start\n
      "},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
      1. Click the \"Stop\" button on an instance card
      2. Instance gracefully shuts down
      "},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"
      curl -X POST http://localhost:8080/api/instances/{name}/stop\n
      "},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
      1. Click the \"Edit\" button on an instance card
      2. Modify settings in the configuration dialog
      3. Changes require instance restart to take effect
      4. Click \"Update & Restart\" to apply changes
      "},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"

      Modify instance settings:

      curl -X PUT http://localhost:8080/api/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"context_size\": 4096\n    }\n  }'\n

      Note

      Configuration changes require restarting the instance to take effect.
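
If you updated the instance via the API, you can then restart it using the restart endpoint documented in the API Reference:

curl -X POST http://localhost:8080/api/v1/instances/{name}/restart\n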

      "},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
      1. Click the \"Logs\" button on any instance card
      2. Real-time log viewer opens
      "},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"

Retrieve instance logs via the API:

# Get instance logs\ncurl http://localhost:8080/api/instances/{name}/logs\n
      "},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
      1. Click the \"Delete\" button on an instance card
      2. Only stopped instances can be deleted
      3. Confirm deletion in the dialog
      "},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"
      curl -X DELETE http://localhost:8080/api/instances/{name}\n
      "},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"

      Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

# Send a request to the instance through the proxy\ncurl http://localhost:8080/api/instances/{name}/proxy/\n

All backends provide OpenAI-compatible endpoints. Check the respective documentation:
• llama-server docs
• MLX-LM docs
• vLLM docs

      "},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
      1. The health status badge is displayed on each instance card
      "},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"

      Check the health status of your instances:

      curl http://localhost:8080/api/instances/{name}/proxy/health\n
      "},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"

      Issues specific to Llamactl deployment and operation.

      "},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"

      Problem: Invalid configuration preventing startup

      Solutions: 1. Use minimal configuration:

      server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n

2. Check data directory permissions:
        # Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
      "},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"

      Problem: Instance fails to start with model loading errors

Common Solutions:
• llama-server not found: Ensure llama-server binary is in PATH
• Wrong model format: Ensure model is in GGUF format
• Insufficient memory: Use smaller model or reduce context size
• Path issues: Use absolute paths to model files
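
To rule out a missing or outdated binary, a quick sanity check (assuming llama-server should be on your PATH):

# Verify the llama-server binary is on PATH and runs\nwhich llama-server\nllama-server --version\n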

      "},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"

      Problem: Out of memory errors or system becomes unresponsive

      Solutions: 1. Reduce context size:

      {\n  \"n_ctx\": 1024\n}\n

2. Use quantized models:
  • Try Q4_K_M instead of higher precision models
  • Use smaller model variants (7B instead of 13B)
      "},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"

      Problem: GPU not being used effectively

      Solutions: 1. Configure GPU layers:

      {\n  \"n_gpu_layers\": 35\n}\n

      "},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"

      Problem: Complex model loading, performance, or compatibility issues

      Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

Resources:
• llama.cpp Documentation: https://github.com/ggml/llama.cpp
• llama.cpp Issues: https://github.com/ggml/llama.cpp/issues
• llama.cpp Discussions: https://github.com/ggml/llama.cpp/discussions

      Testing directly with llama-server:

      # Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n

      This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.

      "},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"

      Problem: Web UI shows CORS errors in browser console

      Solutions: 1. Configure allowed origins:

      server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n

      "},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"

      Problem: API requests failing with authentication errors

      Solutions: 1. Disable authentication temporarily:

      auth:\n  require_management_auth: false\n  require_inference_auth: false\n

2. Configure API keys:

        auth:\n  management_keys:\n    - \"your-management-key\"\n  inference_keys:\n    - \"your-inference-key\"\n

3. Use the correct Authorization header:

        curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n

      "},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
      # Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
      "},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"
      export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
      "},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"

      When reporting issues, include:

      1. System information:

        llamactl --version\n

      2. Configuration file (remove sensitive keys)

      3. Relevant log output

      4. Steps to reproduce the issue

      "}]} \ No newline at end of file diff --git a/dev/sitemap.xml b/dev/sitemap.xml index 612b6ca..0f9d74b 100644 --- a/dev/sitemap.xml +++ b/dev/sitemap.xml @@ -2,37 +2,37 @@ https://llamactl.org/dev/ - 2025-09-18 + 2025-09-22 daily https://llamactl.org/dev/getting-started/configuration/ - 2025-09-18 + 2025-09-22 daily https://llamactl.org/dev/getting-started/installation/ - 2025-09-18 + 2025-09-22 daily https://llamactl.org/dev/getting-started/quick-start/ - 2025-09-18 + 2025-09-22 daily https://llamactl.org/dev/user-guide/api-reference/ - 2025-09-18 + 2025-09-22 daily https://llamactl.org/dev/user-guide/managing-instances/ - 2025-09-18 + 2025-09-22 daily https://llamactl.org/dev/user-guide/troubleshooting/ - 2025-09-18 + 2025-09-22 daily \ No newline at end of file diff --git a/dev/sitemap.xml.gz b/dev/sitemap.xml.gz index 4d409eeac37df0fede0de68e07ee4ea8fb58fd9f..f7a9f941f31ec9060182f442a3dd23817285a89d 100644 GIT binary patch delta 277 zcmV+w0qXvw0;2*jABzYGa;nj20{?SqbY*Q}a4vXlYyj1i!EVDK42JK0iYRwN8fnrx zrS!HZ*dBm^kuVs4d-)n%af669_;Y!z>aqZ@g0|kA%lEfe_FP`Xq0P}Dq=LjA&t;#| z_`>-#O%>V%gB-C6(Qs5f70B;RURS45914orV9l0jT~K0nL9%g94pJqzSZKKMme1E) zwZt?8Et8m}E?#xeC6s zCP$fQY!AR|J!PbN5Zr6F>T|7jNShHNGudk3a>G8EUYdEx%PtTu?wdEo bkUtz?AHuvF{|~zy^vQ`IUs1x#6a@eP;lhJW delta 278 zcmV+x0qOpu0;B>kABzYGZdlA{0{?SqbY*Q}a4vXlYyj1i!EVDK42JK0iikTQtu(2f zQhMtXY!ASQkuVs4efjELafgUD_%qv=6)V8&&{%KI?EUS^kL)6Mb&d`p6(sg}W?f3d zs}SQjmS_|7YQQo?BT#o#pm;PzRi2pK6%>=flFd=OpylL(WaFIbl}>K9Xy7htF{d1a;{&$~dlxNY7P cL;i4pT?o@|{6Fk`&?hH<0OJ|NToeTW0D7~8AOHXW diff --git a/dev/user-guide/api-reference/index.html b/dev/user-guide/api-reference/index.html index eb7b86b..7633cef 100644 --- a/dev/user-guide/api-reference/index.html +++ b/dev/user-guide/api-reference/index.html @@ -856,6 +856,72 @@
    + + +
  • + + + Backend-Specific Endpoints + + + + + +
  • + +
  • + + + Auto-Generated Documentation + + +
  • @@ -1216,6 +1282,72 @@ +
  • + +
  • + + + Backend-Specific Endpoints + + + + + +
  • + +
  • + + + Auto-Generated Documentation + + +
  • @@ -1346,7 +1478,7 @@

    Response:

    {
       "name": "llama2-7b",
    -  "status": "starting",
    +  "status": "running",
       "created": 1705312200
     }
     

    @@ -1360,7 +1492,7 @@

    Response:

    {
       "name": "llama2-7b",
    -  "status": "stopping",
    +  "status": "stopped",
       "created": 1705312200
     }
     

    @@ -1371,7 +1503,7 @@

    Response:

    {
       "name": "llama2-7b",
    -  "status": "restarting",
    +  "status": "running",
       "created": 1705312200
     }
     

    @@ -1443,9 +1575,9 @@ - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit

    Instance Status Values

    -

    Instances can have the following status values:
    -- stopped: Instance is not running
    -- running: Instance is running and ready to accept requests
    +

    Instances can have the following status values: +- stopped: Instance is not running +- running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed

    Error Responses

    All endpoints may return error responses in the following format:

    @@ -1515,9 +1647,76 @@ "n_predict": 50 }'
  • +

    Backend-Specific Endpoints

    +

    Parse Commands

    +

    Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

    +

    Parse Llama.cpp Command

    +

    Parse a llama-server command string into instance options.

    +
    POST /api/v1/backends/llama-cpp/parse-command
    +
    +

    Request Body: +

    {
    +  "command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
    +}
    +

    +

    Response: +

    {
    +  "backend_type": "llama_cpp",
    +  "llama_server_options": {
    +    "model": "/path/to/model.gguf",
    +    "ctx_size": 2048,
    +    "port": 8080
    +  }
    +}
    +

    +

    Parse MLX-LM Command

    +

    Parse an MLX-LM server command string into instance options.

    +
    POST /api/v1/backends/mlx/parse-command
    +
    +

    Request Body: +

    {
    +  "command": "mlx_lm.server --model /path/to/model --port 8080"
    +}
    +

    +

    Response: +

    {
    +  "backend_type": "mlx_lm",
    +  "mlx_server_options": {
    +    "model": "/path/to/model",
    +    "port": 8080
    +  }
    +}
    +

    +

    Parse vLLM Command

    +

    Parse a vLLM serve command string into instance options.

    +
    POST /api/v1/backends/vllm/parse-command
    +
    +

    Request Body: +

    {
    +  "command": "vllm serve /path/to/model --port 8080"
    +}
    +

    +

    Response: +

    {
    +  "backend_type": "vllm",
    +  "vllm_server_options": {
    +    "model": "/path/to/model",
    +    "port": 8080
    +  }
    +}
    +

    +

    Error Responses for Parse Commands: +- 400 Bad Request: Invalid request body, empty command, or parse error +- 500 Internal Server Error: Encoding error

    +

    Auto-Generated Documentation

    +

    The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

    +
      +
    1. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
    2. +
    3. Generate docs: swag init -g cmd/server/main.go -o apidocs
    4. +

    Swagger Documentation

    If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

    -
    http://localhost:8080/swagger/
    +
    http://localhost:8080/swagger/
     

    This provides a complete interactive interface for testing all API endpoints.

    @@ -1540,7 +1739,7 @@ - September 3, 2025 + September 21, 2025 diff --git a/dev/user-guide/managing-instances/index.html b/dev/user-guide/managing-instances/index.html index fdccc29..b005e1c 100644 --- a/dev/user-guide/managing-instances/index.html +++ b/dev/user-guide/managing-instances/index.html @@ -1228,7 +1228,7 @@

    Managing Instances

    -

    Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API.

    +

    Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

    Overview

    Llamactl provides two ways to manage instances:

      @@ -1262,11 +1262,13 @@
    • Choose Backend Type:
      • llama.cpp: For GGUF models using llama-server
      • MLX: For MLX-optimized models (macOS only)
      • +
      • vLLM: For distributed serving and high-throughput inference
    • Configure model source:
      • For llama.cpp: GGUF model path or HuggingFace repo
      • For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
      • +
      • For vLLM: HuggingFace model identifier (e.g., microsoft/DialoGPT-medium)
    • Configure optional instance management settings:
        @@ -1280,6 +1282,7 @@
      • Configure backend-specific options:
        • llama.cpp: Threads, context size, GPU layers, port, etc.
        • MLX: Temperature, top-p, adapter path, Python environment, etc.
        • +
        • vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
      • Click "Create" to save the instance
      • @@ -1313,17 +1316,31 @@ "max_restarts": 3 }' -# Create llama.cpp instance with HuggingFace model -curl -X POST http://localhost:8080/api/instances/gemma-3-27b \ +# Create vLLM instance +curl -X POST http://localhost:8080/api/instances/my-vllm-instance \ -H "Content-Type: application/json" \ -d '{ - "backend_type": "llama_cpp", + "backend_type": "vllm", "backend_options": { - "hf_repo": "unsloth/gemma-3-27b-it-GGUF", - "hf_file": "gemma-3-27b-it-GGUF.gguf", - "gpu_layers": 32 - } - }' + "model": "microsoft/DialoGPT-medium", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 + }, + "auto_restart": true, + "on_demand_start": true + }' + +# Create llama.cpp instance with HuggingFace model +curl -X POST http://localhost:8080/api/instances/gemma-3-27b \ + -H "Content-Type: application/json" \ + -d '{ + "backend_type": "llama_cpp", + "backend_options": { + "hf_repo": "unsloth/gemma-3-27b-it-GGUF", + "hf_file": "gemma-3-27b-it-GGUF.gguf", + "gpu_layers": 32 + } + }'

    Start Instance

    Via Web UI

    @@ -1390,13 +1407,14 @@
    curl -X DELETE http://localhost:8080/api/instances/{name}
     

    Instance Proxy

    -

    Llamactl proxies all requests to the underlying backend instances (llama-server or MLX).

    +

    Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

    # Get instance details
     curl http://localhost:8080/api/instances/{name}/proxy/
     
    -

    Both backends provide OpenAI-compatible endpoints. Check the respective documentation: +

    All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs -- MLX-LM docs

    +- MLX-LM docs +- vLLM docs

    Instance Health

    Via Web UI

      @@ -1426,7 +1444,7 @@ - September 18, 2025 + September 21, 2025