From 8820dc114693664e861b4198d08a3686c28cc68c Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Thu, 18 Sep 2025 20:01:18 +0200
Subject: [PATCH] Enhance documentation for MLX backend support

---
 docs/getting-started/configuration.md | 33 ++++++++++-----
 docs/getting-started/installation.md  | 25 ++++++++++--
 docs/index.md                         | 19 +++++----
 docs/user-guide/managing-instances.md | 59 +++++++++++++++++----------
 4 files changed, 92 insertions(+), 44 deletions(-)

diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md
index 64b097a..f8003ef 100644
--- a/docs/getting-started/configuration.md
+++ b/docs/getting-started/configuration.md
@@ -19,6 +19,10 @@ server:
   allowed_origins: ["*"] # Allowed CORS origins (default: all)
   enable_swagger: false # Enable Swagger UI for API docs
 
+backends:
+  llama_executable: llama-server # Path to llama-server executable
+  mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable
+
 instances:
   port_range: [8000, 9000] # Port range for instances
   data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below)
@@ -28,7 +32,6 @@ instances:
   max_instances: -1 # Max instances (-1 = unlimited)
   max_running_instances: -1 # Max running instances (-1 = unlimited)
   enable_lru_eviction: true # Enable LRU eviction for idle instances
-  llama_executable: llama-server # Path to llama-server executable
   default_auto_restart: true # Auto-restart new instances by default
   default_max_restarts: 3 # Max restarts for new instances
   default_restart_delay: 5 # Restart delay (seconds) for new instances
@@ -79,11 +82,23 @@ server:
   enable_swagger: false # Enable Swagger UI (default: false)
 ```
 
-**Environment Variables:**
-- `LLAMACTL_HOST` - Server host
-- `LLAMACTL_PORT` - Server port
-- `LLAMACTL_ALLOWED_ORIGINS` - Comma-separated CORS origins
-- `LLAMACTL_ENABLE_SWAGGER` - Enable Swagger UI (true/false)
+**Environment Variables:**
+- `LLAMACTL_HOST` - Server host
+- `LLAMACTL_PORT` - Server port
+- `LLAMACTL_ALLOWED_ORIGINS` - Comma-separated CORS origins
+- `LLAMACTL_ENABLE_SWAGGER` - Enable Swagger UI (true/false)
+
+### Backend Configuration
+
+```yaml
+backends:
+  llama_executable: "llama-server" # Path to llama-server executable (default: "llama-server")
+  mlx_lm_executable: "mlx_lm.server" # Path to mlx_lm.server executable (default: "mlx_lm.server")
+```
+
+**Environment Variables:**
+- `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable
+- `LLAMACTL_MLX_LM_EXECUTABLE` - Path to mlx_lm.server executable
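+
+For example, here is a minimal sketch of pointing llamactl at specific backend executables through these variables. The paths below are illustrative only; adjust them for your system:
+
+```bash
+# Use a specific llama.cpp build for the llama.cpp backend (example path)
+export LLAMACTL_LLAMA_EXECUTABLE="/opt/llama.cpp/bin/llama-server"
+
+# Use mlx_lm.server from a Python virtual environment for the MLX backend (example path, macOS)
+export LLAMACTL_MLX_LM_EXECUTABLE="$HOME/mlx-env/bin/mlx_lm.server"
+
+# Start llamactl afterwards so it picks up the overrides
+llamactl
+```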
 
 ### Instance Configuration
 
@@ -97,7 +112,6 @@ instances:
   max_instances: -1 # Maximum instances (-1 = unlimited)
   max_running_instances: -1 # Maximum running instances (-1 = unlimited)
   enable_lru_eviction: true # Enable LRU eviction for idle instances
-  llama_executable: "llama-server" # Path to llama-server executable
   default_auto_restart: true # Default auto-restart setting
   default_max_restarts: 3 # Default maximum restart attempts
   default_restart_delay: 5 # Default restart delay in seconds
@@ -113,9 +127,8 @@ instances:
 - `LLAMACTL_LOGS_DIR` - Log directory path
 - `LLAMACTL_AUTO_CREATE_DATA_DIR` - Auto-create data/config/logs directories (true/false)
 - `LLAMACTL_MAX_INSTANCES` - Maximum number of instances
-- `LLAMACTL_MAX_RUNNING_INSTANCES` - Maximum number of running instances
-- `LLAMACTL_ENABLE_LRU_EVICTION` - Enable LRU eviction for idle instances
-- `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable
+- `LLAMACTL_MAX_RUNNING_INSTANCES` - Maximum number of running instances
+- `LLAMACTL_ENABLE_LRU_EVICTION` - Enable LRU eviction for idle instances
 - `LLAMACTL_DEFAULT_AUTO_RESTART` - Default auto-restart setting (true/false)
 - `LLAMACTL_DEFAULT_MAX_RESTARTS` - Default maximum restarts
 - `LLAMACTL_DEFAULT_RESTART_DELAY` - Default restart delay in seconds
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
index 90f78a8..a3ceae6 100644
--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -4,11 +4,14 @@
 This guide will walk you through installing Llamactl on your system.
 
 ## Prerequisites
 
+### Backend Dependencies
+
+Llamactl supports multiple backends. Install at least one:
+
+**For llama.cpp backend (all platforms):**
+
 You need `llama-server` from [llama.cpp](https://github.com/ggml-org/llama.cpp) installed:
-
-**Quick install methods:**
-
 ```bash
 # Homebrew (macOS/Linux)
 brew install llama.cpp
@@ -18,6 +21,22 @@ winget install llama.cpp
 ```
 Or build from source - see llama.cpp docs
 
+**For MLX backend (macOS only):**
+
+MLX provides optimized inference on Apple Silicon. Install MLX-LM:
+
+```bash
+# Install via pip (requires Python 3.8+)
+pip install mlx-lm
+
+# Or in a virtual environment (recommended)
+python -m venv mlx-env
+source mlx-env/bin/activate
+pip install mlx-lm
+```
+
+Note: The MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.).
+
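+To verify that the executables are on your PATH, you can ask each server for its help text (a quick sanity check; skip whichever backend you did not install):
+
+```bash
+llama-server --help   # llama.cpp backend
+mlx_lm.server --help  # MLX backend
+```
+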
 ## Installation Methods
 
 ### Option 1: Download Binary (Recommended)
diff --git a/docs/index.md b/docs/index.md
index d3e7bb9..585363c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,22 +1,23 @@
 # Llamactl Documentation
 
-Welcome to the Llamactl documentation! **Management server and proxy for multiple llama.cpp instances with OpenAI-compatible API routing.**
+Welcome to the Llamactl documentation! **Management server and proxy for multiple llama.cpp and MLX instances with OpenAI-compatible API routing.**
 
 ![Dashboard Screenshot](images/dashboard.png)
 
 ## What is Llamactl?
 
-Llamactl is designed to simplify the deployment and management of llama-server instances. It provides a modern solution for running multiple large language models with centralized management.
+Llamactl is designed to simplify the deployment and management of llama-server and MLX instances. It provides a modern solution for running multiple large language models with centralized management and multi-backend support.
 
 ## Features
 
-🚀 **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
-🔗 **OpenAI API Compatible**: Drop-in replacement - route requests by model name
-🌐 **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
-🔐 **API Key Authentication**: Separate keys for management vs inference access
-📊 **Instance Monitoring**: Health checks, auto-restart, log management
-⚡ **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
-💡 **On-Demand Instance Start**: Automatically launch instances upon receiving OpenAI-compatible API requests
+🚀 **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
+🔗 **OpenAI API Compatible**: Drop-in replacement - route requests by model name
+🍎 **Multi-Backend Support**: Native support for both llama.cpp and MLX (Apple Silicon optimized)
+🌐 **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
+🔐 **API Key Authentication**: Separate keys for management vs inference access
+📊 **Instance Monitoring**: Health checks, auto-restart, log management
+⚡ **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
+💡 **On-Demand Instance Start**: Automatically launch instances upon receiving OpenAI-compatible API requests
 💾 **State Persistence**: Ensure instances remain intact across server restarts
 
 ## Quick Links
diff --git a/docs/user-guide/managing-instances.md b/docs/user-guide/managing-instances.md
index 90e4552..186670c 100644
--- a/docs/user-guide/managing-instances.md
+++ b/docs/user-guide/managing-instances.md
@@ -1,6 +1,6 @@
 # Managing Instances
 
-Learn how to effectively manage your Llama.cpp instances with Llamactl through both the Web UI and API.
+Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API.
 
 ## Overview
 
@@ -39,40 +39,55 @@ Each instance is displayed as a card showing:
 
 1. Click the **"Create Instance"** button on the dashboard
 2. Enter a unique **Name** for your instance (only required field)
-3. Configure model source (choose one):
-   - **Model Path**: Full path to your downloaded GGUF model file
-   - **HuggingFace Repo**: Repository name (e.g., `unsloth/gemma-3-27b-it-GGUF`)
-   - **HuggingFace File**: Specific file within the repo (optional, uses default if not specified)
-4. Configure optional instance management settings:
+3. **Choose Backend Type**:
+   - **llama.cpp**: For GGUF models using llama-server
+   - **MLX**: For MLX-optimized models (macOS only)
+4. Configure model source:
+   - **For llama.cpp**: GGUF model path or HuggingFace repo
+   - **For MLX**: MLX model path or identifier (e.g., `mlx-community/Mistral-7B-Instruct-v0.3-4bit`)
+5. Configure optional instance management settings:
    - **Auto Restart**: Automatically restart instance on failure
   - **Max Restarts**: Maximum number of restart attempts
   - **Restart Delay**: Delay in seconds between restart attempts
   - **On Demand Start**: Start instance when receiving a request to the OpenAI compatible endpoint
   - **Idle Timeout**: Minutes before stopping idle instance (set to 0 to disable)
-5. Configure optional llama-server backend options:
-   - **Threads**: Number of CPU threads to use
-   - **Context Size**: Context window size (ctx_size)
-   - **GPU Layers**: Number of layers to offload to GPU
-   - **Port**: Network port (auto-assigned by llamactl if not specified)
-   - **Additional Parameters**: Any other llama-server command line options (see [llama-server documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md))
-6. Click **"Create"** to save the instance
+6. Configure backend-specific options:
+   - **llama.cpp**: Threads, context size, GPU layers, port, etc.
+   - **MLX**: Temperature, top-p, adapter path, Python environment, etc.
+7. Click **"Create"** to save the instance
 
 ### Via API
 
 ```bash
-# Create instance with local model file
-curl -X POST http://localhost:8080/api/instances/my-instance \
+# Create llama.cpp instance with local model file
+curl -X POST http://localhost:8080/api/instances/my-llama-instance \
   -H "Content-Type: application/json" \
   -d '{
     "backend_type": "llama_cpp",
     "backend_options": {
       "model": "/path/to/model.gguf",
       "threads": 8,
-      "ctx_size": 4096
+      "ctx_size": 4096,
+      "gpu_layers": 32
     }
   }'
 
-# Create instance with HuggingFace model
+# Create MLX instance (macOS only)
+curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
+  -H "Content-Type: application/json" \
+  -d '{
+    "backend_type": "mlx_lm",
+    "backend_options": {
+      "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
+      "temp": 0.7,
+      "top_p": 0.9,
+      "max_tokens": 2048
+    },
+    "auto_restart": true,
+    "max_restarts": 3
+  }'
+
+# Create llama.cpp instance with HuggingFace model
 curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
   -H "Content-Type: application/json" \
   -d '{
@@ -81,9 +96,7 @@ curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
       "hf_repo": "unsloth/gemma-3-27b-it-GGUF",
       "hf_file": "gemma-3-27b-it-GGUF.gguf",
       "gpu_layers": 32
-    },
-    "auto_restart": true,
-    "max_restarts": 3
+    }
   }'
 ```
 
@@ -166,14 +179,16 @@ curl -X DELETE http://localhost:8080/api/instances/{name}
 
 ## Instance Proxy
 
-Llamactl proxies all requests to the underlying llama-server instances.
+Llamactl proxies all requests to the underlying backend instances (llama-server or MLX).
 
 ```bash
 # Get instance details
 curl http://localhost:8080/api/instances/{name}/proxy/
 ```
 
-Check llama-server [docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) for more information.
+Both backends provide OpenAI-compatible endpoints. Check the respective documentation:
+- [llama-server docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
+- [MLX-LM docs](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md)
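+
+For example, a chat completion can be sent to a running instance through this proxy. This is a sketch that assumes the backend's `/v1/chat/completions` route is reachable under the proxy path shown above; the instance name and `model` value are illustrative, and an inference API key header may be required if authentication is enabled:
+
+```bash
+curl -X POST http://localhost:8080/api/instances/my-mlx-instance/proxy/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "my-mlx-instance",
+    "messages": [{"role": "user", "content": "Hello!"}]
+  }'
+```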
 
 ### Instance Health