From 8820dc114693664e861b4198d08a3686c28cc68c Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Thu, 18 Sep 2025 20:01:18 +0200
Subject: [PATCH] Enhance documentation for MLX backend support

---
 docs/getting-started/configuration.md | 33 ++++++++++-----
 docs/getting-started/installation.md  | 25 ++++++++++--
 docs/index.md                         | 19 +++++----
 docs/user-guide/managing-instances.md | 59 +++++++++++++++++----------
 4 files changed, 92 insertions(+), 44 deletions(-)

diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md
index 64b097a..f8003ef 100644
--- a/docs/getting-started/configuration.md
+++ b/docs/getting-started/configuration.md
@@ -19,6 +19,10 @@ server:
   allowed_origins: ["*"] # Allowed CORS origins (default: all)
   enable_swagger: false # Enable Swagger UI for API docs
 
+backends:
+  llama_executable: llama-server # Path to llama-server executable
+  mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable
+
 instances:
   port_range: [8000, 9000] # Port range for instances
   data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below)
@@ -28,7 +32,6 @@ instances:
   max_instances: -1 # Max instances (-1 = unlimited)
   max_running_instances: -1 # Max running instances (-1 = unlimited)
   enable_lru_eviction: true # Enable LRU eviction for idle instances
-  llama_executable: llama-server # Path to llama-server executable
   default_auto_restart: true # Auto-restart new instances by default
   default_max_restarts: 3 # Max restarts for new instances
   default_restart_delay: 5 # Restart delay (seconds) for new instances
@@ -79,11 +82,23 @@ server:
   enable_swagger: false # Enable Swagger UI (default: false)
 ```
 
-**Environment Variables:**
-- `LLAMACTL_HOST` - Server host
-- `LLAMACTL_PORT` - Server port
-- `LLAMACTL_ALLOWED_ORIGINS` - Comma-separated CORS origins
-- `LLAMACTL_ENABLE_SWAGGER` - Enable Swagger UI (true/false)
+**Environment Variables:**
+- `LLAMACTL_HOST` - Server host
+- `LLAMACTL_PORT` - Server port
+- `LLAMACTL_ALLOWED_ORIGINS` - Comma-separated CORS origins
+- `LLAMACTL_ENABLE_SWAGGER` - Enable Swagger UI (true/false)
+
+### Backend Configuration
+
+```yaml
+backends:
+  llama_executable: "llama-server" # Path to llama-server executable (default: "llama-server")
+  mlx_lm_executable: "mlx_lm.server" # Path to mlx_lm.server executable (default: "mlx_lm.server")
+```
+
+**Environment Variables:**
+- `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable
+- `LLAMACTL_MLX_LM_EXECUTABLE` - Path to mlx_lm.server executable
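+
+For example, here is a minimal sketch of pointing llamactl at specific backend executables through these variables. The paths below are illustrative only; adjust them for your system:
+
+```bash
+# Use a specific llama.cpp build for the llama.cpp backend (example path)
+export LLAMACTL_LLAMA_EXECUTABLE="/opt/llama.cpp/bin/llama-server"
+
+# Use mlx_lm.server from a Python virtual environment for the MLX backend (example path, macOS)
+export LLAMACTL_MLX_LM_EXECUTABLE="$HOME/mlx-env/bin/mlx_lm.server"
+
+# Start llamactl afterwards so it picks up the overrides
+llamactl
+```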
 
 ### Instance Configuration
 
@@ -97,7 +112,6 @@ instances:
   max_instances: -1 # Maximum instances (-1 = unlimited)
   max_running_instances: -1 # Maximum running instances (-1 = unlimited)
   enable_lru_eviction: true # Enable LRU eviction for idle instances
-  llama_executable: "llama-server" # Path to llama-server executable
   default_auto_restart: true # Default auto-restart setting
   default_max_restarts: 3 # Default maximum restart attempts
   default_restart_delay: 5 # Default restart delay in seconds
@@ -113,9 +127,8 @@ instances:
 - `LLAMACTL_LOGS_DIR` - Log directory path
 - `LLAMACTL_AUTO_CREATE_DATA_DIR` - Auto-create data/config/logs directories (true/false)
 - `LLAMACTL_MAX_INSTANCES` - Maximum number of instances
-- `LLAMACTL_MAX_RUNNING_INSTANCES` - Maximum number of running instances
-- `LLAMACTL_ENABLE_LRU_EVICTION` - Enable LRU eviction for idle instances
-- `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable
+- `LLAMACTL_MAX_RUNNING_INSTANCES` - Maximum number of running instances
+- `LLAMACTL_ENABLE_LRU_EVICTION` - Enable LRU eviction for idle instances
 - `LLAMACTL_DEFAULT_AUTO_RESTART` - Default auto-restart setting (true/false)
 - `LLAMACTL_DEFAULT_MAX_RESTARTS` - Default maximum restarts
 - `LLAMACTL_DEFAULT_RESTART_DELAY` - Default restart delay in seconds
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
index 90f78a8..a3ceae6 100644
--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -4,11 +4,14 @@
 This guide will walk you through installing Llamactl on your system.
 
 ## Prerequisites
 
+### Backend Dependencies
+
+Llamactl supports multiple backends. Install at least one:
+
+**For llama.cpp backend (all platforms):**
+
 You need `llama-server` from [llama.cpp](https://github.com/ggml-org/llama.cpp) installed:
-
-**Quick install methods:**
-
 ```bash
 # Homebrew (macOS/Linux)
 brew install llama.cpp
@@ -18,6 +21,22 @@ winget install llama.cpp
 ```
 Or build from source - see llama.cpp docs
 
+**For MLX backend (macOS only):**
+
+MLX provides optimized inference on Apple Silicon. Install MLX-LM:
+
+```bash
+# Install via pip (requires Python 3.8+)
+pip install mlx-lm
+
+# Or in a virtual environment (recommended)
+python -m venv mlx-env
+source mlx-env/bin/activate
+pip install mlx-lm
+```
+
+Note: The MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.).
+
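+To verify that the executables are on your PATH, you can ask each server for its help text (a quick sanity check; skip whichever backend you did not install):
+
+```bash
+llama-server --help   # llama.cpp backend
+mlx_lm.server --help  # MLX backend
+```
+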
 ## Installation Methods
 
 ### Option 1: Download Binary (Recommended)
diff --git a/docs/index.md b/docs/index.md
index d3e7bb9..585363c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,22 +1,23 @@
 # Llamactl Documentation
 
-Welcome to the Llamactl documentation! **Management server and proxy for multiple llama.cpp instances with OpenAI-compatible API routing.**
+Welcome to the Llamactl documentation! **Management server and proxy for multiple llama.cpp and MLX instances with OpenAI-compatible API routing.**
 
 ![Dashboard Screenshot](images/dashboard.png)
 
 ## What is Llamactl?
 
-Llamactl is designed to simplify the deployment and management of llama-server instances. It provides a modern solution for running multiple large language models with centralized management.
+Llamactl is designed to simplify the deployment and management of llama-server and MLX instances. It provides a modern solution for running multiple large language models with centralized management and multi-backend support.
 
 ## Features
 
-🚀 **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
-🔗 **OpenAI API Compatible**: Drop-in replacement - route requests by model name
-🌐 **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
-🔐 **API Key Authentication**: Separate keys for management vs inference access
-📊 **Instance Monitoring**: Health checks, auto-restart, log management
-⚡ **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
-💡 **On-Demand Instance Start**: Automatically launch instances upon receiving OpenAI-compatible API requests
+🚀 **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
+🔗 **OpenAI API Compatible**: Drop-in replacement - route requests by model name
+🍎 **Multi-Backend Support**: Native support for both llama.cpp and MLX (Apple Silicon optimized)
+🌐 **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
+🔐 **API Key Authentication**: Separate keys for management vs inference access
+📊 **Instance Monitoring**: Health checks, auto-restart, log management
+⚡ **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
+💡 **On-Demand Instance Start**: Automatically launch instances upon receiving OpenAI-compatible API requests
 💾 **State Persistence**: Ensure instances remain intact across server restarts
 
 ## Quick Links
diff --git a/docs/user-guide/managing-instances.md b/docs/user-guide/managing-instances.md
index 90e4552..186670c 100644
--- a/docs/user-guide/managing-instances.md
+++ b/docs/user-guide/managing-instances.md
@@ -1,6 +1,6 @@
 # Managing Instances
 
-Learn how to effectively manage your Llama.cpp instances with Llamactl through both the Web UI and API.
+Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API.
 
 ## Overview
 
@@ -39,40 +39,55 @@ Each instance is displayed as a card showing:
 
 1. Click the **"Create Instance"** button on the dashboard
 2. Enter a unique **Name** for your instance (only required field)
-3. Configure model source (choose one):
-   - **Model Path**: Full path to your downloaded GGUF model file
-   - **HuggingFace Repo**: Repository name (e.g., `unsloth/gemma-3-27b-it-GGUF`)
-   - **HuggingFace File**: Specific file within the repo (optional, uses default if not specified)
-4. Configure optional instance management settings:
+3. **Choose Backend Type**:
+   - **llama.cpp**: For GGUF models using llama-server
+   - **MLX**: For MLX-optimized models (macOS only)
+4. Configure model source:
+   - **For llama.cpp**: GGUF model path or HuggingFace repo
+   - **For MLX**: MLX model path or identifier (e.g., `mlx-community/Mistral-7B-Instruct-v0.3-4bit`)
+5. Configure optional instance management settings:
    - **Auto Restart**: Automatically restart instance on failure
   - **Max Restarts**: Maximum number of restart attempts
   - **Restart Delay**: Delay in seconds between restart attempts
   - **On Demand Start**: Start instance when receiving a request to the OpenAI compatible endpoint
   - **Idle Timeout**: Minutes before stopping idle instance (set to 0 to disable)
-5. Configure optional llama-server backend options:
-   - **Threads**: Number of CPU threads to use
-   - **Context Size**: Context window size (ctx_size)
-   - **GPU Layers**: Number of layers to offload to GPU
-   - **Port**: Network port (auto-assigned by llamactl if not specified)
-   - **Additional Parameters**: Any other llama-server command line options (see [llama-server documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md))
-6. Click **"Create"** to save the instance
+6. Configure backend-specific options:
+   - **llama.cpp**: Threads, context size, GPU layers, port, etc.
+   - **MLX**: Temperature, top-p, adapter path, Python environment, etc.
+7. Click **"Create"** to save the instance
 
 ### Via API
 
 ```bash
-# Create instance with local model file
-curl -X POST http://localhost:8080/api/instances/my-instance \
+# Create llama.cpp instance with local model file
+curl -X POST http://localhost:8080/api/instances/my-llama-instance \
   -H "Content-Type: application/json" \
   -d '{
     "backend_type": "llama_cpp",
     "backend_options": {
       "model": "/path/to/model.gguf",
       "threads": 8,
-      "ctx_size": 4096
+      "ctx_size": 4096,
+      "gpu_layers": 32
     }
   }'
 
-# Create instance with HuggingFace model
+# Create MLX instance (macOS only)
+curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
+  -H "Content-Type: application/json" \
+  -d '{
+    "backend_type": "mlx_lm",
+    "backend_options": {
+      "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
+      "temp": 0.7,
+      "top_p": 0.9,
+      "max_tokens": 2048
+    },
+    "auto_restart": true,
+    "max_restarts": 3
+  }'
+
+# Create llama.cpp instance with HuggingFace model
 curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
   -H "Content-Type: application/json" \
   -d '{
@@ -81,9 +96,7 @@ curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
       "hf_repo": "unsloth/gemma-3-27b-it-GGUF",
       "hf_file": "gemma-3-27b-it-GGUF.gguf",
       "gpu_layers": 32
-    },
-    "auto_restart": true,
-    "max_restarts": 3
+    }
   }'
 ```
 
@@ -166,14 +179,16 @@ curl -X DELETE http://localhost:8080/api/instances/{name}
 
 ## Instance Proxy
 
-Llamactl proxies all requests to the underlying llama-server instances.
+Llamactl proxies all requests to the underlying backend instances (llama-server or MLX).
 
 ```bash
 # Get instance details
 curl http://localhost:8080/api/instances/{name}/proxy/
 ```
 
-Check llama-server [docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) for more information.
+Both backends provide OpenAI-compatible endpoints. Check the respective documentation:
+- [llama-server docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
+- [MLX-LM docs](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md)
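+
+For example, a chat completion can be sent to a running instance through this proxy. This is a sketch that assumes the backend's `/v1/chat/completions` route is reachable under the proxy path shown above; the instance name and `model` value are illustrative, and an inference API key header may be required if authentication is enabled:
+
+```bash
+curl -X POST http://localhost:8080/api/instances/my-mlx-instance/proxy/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "my-mlx-instance",
+    "messages": [{"role": "user", "content": "Hello!"}]
+  }'
+```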
 
 ### Instance Health