From 11296bc5f879437d7a4ca64375423918158245d2 Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Thu, 18 Sep 2025 19:34:40 +0200
Subject: [PATCH] Update README to include MLX backend support and enhance usage instructions

---
 README.md | 62 ++++++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index a2a1e48..6907ce5 100644
--- a/README.md
+++ b/README.md
@@ -2,30 +2,28 @@
 
 ![Build and Release](https://github.com/lordmathis/llamactl/actions/workflows/release.yaml/badge.svg) ![Go Tests](https://github.com/lordmathis/llamactl/actions/workflows/go_test.yaml/badge.svg) ![WebUI Tests](https://github.com/lordmathis/llamactl/actions/workflows/webui_test.yaml/badge.svg)
 
-**Management server and proxy for multiple llama.cpp instances with OpenAI-compatible API routing.**
+**Management server and proxy for multiple llama.cpp and MLX instances with OpenAI-compatible API routing.**
 
 ## Why llamactl?
 
-🚀 **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
-🔗 **OpenAI API Compatible**: Drop-in replacement - route requests by model name
-🌐 **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
-🔐 **API Key Authentication**: Separate keys for management vs inference access
-📊 **Instance Monitoring**: Health checks, auto-restart, log management
-⚡ **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
-💡 **On-Demand Instance Start**: Automatically launch instances upon receiving OpenAI-compatible API requests
+🚀 **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
+🔗 **OpenAI API Compatible**: Drop-in replacement - route requests by model name
+🍎 **Multi-Backend Support**: Native support for both llama.cpp and MLX (Apple Silicon optimized)
+🌐 **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
+🔐 **API Key Authentication**: Separate keys for management vs inference access
+📊 **Instance Monitoring**: Health checks, auto-restart, log management
+⚡ **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
+💡 **On-Demand Instance Start**: Automatically launch instances upon receiving OpenAI-compatible API requests
 💾 **State Persistence**: Ensure instances remain intact across server restarts
 
 ![Dashboard Screenshot](docs/images/dashboard.png)
 
-**Choose llamactl if**: You need authentication, health monitoring, auto-restart, and centralized management of multiple llama-server instances
-**Choose Ollama if**: You want the simplest setup with strong community ecosystem and third-party integrations
-**Choose LM Studio if**: You prefer a polished desktop GUI experience with easy model management
-
 ## Quick Start
 
 ```bash
-# 1. Install llama-server (one-time setup)
-# See: https://github.com/ggml-org/llama.cpp#quick-start
+# 1. Install backend (one-time setup)
+# For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start
+# For MLX on macOS: pip install mlx-lm
 
 # 2. Download and run llamactl
 LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
@@ -42,15 +40,21 @@ llamactl
 ### Create and manage instances via web dashboard:
 1. Open http://localhost:8080
 2. Click "Create Instance"
-3. Set model path and GPU layers
-4. Start or stop the instance
+3. Choose backend type (llama.cpp or MLX)
+4. Set model path and backend-specific options
+5. Start or stop the instance
 
 ### Or use the REST API:
 ```bash
-# Create instance
+# Create llama.cpp instance
 curl -X POST localhost:8080/api/v1/instances/my-7b-model \
   -H "Authorization: Bearer your-key" \
-  -d '{"model": "/path/to/model.gguf", "gpu_layers": 32}'
+  -d '{"backend_type": "llama_cpp", "backend_options": {"model": "/path/to/model.gguf", "gpu_layers": 32}}'
+
+# Create MLX instance (macOS)
+curl -X POST localhost:8080/api/v1/instances/my-mlx-model \
+  -H "Authorization: Bearer your-key" \
+  -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}'
 
 # Use with OpenAI SDK
 curl -X POST localhost:8080/v1/chat/completions \
@@ -85,16 +89,31 @@ go build -o llamactl ./cmd/server
 
 ## Prerequisites
 
+### Backend Dependencies
+
+**For llama.cpp backend:**
 You need `llama-server` from [llama.cpp](https://github.com/ggml-org/llama.cpp) installed:
 
 ```bash
-# Quick install methods:
 # Homebrew (macOS)
 brew install llama.cpp
 
 # Or build from source - see llama.cpp docs
 ```
 
+**For MLX backend (macOS only):**
+You need MLX-LM installed:
+
+```bash
+# Install via pip (requires Python 3.8+)
+pip install mlx-lm
+
+# Or in a virtual environment (recommended)
+python -m venv mlx-env
+source mlx-env/bin/activate
+pip install mlx-lm
+```
+
 ## Configuration
 
 llamactl works out of the box with sensible defaults.
@@ -106,6 +125,10 @@ server:
   allowed_origins: ["*"]            # Allowed CORS origins (default: all)
   enable_swagger: false             # Enable Swagger UI for API docs
 
+backends:
+  llama_executable: llama-server    # Path to llama-server executable
+  mlx_lm_executable: mlx_lm.server  # Path to mlx_lm.server executable
+
 instances:
   port_range: [8000, 9000]          # Port range for instances
   data_dir: ~/.local/share/llamactl # Data directory (platform-specific, see below)
@@ -115,7 +138,6 @@ instances:
   max_instances: -1                 # Max instances (-1 = unlimited)
   max_running_instances: -1         # Max running instances (-1 = unlimited)
   enable_lru_eviction: true         # Enable LRU eviction for idle instances
-  llama_executable: llama-server    # Path to llama-server executable
  default_auto_restart: true        # Auto-restart new instances by default
   default_max_restarts: 3           # Max restarts for new instances
   default_restart_delay: 5          # Restart delay (seconds) for new instances
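
For readers trying the patched Quick Start, the `/v1/chat/completions` example it mentions is cut off at the hunk boundary above. A minimal sketch of such a request against the MLX instance created in the patch might look like the following; it assumes the instance name is what gets passed as the OpenAI `model` field (per the "route requests by model name" description) and reuses the `your-key` placeholder:

```bash
# Sketch only: OpenAI-compatible chat completion routed to the "my-mlx-model"
# instance created above. Assumes llamactl resolves the "model" field to the
# instance name; adjust host, key, and prompt as needed.
curl -X POST localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer your-key" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-mlx-model",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```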