From 52d8c2a082f9512a746974f435ff036392a644f4 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Sun, 26 Oct 2025 13:43:44 +0100 Subject: [PATCH] Simplify README.md --- README.md | 204 ++++++++++++++++++++---------------------------------- 1 file changed, 76 insertions(+), 128 deletions(-) diff --git a/README.md b/README.md index d9fea15..d016634 100644 --- a/README.md +++ b/README.md @@ -4,133 +4,32 @@ **Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.** -## Features - -### 🚀 Easy Model Management -- **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality) -- **On-Demand Instance Start**: Automatically launch instances upon receiving API requests -- **State Persistence**: Ensure instances remain intact across server restarts - -### 🔗 Universal Compatibility -- **OpenAI API Compatible**: Drop-in replacement - route requests by instance name -- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM -- **Docker Support**: Run backends in containers - -### 🌐 User-Friendly Interface -- **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools) -- **API Key Authentication**: Separate keys for management vs inference access - -### ⚡ Smart Operations -- **Instance Monitoring**: Health checks, auto-restart, log management -- **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits -- **Environment Variables**: Set custom environment variables per instance for advanced configuration - -### 🔗 Remote Instance Deployment -- **Remote Node Support**: Deploy instances on remote hosts -- **Central Management**: Manage remote instances from a single dashboard -- **Seamless Routing**: Automatic request routing to remote instances +📚 **[Full Documentation →](https://llamactl.org)** ![Dashboard Screenshot](docs/images/dashboard.png) +## Features + +### 🚀 Easy Model Management +- **Multiple Models Simultaneously**: Run different models at the same time (7B for speed, 70B for quality) +- **Smart Resource Management**: Automatic idle timeout, LRU eviction, and configurable instance limits +- **Web Dashboard**: Modern React UI for managing instances, monitoring health, and viewing logs + +### 🔗 Flexible Integration +- **OpenAI API Compatible**: Drop-in replacement - route requests to different models by instance name +- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM +- **Docker Ready**: Run backends in containers with full GPU support + +### 🌐 Distributed Deployment +- **Remote Instances**: Deploy instances on remote hosts +- **Central Management**: Manage everything from a single dashboard with automatic routing + ## Quick Start -```bash -# 1. Install backend (one-time setup) -# For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start -# For MLX on macOS: pip install mlx-lm -# For vLLM: pip install vllm -# Or use Docker - no local installation required - -# 2. Download and run llamactl -LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') -curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-linux-amd64.tar.gz | tar -xz -sudo mv llamactl /usr/local/bin/ - -# 3. Start the server -llamactl -# Access dashboard at http://localhost:8080 -``` - -## Usage - -### Create and manage instances via web dashboard: -1. Open http://localhost:8080 -2. 
Click "Create Instance" -3. Choose backend type (llama.cpp, MLX, or vLLM) -4. Set model path and backend-specific options -5. Configure environment variables if needed (optional) -6. Start or stop the instance - -### Or use the REST API: -```bash -# Create llama.cpp instance -curl -X POST localhost:8080/api/v1/instances/my-7b-model \ - -H "Authorization: Bearer your-key" \ - -d '{"backend_type": "llama_cpp", "backend_options": {"model": "/path/to/model.gguf", "gpu_layers": 32}}' - -# Create MLX instance (macOS) -curl -X POST localhost:8080/api/v1/instances/my-mlx-model \ - -H "Authorization: Bearer your-key" \ - -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}' - -# Create vLLM instance with environment variables -curl -X POST localhost:8080/api/v1/instances/my-vllm-model \ - -H "Authorization: Bearer your-key" \ - -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}, "environment": {"CUDA_VISIBLE_DEVICES": "0,1", "NCCL_DEBUG": "INFO"}}' - -# Use with OpenAI SDK -curl -X POST localhost:8080/v1/chat/completions \ - -H "Authorization: Bearer your-key" \ - -d '{"model": "my-7b-model", "messages": [{"role": "user", "content": "Hello!"}]}' -``` - -## Installation - -### Option 1: Download Binary (Recommended) - -```bash -# Linux/macOS - Get latest version and download -LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') -curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz -sudo mv llamactl /usr/local/bin/ - -# Or download manually from the releases page: -# https://github.com/lordmathis/llamactl/releases/latest - -# Windows - Download from releases page -``` - -### Option 2: Docker (No local backend installation required) - -```bash -# Clone repository and build Docker images -git clone https://github.com/lordmathis/llamactl.git -cd llamactl -mkdir -p data/llamacpp data/vllm models - -# Build and start llamactl with llama.cpp CUDA backend -docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d - -# Build and start llamactl with vLLM CUDA backend -docker-compose -f docker/docker-compose.yml up llamactl-vllm -d - -# Build from source using multi-stage build -docker build -f docker/Dockerfile.source -t llamactl:source . -``` - -**Features:** CUDA support, automatic latest release installation, no backend dependencies. -**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.). - -For detailed Docker setup and configuration, see the [Installation Guide](docs/getting-started/installation.md). - -### Option 3: Build from Source -Requires Go 1.24+ and Node.js 22+ -```bash -git clone https://github.com/lordmathis/llamactl.git -cd llamactl -cd webui && npm ci && npm run build && cd .. -go build -o llamactl ./cmd/server -``` +1. Install a backend (llama.cpp, MLX, or vLLM) - see [Prerequisites](#prerequisites) below +2. [Download llamactl](#installation) for your platform +3. Run `llamactl` and open http://localhost:8080 +4. Create an instance and start inferencing! 
## Prerequisites @@ -175,9 +74,9 @@ pip install vllm # Or use Docker - no local installation required ``` -## Backend Docker Support +### Docker Support -llamactl can run backends in Docker containers: +llamactl can run backends in Docker containers, eliminating the need for local backend installation: ```yaml backends: @@ -189,9 +88,58 @@ backends: enabled: true ``` -**Requirements:** Docker installed and running. For GPU support: nvidia-docker2 (Linux) or Docker Desktop with GPU support. +## Installation -For detailed Docker configuration options, see the [Configuration Guide](docs/getting-started/configuration.md). +### Option 1: Download Binary (Recommended) + +```bash +# Linux/macOS - Get latest version and download +LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') +curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz +sudo mv llamactl /usr/local/bin/ + +# Or download manually from the releases page: +# https://github.com/lordmathis/llamactl/releases/latest + +# Windows - Download from releases page +``` + +### Option 2: Docker (No local backend installation required) + +```bash +# Clone repository and build Docker images +git clone https://github.com/lordmathis/llamactl.git +cd llamactl +mkdir -p data/llamacpp data/vllm models + +# Build and start llamactl with llama.cpp CUDA backend +docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d + +# Build and start llamactl with vLLM CUDA backend +docker-compose -f docker/docker-compose.yml up llamactl-vllm -d + +# Build from source using multi-stage build +docker build -f docker/Dockerfile.source -t llamactl:source . +``` + +**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.). + +### Option 3: Build from Source +Requires Go 1.24+ and Node.js 22+ +```bash +git clone https://github.com/lordmathis/llamactl.git +cd llamactl +cd webui && npm ci && npm run build && cd .. +go build -o llamactl ./cmd/server +``` + +## Usage + +1. Open http://localhost:8080 +2. Click "Create Instance" +3. Choose backend type (llama.cpp, MLX, or vLLM) +4. Configure your model and options +5. Start the instance and use it with any OpenAI-compatible client ## Configuration @@ -213,7 +161,7 @@ backends: docker: enabled: false image: "ghcr.io/ggml-org/llama.cpp:server" - args: ["run", "--rm", "--network", "host", "--gpus", "all"] + args: ["run", "--rm", "--network", "host", "--gpus", "all", "-v", "~/.local/share/llamactl/llama.cpp:/root/.cache/llama.cpp"] environment: {} # Environment variables for the container vllm: @@ -223,7 +171,7 @@ backends: docker: enabled: false image: "vllm/vllm-openai:latest" - args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"] + args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g", "-v", "~/.local/share/llamactl/huggingface:/root/.cache/huggingface"] environment: {} # Environment variables for the container mlx: