From 52d8c2a082f9512a746974f435ff036392a644f4 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Sun, 26 Oct 2025 13:43:44 +0100 Subject: [PATCH] Simplify README.md --- README.md | 204 ++++++++++++++++++++---------------------------------- 1 file changed, 76 insertions(+), 128 deletions(-) diff --git a/README.md b/README.md index d9fea15..d016634 100644 --- a/README.md +++ b/README.md @@ -4,133 +4,32 @@ **Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.** -## Features - -### 🚀 Easy Model Management -- **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality) -- **On-Demand Instance Start**: Automatically launch instances upon receiving API requests -- **State Persistence**: Ensure instances remain intact across server restarts - -### 🔗 Universal Compatibility -- **OpenAI API Compatible**: Drop-in replacement - route requests by instance name -- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM -- **Docker Support**: Run backends in containers - -### 🌐 User-Friendly Interface -- **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools) -- **API Key Authentication**: Separate keys for management vs inference access - -### ⚡ Smart Operations -- **Instance Monitoring**: Health checks, auto-restart, log management -- **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits -- **Environment Variables**: Set custom environment variables per instance for advanced configuration - -### 🔗 Remote Instance Deployment -- **Remote Node Support**: Deploy instances on remote hosts -- **Central Management**: Manage remote instances from a single dashboard -- **Seamless Routing**: Automatic request routing to remote instances +📚 **[Full Documentation →](https://llamactl.org)** ![Dashboard Screenshot](docs/images/dashboard.png) +## Features + +### 🚀 Easy Model Management +- **Multiple Models Simultaneously**: Run different models at the same time (7B for speed, 70B for quality) +- **Smart Resource Management**: Automatic idle timeout, LRU eviction, and configurable instance limits +- **Web Dashboard**: Modern React UI for managing instances, monitoring health, and viewing logs + +### 🔗 Flexible Integration +- **OpenAI API Compatible**: Drop-in replacement - route requests to different models by instance name +- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM +- **Docker Ready**: Run backends in containers with full GPU support + +### 🌐 Distributed Deployment +- **Remote Instances**: Deploy instances on remote hosts +- **Central Management**: Manage everything from a single dashboard with automatic routing + ## Quick Start -```bash -# 1. Install backend (one-time setup) -# For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start -# For MLX on macOS: pip install mlx-lm -# For vLLM: pip install vllm -# Or use Docker - no local installation required - -# 2. Download and run llamactl -LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') -curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-linux-amd64.tar.gz | tar -xz -sudo mv llamactl /usr/local/bin/ - -# 3. Start the server -llamactl -# Access dashboard at http://localhost:8080 -``` - -## Usage - -### Create and manage instances via web dashboard: -1. Open http://localhost:8080 -2. 
Click "Create Instance" -3. Choose backend type (llama.cpp, MLX, or vLLM) -4. Set model path and backend-specific options -5. Configure environment variables if needed (optional) -6. Start or stop the instance - -### Or use the REST API: -```bash -# Create llama.cpp instance -curl -X POST localhost:8080/api/v1/instances/my-7b-model \ - -H "Authorization: Bearer your-key" \ - -d '{"backend_type": "llama_cpp", "backend_options": {"model": "/path/to/model.gguf", "gpu_layers": 32}}' - -# Create MLX instance (macOS) -curl -X POST localhost:8080/api/v1/instances/my-mlx-model \ - -H "Authorization: Bearer your-key" \ - -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}' - -# Create vLLM instance with environment variables -curl -X POST localhost:8080/api/v1/instances/my-vllm-model \ - -H "Authorization: Bearer your-key" \ - -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}, "environment": {"CUDA_VISIBLE_DEVICES": "0,1", "NCCL_DEBUG": "INFO"}}' - -# Use with OpenAI SDK -curl -X POST localhost:8080/v1/chat/completions \ - -H "Authorization: Bearer your-key" \ - -d '{"model": "my-7b-model", "messages": [{"role": "user", "content": "Hello!"}]}' -``` - -## Installation - -### Option 1: Download Binary (Recommended) - -```bash -# Linux/macOS - Get latest version and download -LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') -curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz -sudo mv llamactl /usr/local/bin/ - -# Or download manually from the releases page: -# https://github.com/lordmathis/llamactl/releases/latest - -# Windows - Download from releases page -``` - -### Option 2: Docker (No local backend installation required) - -```bash -# Clone repository and build Docker images -git clone https://github.com/lordmathis/llamactl.git -cd llamactl -mkdir -p data/llamacpp data/vllm models - -# Build and start llamactl with llama.cpp CUDA backend -docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d - -# Build and start llamactl with vLLM CUDA backend -docker-compose -f docker/docker-compose.yml up llamactl-vllm -d - -# Build from source using multi-stage build -docker build -f docker/Dockerfile.source -t llamactl:source . -``` - -**Features:** CUDA support, automatic latest release installation, no backend dependencies. -**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.). - -For detailed Docker setup and configuration, see the [Installation Guide](docs/getting-started/installation.md). - -### Option 3: Build from Source -Requires Go 1.24+ and Node.js 22+ -```bash -git clone https://github.com/lordmathis/llamactl.git -cd llamactl -cd webui && npm ci && npm run build && cd .. -go build -o llamactl ./cmd/server -``` +1. Install a backend (llama.cpp, MLX, or vLLM) - see [Prerequisites](#prerequisites) below +2. [Download llamactl](#installation) for your platform +3. Run `llamactl` and open http://localhost:8080 +4. Create an instance and start inferencing! 
## Prerequisites @@ -175,9 +74,9 @@ pip install vllm # Or use Docker - no local installation required ``` -## Backend Docker Support +### Docker Support -llamactl can run backends in Docker containers: +llamactl can run backends in Docker containers, eliminating the need for local backend installation: ```yaml backends: @@ -189,9 +88,58 @@ backends: enabled: true ``` -**Requirements:** Docker installed and running. For GPU support: nvidia-docker2 (Linux) or Docker Desktop with GPU support. +## Installation -For detailed Docker configuration options, see the [Configuration Guide](docs/getting-started/configuration.md). +### Option 1: Download Binary (Recommended) + +```bash +# Linux/macOS - Get latest version and download +LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') +curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz +sudo mv llamactl /usr/local/bin/ + +# Or download manually from the releases page: +# https://github.com/lordmathis/llamactl/releases/latest + +# Windows - Download from releases page +``` + +### Option 2: Docker (No local backend installation required) + +```bash +# Clone repository and build Docker images +git clone https://github.com/lordmathis/llamactl.git +cd llamactl +mkdir -p data/llamacpp data/vllm models + +# Build and start llamactl with llama.cpp CUDA backend +docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d + +# Build and start llamactl with vLLM CUDA backend +docker-compose -f docker/docker-compose.yml up llamactl-vllm -d + +# Build from source using multi-stage build +docker build -f docker/Dockerfile.source -t llamactl:source . +``` + +**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.). + +### Option 3: Build from Source +Requires Go 1.24+ and Node.js 22+ +```bash +git clone https://github.com/lordmathis/llamactl.git +cd llamactl +cd webui && npm ci && npm run build && cd .. +go build -o llamactl ./cmd/server +``` + +## Usage + +1. Open http://localhost:8080 +2. Click "Create Instance" +3. Choose backend type (llama.cpp, MLX, or vLLM) +4. Configure your model and options +5. Start the instance and use it with any OpenAI-compatible client ## Configuration @@ -213,7 +161,7 @@ backends: docker: enabled: false image: "ghcr.io/ggml-org/llama.cpp:server" - args: ["run", "--rm", "--network", "host", "--gpus", "all"] + args: ["run", "--rm", "--network", "host", "--gpus", "all", "-v", "~/.local/share/llamactl/llama.cpp:/root/.cache/llama.cpp"] environment: {} # Environment variables for the container vllm: @@ -223,7 +171,7 @@ backends: docker: enabled: false image: "vllm/vllm-openai:latest" - args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"] + args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g", "-v", "~/.local/share/llamactl/huggingface:/root/.cache/huggingface"] environment: {} # Environment variables for the container mlx: