Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-05 16:44:22 +00:00)

Merge pull request #73 from lordmathis/refactor/docs

refactor: Update docs structure and improve content clarity
@@ -86,7 +86,7 @@ go install github.com/swaggo/swag/cmd/swag@latest

# Update Swagger comments in pkg/server/handlers.go
# Then regenerate docs
swag init -g cmd/server/main.go -o apidocs
swag init -g cmd/server/main.go
```

## Pull Request Guidelines

README.md (204 changed lines)
@@ -4,133 +4,32 @@

**Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.**

## Features

### 🚀 Easy Model Management
- **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
- **On-Demand Instance Start**: Automatically launch instances upon receiving API requests
- **State Persistence**: Ensure instances remain intact across server restarts

### 🔗 Universal Compatibility
- **OpenAI API Compatible**: Drop-in replacement - route requests by instance name
- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
- **Docker Support**: Run backends in containers

### 🌐 User-Friendly Interface
- **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
- **API Key Authentication**: Separate keys for management vs inference access

### ⚡ Smart Operations
- **Instance Monitoring**: Health checks, auto-restart, log management
- **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
- **Environment Variables**: Set custom environment variables per instance for advanced configuration

### 🔗 Remote Instance Deployment
- **Remote Node Support**: Deploy instances on remote hosts
- **Central Management**: Manage remote instances from a single dashboard
- **Seamless Routing**: Automatic request routing to remote instances
📚 **[Full Documentation →](https://llamactl.org)**

## Features

**🚀 Easy Model Management**
- **Multiple Models Simultaneously**: Run different models at the same time (7B for speed, 70B for quality)
- **Smart Resource Management**: Automatic idle timeout, LRU eviction, and configurable instance limits
- **Web Dashboard**: Modern React UI for managing instances, monitoring health, and viewing logs

**🔗 Flexible Integration**
- **OpenAI API Compatible**: Drop-in replacement - route requests to different models by instance name
- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
- **Docker Ready**: Run backends in containers with full GPU support

**🌐 Distributed Deployment**
- **Remote Instances**: Deploy instances on remote hosts
- **Central Management**: Manage everything from a single dashboard with automatic routing

## Quick Start

```bash
# 1. Install backend (one-time setup)
# For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start
# For MLX on macOS: pip install mlx-lm
# For vLLM: pip install vllm
# Or use Docker - no local installation required

# 2. Download and run llamactl
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-linux-amd64.tar.gz | tar -xz
sudo mv llamactl /usr/local/bin/

# 3. Start the server
llamactl
# Access dashboard at http://localhost:8080
```

## Usage

### Create and manage instances via web dashboard:
1. Open http://localhost:8080
2. Click "Create Instance"
3. Choose backend type (llama.cpp, MLX, or vLLM)
4. Set model path and backend-specific options
5. Configure environment variables if needed (optional)
6. Start or stop the instance

### Or use the REST API:
```bash
# Create llama.cpp instance
curl -X POST localhost:8080/api/v1/instances/my-7b-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "llama_cpp", "backend_options": {"model": "/path/to/model.gguf", "gpu_layers": 32}}'

# Create MLX instance (macOS)
curl -X POST localhost:8080/api/v1/instances/my-mlx-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}'

# Create vLLM instance with environment variables
curl -X POST localhost:8080/api/v1/instances/my-vllm-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}, "environment": {"CUDA_VISIBLE_DEVICES": "0,1", "NCCL_DEBUG": "INFO"}}'

# Use with OpenAI SDK
curl -X POST localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer your-key" \
  -d '{"model": "my-7b-model", "messages": [{"role": "user", "content": "Hello!"}]}'
```

## Installation

### Option 1: Download Binary (Recommended)

```bash
# Linux/macOS - Get latest version and download
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
sudo mv llamactl /usr/local/bin/

# Or download manually from the releases page:
# https://github.com/lordmathis/llamactl/releases/latest

# Windows - Download from releases page
```

### Option 2: Docker (No local backend installation required)

```bash
# Clone repository and build Docker images
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
mkdir -p data/llamacpp data/vllm models

# Build and start llamactl with llama.cpp CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d

# Build and start llamactl with vLLM CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-vllm -d

# Build from source using multi-stage build
docker build -f docker/Dockerfile.source -t llamactl:source .
```

**Features:** CUDA support, automatic latest release installation, no backend dependencies.
**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.).

For detailed Docker setup and configuration, see the [Installation Guide](docs/getting-started/installation.md).

### Option 3: Build from Source
Requires Go 1.24+ and Node.js 22+
```bash
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
cd webui && npm ci && npm run build && cd ..
go build -o llamactl ./cmd/server
```
1. Install a backend (llama.cpp, MLX, or vLLM) - see [Prerequisites](#prerequisites) below
2. [Download llamactl](#installation) for your platform
3. Run `llamactl` and open http://localhost:8080
4. Create an instance and start inferencing!

## Prerequisites

@@ -175,9 +74,9 @@ pip install vllm
# Or use Docker - no local installation required
```

## Backend Docker Support
### Docker Support

llamactl can run backends in Docker containers:
llamactl can run backends in Docker containers, eliminating the need for local backend installation:

```yaml
backends:
@@ -189,9 +88,58 @@ backends:
    enabled: true
```

**Requirements:** Docker installed and running. For GPU support: nvidia-docker2 (Linux) or Docker Desktop with GPU support.
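
Before enabling a CUDA-based backend container, it can help to confirm that Docker can actually see the GPU. A minimal check (the CUDA image tag below is only an example, not something llamactl requires):

```bash
# Should print the GPU table if nvidia-docker2 / Docker GPU support is set up correctly
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```
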
## Installation

For detailed Docker configuration options, see the [Configuration Guide](docs/getting-started/configuration.md).
### Option 1: Download Binary (Recommended)

```bash
# Linux/macOS - Get latest version and download
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
sudo mv llamactl /usr/local/bin/

# Or download manually from the releases page:
# https://github.com/lordmathis/llamactl/releases/latest

# Windows - Download from releases page
```

### Option 2: Docker (No local backend installation required)

```bash
# Clone repository and build Docker images
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
mkdir -p data/llamacpp data/vllm models

# Build and start llamactl with llama.cpp CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d

# Build and start llamactl with vLLM CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-vllm -d

# Build from source using multi-stage build
docker build -f docker/Dockerfile.source -t llamactl:source .
```

**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.).

### Option 3: Build from Source
Requires Go 1.24+ and Node.js 22+
```bash
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
cd webui && npm ci && npm run build && cd ..
go build -o llamactl ./cmd/server
```

## Usage

1. Open http://localhost:8080
2. Click "Create Instance"
3. Choose backend type (llama.cpp, MLX, or vLLM)
4. Configure your model and options (ports and API keys are auto-assigned)
5. Start the instance and use it with any OpenAI-compatible client

## Configuration

@@ -213,7 +161,7 @@ backends:
    docker:
      enabled: false
      image: "ghcr.io/ggml-org/llama.cpp:server"
      args: ["run", "--rm", "--network", "host", "--gpus", "all"]
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "-v", "~/.local/share/llamactl/llama.cpp:/root/.cache/llama.cpp"]
      environment: {}  # Environment variables for the container

  vllm:
@@ -223,7 +171,7 @@ backends:
    docker:
      enabled: false
      image: "vllm/vllm-openai:latest"
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g", "-v", "~/.local/share/llamactl/huggingface:/root/.cache/huggingface"]
      environment: {}  # Environment variables for the container

  mlx:

@@ -22,6 +22,9 @@ var buildTime string = "unknown"
// @license.name MIT License
// @license.url https://opensource.org/license/mit/
// @basePath /api/v1
// @securityDefinitions.apikey ApiKeyAuth
// @in header
// @name X-API-Key
func main() {

// --version flag to print the version

@@ -33,7 +33,7 @@ RUN go mod download
# Copy source code
COPY cmd/ ./cmd/
COPY pkg/ ./pkg/
COPY apidocs/ ./apidocs/
COPY docs/ ./docs/
COPY webui/webui.go ./webui/

# Copy built webui from webui-builder

@@ -1,5 +1,6 @@
mkdocs-material==9.5.3
mkdocs==1.5.3
pymdown-extensions==10.7
mkdocs-git-revision-date-localized-plugin==1.2.4
mike==2.0.0
mkdocs-material==9.6.22
mkdocs==1.6.1
pymdown-extensions==10.16.1
mkdocs-git-revision-date-localized-plugin==1.4.7
mike==2.1.3
neoteroi-mkdocs==1.1.3

docs/api-reference.md (new file, 1 line)
@@ -0,0 +1 @@
[OAD(swagger.yaml)]
@@ -80,7 +80,7 @@ nodes: # Node configuration for multi-node deployment

### Configuration File Locations

Configuration files are searched in the following locations (in order of precedence):
Configuration files are searched in the following locations (in order of precedence, first found is used):

**Linux:**
- `./llamactl.yaml` or `./config.yaml` (current directory)

docs/css/css-v1.1.3.css (new file, 1814 lines)
File diff suppressed because it is too large.
@@ -1,190 +0,0 @@
# Quick Start

This guide will help you get Llamactl up and running in just a few minutes.

## Step 1: Start Llamactl

Start the Llamactl server:

```bash
llamactl
```

By default, Llamactl will start on `http://localhost:8080`.

## Step 2: Access the Web UI

Open your web browser and navigate to:

```
http://localhost:8080
```

Login with the management API key. By default it is generated during server startup. Copy it from the terminal output.

You should see the Llamactl web interface.

## Step 3: Create Your First Instance

1. Click the "Add Instance" button
2. Fill in the instance configuration:
    - **Name**: Give your instance a descriptive name
    - **Backend Type**: Choose from llama.cpp, MLX, or vLLM
    - **Model**: Model path or identifier for your chosen backend
    - **Additional Options**: Backend-specific parameters

3. Click "Create Instance"

## Step 4: Start Your Instance

Once created, you can:

- **Start** the instance by clicking the start button
- **Monitor** its status in real-time
- **View logs** by clicking the logs button
- **Stop** the instance when needed

## Example Configurations

Here are basic example configurations for each backend:

**llama.cpp backend:**
```json
{
  "name": "llama2-7b",
  "backend_type": "llama_cpp",
  "backend_options": {
    "model": "/path/to/llama-2-7b-chat.gguf",
    "threads": 4,
    "ctx_size": 2048,
    "gpu_layers": 32
  }
}
```

**MLX backend (macOS only):**
```json
{
  "name": "mistral-mlx",
  "backend_type": "mlx_lm",
  "backend_options": {
    "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "temp": 0.7,
    "max_tokens": 2048
  }
}
```

**vLLM backend:**
```json
{
  "name": "dialogpt-vllm",
  "backend_type": "vllm",
  "backend_options": {
    "model": "microsoft/DialoGPT-medium",
    "tensor_parallel_size": 2,
    "gpu_memory_utilization": 0.9
  }
}
```

## Docker Support

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a `docker` section to that backend in your YAML configuration file (e.g. `config.yaml`) as shown below:

```yaml
backends:
  vllm:
    command: "vllm"
    args: ["serve"]
    docker:
      enabled: true
      image: "vllm/vllm-openai:latest"
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
```

## Using the API

You can also manage instances via the REST API:

```bash
# List all instances
curl http://localhost:8080/api/instances

# Create a new llama.cpp instance
curl -X POST http://localhost:8080/api/instances/my-model \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/path/to/model.gguf"
    }
  }'

# Start an instance
curl -X POST http://localhost:8080/api/instances/my-model/start
```

## OpenAI Compatible API

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

### Chat Completions

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-model",
    "messages": [
      {
        "role": "user",
        "content": "Hello! Can you help me write a Python function?"
      }
    ],
    "max_tokens": 150,
    "temperature": 0.7
  }'
```

### Using with Python OpenAI Client

You can also use the official OpenAI Python client:

```python
from openai import OpenAI

# Point the client to your Llamactl server
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="not-needed"  # Llamactl doesn't require API keys by default
)

# Create a chat completion
response = client.chat.completions.create(
    model="my-model",  # Use the name of your instance
    messages=[
        {"role": "user", "content": "Explain quantum computing in simple terms"}
    ],
    max_tokens=200,
    temperature=0.7
)

print(response.choices[0].message.content)
```

### List Available Models

Get a list of running instances (models) in OpenAI-compatible format:

```bash
curl http://localhost:8080/v1/models
```

## Next Steps

- Manage instances [Managing Instances](../user-guide/managing-instances.md)
- Explore the [API Reference](../user-guide/api-reference.md)
- Configure advanced settings in the [Configuration](configuration.md) guide

Binary file not shown. Before: 69 KiB, After: 66 KiB (screenshot updated).
Binary file not shown. Before: 31 KiB, After: 45 KiB (screenshot updated).
@@ -14,20 +14,20 @@ Welcome to the Llamactl documentation!

## Quick Links

- [Installation Guide](getting-started/installation.md) - Get Llamactl up and running
- [Configuration Guide](getting-started/configuration.md) - Detailed configuration options
- [Quick Start](getting-started/quick-start.md) - Your first steps with Llamactl
- [Managing Instances](user-guide/managing-instances.md) - Instance lifecycle management
- [API Reference](user-guide/api-reference.md) - Complete API documentation
- [Installation Guide](installation.md) - Get Llamactl up and running
- [Configuration Guide](configuration.md) - Detailed configuration options
- [Quick Start](quick-start.md) - Your first steps with Llamactl
- [Managing Instances](managing-instances.md) - Instance lifecycle management
- [API Reference](api-reference.md) - Complete API documentation

## Getting Help

If you need help or have questions:

- Check the [Troubleshooting](user-guide/troubleshooting.md) guide
- Check the [Troubleshooting](troubleshooting.md) guide
- Visit the [GitHub repository](https://github.com/lordmathis/llamactl)
- Review the [Configuration Guide](getting-started/configuration.md) for advanced settings
- Review the [Configuration Guide](configuration.md) for advanced settings

## License

@@ -42,15 +42,10 @@ Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc
vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

```bash
# Install via pip (requires Python 3.8+, GPU required)
pip install vllm

# Or in a virtual environment (recommended)
# Install in a virtual environment
python -m venv vllm-env
source vllm-env/bin/activate
pip install vllm

# For production deployments, consider container-based installation
```

## Installation Methods
@@ -82,7 +77,7 @@ llamactl provides Dockerfiles for creating Docker images with backends pre-insta

**Note:** These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at [llama.cpp Docker docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md). For vLLM, check [vLLM docs](https://docs.vllm.ai/en/v0.6.5/serving/deploying_with_docker.html).

#### Using Docker Compose
**Using Docker Compose**

```bash
# Clone the repository
@@ -103,9 +98,9 @@ Access the dashboard at:
- llamactl with llama.cpp: http://localhost:8080
- llamactl with vLLM: http://localhost:8081

#### Using Docker Build and Run
**Using Docker Build and Run**

**llamactl with llama.cpp CUDA:**
1. llamactl with llama.cpp CUDA:
```bash
docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .
docker run -d \
@@ -116,7 +111,7 @@ docker run -d \
  llamactl:llamacpp-cuda
```

**llamactl with vLLM CUDA:**
2. llamactl with vLLM CUDA:
```bash
docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .
docker run -d \
@@ -127,7 +122,7 @@ docker run -d \
  llamactl:vllm-cuda
```

**llamactl built from source:**
3. llamactl built from source:
```bash
docker build -f docker/Dockerfile.source -t llamactl:source .
docker run -d \
@@ -9,13 +9,17 @@ Llamactl provides two ways to manage instances:
- **Web UI**: Accessible at `http://localhost:8080` with an intuitive dashboard
- **REST API**: Programmatic access for automation and integration

### Authentication

If authentication is enabled:
Llamactl uses a **Management API Key** to authenticate requests to the management API (creating, starting, stopping instances). All curl examples below use `<token>` as a placeholder - replace this with your actual Management API Key.

By default, authentication is required. If you don't configure a management API key in your configuration file, llamactl will auto-generate one and print it to the terminal on startup. See the [Configuration](configuration.md) guide for details.

For Web UI access:
1. Navigate to the web UI
2. Enter your credentials
2. Enter your Management API Key
3. Bearer token is stored for the session

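As a convenience when following the curl examples below, you can keep the Management API Key in a shell variable and reuse it in each request. This is only a sketch; the variable name is arbitrary and the key value comes from your startup output or configuration file:

```bash
# Store the Management API Key once, then reuse it in the examples below
export LLAMACTL_MANAGEMENT_KEY="sk-management-..."  # placeholder value

# Example: list all instances using the stored key
curl http://localhost:8080/api/v1/instances \
  -H "Authorization: Bearer ${LLAMACTL_MANAGEMENT_KEY}"
```
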
### Theme Support
@@ -33,9 +37,9 @@ Each instance is displayed as a card showing:

## Create Instance

### Via Web UI
**Via Web UI**

1. Click the **"Create Instance"** button on the dashboard
2. Enter a unique **Name** for your instance (only required field)
@@ -59,14 +63,19 @@ Each instance is displayed as a card showing:
   - **llama.cpp**: Threads, context size, GPU layers, port, etc.
   - **MLX**: Temperature, top-p, adapter path, Python environment, etc.
   - **vLLM**: Tensor parallel size, GPU memory utilization, quantization, etc.

!!! tip "Auto-Assignment"
    Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.

8. Click **"Create"** to save the instance

### Via API
**Via API**

```bash
# Create llama.cpp instance with local model file
curl -X POST http://localhost:8080/api/instances/my-llama-instance \
curl -X POST http://localhost:8080/api/v1/instances/my-llama-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
@@ -74,12 +83,14 @@ curl -X POST http://localhost:8080/api/instances/my-llama-instance \
      "threads": 8,
      "ctx_size": 4096,
      "gpu_layers": 32
    }
    },
    "nodes": ["main"]
  }'

# Create MLX instance (macOS only)
curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
curl -X POST http://localhost:8080/api/v1/instances/my-mlx-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "mlx_lm",
    "backend_options": {
@@ -89,12 +100,14 @@ curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
      "max_tokens": 2048
    },
    "auto_restart": true,
    "max_restarts": 3
    "max_restarts": 3,
    "nodes": ["main"]
  }'

# Create vLLM instance
curl -X POST http://localhost:8080/api/instances/my-vllm-instance \
curl -X POST http://localhost:8080/api/v1/instances/my-vllm-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
@@ -108,24 +121,28 @@ curl -X POST http://localhost:8080/api/instances/my-vllm-instance \
      "CUDA_VISIBLE_DEVICES": "0,1",
      "NCCL_DEBUG": "INFO",
      "PYTHONPATH": "/custom/path"
    }
    },
    "nodes": ["main"]
  }'

# Create llama.cpp instance with HuggingFace model
curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
curl -X POST http://localhost:8080/api/v1/instances/gemma-3-27b \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "hf_repo": "unsloth/gemma-3-27b-it-GGUF",
      "hf_file": "gemma-3-27b-it-GGUF.gguf",
      "gpu_layers": 32
    }
    },
    "nodes": ["main"]
  }'

# Create instance on specific remote node
curl -X POST http://localhost:8080/api/instances/remote-llama \
curl -X POST http://localhost:8080/api/v1/instances/remote-llama \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
@@ -134,46 +151,62 @@ curl -X POST http://localhost:8080/api/instances/remote-llama \
    },
    "nodes": ["worker1"]
  }'

# Create instance on multiple nodes for high availability
curl -X POST http://localhost:8080/api/v1/instances/multi-node-llama \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/models/llama-7b.gguf",
      "gpu_layers": 32
    },
    "nodes": ["worker1", "worker2", "worker3"]
  }'
```

## Start Instance

### Via Web UI
**Via Web UI**
1. Click the **"Start"** button on an instance card
2. Watch the status change to "Unknown"
3. Monitor progress in the logs
4. Instance status changes to "Ready" when ready

### Via API
**Via API**
```bash
curl -X POST http://localhost:8080/api/instances/{name}/start
curl -X POST http://localhost:8080/api/v1/instances/{name}/start \
  -H "Authorization: Bearer <token>"
```

## Stop Instance

### Via Web UI
**Via Web UI**
1. Click the **"Stop"** button on an instance card
2. Instance gracefully shuts down

### Via API
**Via API**
```bash
curl -X POST http://localhost:8080/api/instances/{name}/stop
curl -X POST http://localhost:8080/api/v1/instances/{name}/stop \
  -H "Authorization: Bearer <token>"
```

## Edit Instance

### Via Web UI
**Via Web UI**
1. Click the **"Edit"** button on an instance card
2. Modify settings in the configuration dialog
3. Changes require instance restart to take effect
4. Click **"Update & Restart"** to apply changes

### Via API
**Via API**
Modify instance settings:

```bash
curl -X PUT http://localhost:8080/api/instances/{name} \
curl -X PUT http://localhost:8080/api/v1/instances/{name} \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_options": {
      "threads": 8,
@@ -188,29 +221,31 @@ curl -X PUT http://localhost:8080/api/instances/{name} \

## View Logs

### Via Web UI
**Via Web UI**

1. Click the **"Logs"** button on any instance card
2. Real-time log viewer opens

### Via API
**Via API**
Check instance status in real-time:

```bash
# Get instance details
curl http://localhost:8080/api/instances/{name}/logs
# Get instance logs
curl http://localhost:8080/api/v1/instances/{name}/logs \
  -H "Authorization: Bearer <token>"
```

## Delete Instance

### Via Web UI
**Via Web UI**
1. Click the **"Delete"** button on an instance card
2. Only stopped instances can be deleted
3. Confirm deletion in the dialog

### Via API
**Via API**
```bash
curl -X DELETE http://localhost:8080/api/instances/{name}
curl -X DELETE http://localhost:8080/api/v1/instances/{name} \
  -H "Authorization: Bearer <token>"
```

## Instance Proxy
@@ -218,8 +253,9 @@ curl -X DELETE http://localhost:8080/api/instances/{name}
Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

```bash
# Get instance details
curl http://localhost:8080/api/instances/{name}/proxy/
# Proxy requests to the instance
curl http://localhost:8080/api/v1/instances/{name}/proxy/ \
  -H "Authorization: Bearer <token>"
```

All backends provide OpenAI-compatible endpoints. Check the respective documentation:
@@ -229,15 +265,16 @@ All backends provide OpenAI-compatible endpoints. Check the respective documenta

### Instance Health

#### Via Web UI
**Via Web UI**

1. The health status badge is displayed on each instance card

#### Via API
**Via API**

Check the health status of your instances:

```bash
curl http://localhost:8080/api/instances/{name}/proxy/health
curl http://localhost:8080/api/v1/instances/{name}/proxy/health \
  -H "Authorization: Bearer <token>"
```

docs/quick-start.md (new file, 263 lines)
@@ -0,0 +1,263 @@
# Quick Start

This guide will help you get Llamactl up and running in just a few minutes.

**Before you begin:** Ensure you have at least one backend installed (llama.cpp, MLX, or vLLM). See the [Installation Guide](installation.md#prerequisites) for backend setup.

## Core Concepts

Before you start, let's clarify a few key terms:

- **Instance**: A running backend server that serves a specific model. Each instance has a unique name and runs independently.
- **Backend**: The inference engine that actually runs the model (llama.cpp, MLX, or vLLM). You need at least one backend installed before creating instances.
- **Node**: In multi-machine setups, a node represents one machine. Most users will just use the default "main" node for single-machine deployments.
- **Proxy Architecture**: Llamactl acts as a proxy in front of your instances. You make requests to llamactl (e.g., `http://localhost:8080/v1/chat/completions`), and it routes them to the appropriate backend instance. This means you don't need to track individual instance ports or endpoints. A minimal example of this routing is sketched after this list.

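As a minimal sketch of this proxy behaviour (the instance names `my-llama` and `my-mlx` are hypothetical, and authentication is covered in the next section), note that both requests below go to the same llamactl address; only the `model` field selects the instance:

```bash
# llamactl routes each request to the instance whose name matches "model"
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer <inference-api-key>" \
  -d '{"model": "my-llama", "messages": [{"role": "user", "content": "Hello!"}]}'

curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer <inference-api-key>" \
  -d '{"model": "my-mlx", "messages": [{"role": "user", "content": "Hello!"}]}'
```
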
## Authentication

Llamactl uses two types of API keys:

- **Management API Key**: Used to authenticate with the Llamactl management API (creating, starting, stopping instances).
- **Inference API Key**: Used to authenticate requests to the OpenAI-compatible endpoints (`/v1/chat/completions`, `/v1/completions`, etc.).

By default, authentication is required. If you don't configure these keys in your configuration file, llamactl will auto-generate them and print them to the terminal on startup. You can also configure custom keys or disable authentication entirely in the [Configuration](configuration.md) guide.

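A minimal sketch of where each key is used (the key values below are placeholders matching the startup output format):

```bash
# Management API Key: management endpoints under /api/v1
curl http://localhost:8080/api/v1/instances \
  -H "Authorization: Bearer sk-management-..."

# Inference API Key: OpenAI-compatible endpoints under /v1
curl http://localhost:8080/v1/models \
  -H "Authorization: Bearer sk-inference-..."
```
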
## Start Llamactl

Start the Llamactl server:

```bash
llamactl
```

```
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚠️  MANAGEMENT AUTHENTICATION REQUIRED
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔑 Generated Management API Key:

sk-management-...

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚠️  INFERENCE AUTHENTICATION REQUIRED
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔑 Generated Inference API Key:

sk-inference-...

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚠️  IMPORTANT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• These keys are auto-generated and will change on restart
• For production, add explicit keys to your configuration
• Copy these keys before they disappear from the terminal
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Llamactl server listening on 0.0.0.0:8080
```

Copy the **Management** and **Inference** API Keys from the terminal - you'll need them to access the web UI and make inference requests.

By default, Llamactl will start on `http://localhost:8080`.

## Access the Web UI

Open your web browser and navigate to:

```
http://localhost:8080
```

Login with the management API key from the terminal output.

You should see the Llamactl web interface.

## Create Your First Instance

1. Click the "Add Instance" button
2. Fill in the instance configuration:
    - **Name**: Give your instance a descriptive name
    - **Node**: Select which node to deploy the instance to (defaults to "main" for single-node setups)
    - **Backend Type**: Choose from llama.cpp, MLX, or vLLM
    - **Model**: Model path or huggingface repo
    - **Additional Options**: Backend-specific parameters

!!! tip "Auto-Assignment"
    Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.

!!! note "Remote Node Deployment"
    If you have configured remote nodes in your configuration file, you can select which node to deploy the instance to. This allows you to distribute instances across multiple machines. See the [Configuration](configuration.md#remote-node-configuration) guide for details on setting up remote nodes.

3. Click "Create Instance"

## Start Your Instance

Once created, you can:

- **Start** the instance by clicking the start button
- **Monitor** its status in real-time
- **View logs** by clicking the logs button
- **Stop** the instance when needed

## Example Configurations

Here are basic example configurations for each backend:

**llama.cpp backend:**
```json
{
  "name": "llama2-7b",
  "backend_type": "llama_cpp",
  "backend_options": {
    "model": "/path/to/llama-2-7b-chat.gguf",
    "threads": 4,
    "ctx_size": 2048,
    "gpu_layers": 32
  },
  "nodes": ["main"]
}
```

**MLX backend (macOS only):**
```json
{
  "name": "mistral-mlx",
  "backend_type": "mlx_lm",
  "backend_options": {
    "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "temp": 0.7,
    "max_tokens": 2048
  },
  "nodes": ["main"]
}
```

**vLLM backend:**
```json
{
  "name": "dialogpt-vllm",
  "backend_type": "vllm",
  "backend_options": {
    "model": "microsoft/DialoGPT-medium",
    "tensor_parallel_size": 2,
    "gpu_memory_utilization": 0.9
  },
  "nodes": ["main"]
}
```

**Remote node deployment example:**
```json
{
  "name": "distributed-model",
  "backend_type": "llama_cpp",
  "backend_options": {
    "model": "/path/to/model.gguf",
    "gpu_layers": 32
  },
  "nodes": ["worker1"]
}
```

## Docker Support

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a `docker` section to that backend in your YAML configuration file (e.g. `config.yaml`) as shown below:

```yaml
backends:
  vllm:
    command: "vllm"
    args: ["serve"]
    docker:
      enabled: true
      image: "vllm/vllm-openai:latest"
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
```

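If Docker is enabled for a backend as shown above, each started instance runs that backend inside a container. As a quick sanity check (exact container names are managed by llamactl and may differ, so treat this as a sketch rather than guaranteed output), you can look for the configured image among running containers:

```bash
# After starting a vLLM instance with the Docker config above,
# a container based on the configured image should be running.
docker ps --filter "ancestor=vllm/vllm-openai:latest"
```
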
## Using the API

You can also manage instances via the REST API:

```bash
# List all instances
curl http://localhost:8080/api/v1/instances

# Create a new llama.cpp instance
curl -X POST http://localhost:8080/api/v1/instances/my-model \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/path/to/model.gguf"
    }
  }'

# Start an instance
curl -X POST http://localhost:8080/api/v1/instances/my-model/start
```

## OpenAI Compatible API

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

### Chat Completions

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-model",
    "messages": [
      {
        "role": "user",
        "content": "Hello! Can you help me write a Python function?"
      }
    ],
    "max_tokens": 150,
    "temperature": 0.7
  }'
```

### Using with Python OpenAI Client

You can also use the official OpenAI Python client:

```python
from openai import OpenAI

# Point the client to your Llamactl server
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="your-inference-api-key"  # Use the inference API key from terminal or config
)

# Create a chat completion
response = client.chat.completions.create(
    model="my-model",  # Use the name of your instance
    messages=[
        {"role": "user", "content": "Explain quantum computing in simple terms"}
    ],
    max_tokens=200,
    temperature=0.7
)

print(response.choices[0].message.content)
```

!!! note "API Key"
    If you disabled authentication in your config, you can use any value for `api_key` (e.g., `"not-needed"`). Otherwise, use the inference API key shown in the terminal output on startup.

### List Available Models

Get a list of running instances (models) in OpenAI-compatible format:

```bash
curl http://localhost:8080/v1/models
```

## Next Steps

- Learn how to manage instances in [Managing Instances](managing-instances.md)
- Explore the [API Reference](api-reference.md)
- Configure advanced settings in the [Configuration](configuration.md) guide

File diff suppressed because it is too large.
@@ -1,25 +1,23 @@
|
||||
basePath: /api/v1
|
||||
definitions:
|
||||
backends.BackendType:
|
||||
enum:
|
||||
- llama_cpp
|
||||
- mlx_lm
|
||||
- vllm
|
||||
type: string
|
||||
x-enum-varnames:
|
||||
- BackendTypeLlamaCpp
|
||||
- BackendTypeMlxLm
|
||||
- BackendTypeVllm
|
||||
instance.CreateInstanceOptions:
|
||||
instance.Instance:
|
||||
properties:
|
||||
created:
|
||||
description: Unix timestamp when the instance was created
|
||||
type: integer
|
||||
name:
|
||||
type: string
|
||||
type: object
|
||||
instance.Options:
|
||||
properties:
|
||||
auto_restart:
|
||||
description: Auto restart
|
||||
type: boolean
|
||||
backend_options:
|
||||
additionalProperties: {}
|
||||
environment:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Environment variables
|
||||
type: object
|
||||
backend_type:
|
||||
$ref: '#/definitions/backends.BackendType'
|
||||
idle_timeout:
|
||||
description: Idle timeout
|
||||
type: integer
|
||||
@@ -32,27 +30,10 @@ definitions:
|
||||
description: seconds
|
||||
type: integer
|
||||
type: object
|
||||
instance.InstanceStatus:
|
||||
enum:
|
||||
- 0
|
||||
- 1
|
||||
- 2
|
||||
type: integer
|
||||
x-enum-varnames:
|
||||
- Stopped
|
||||
- Running
|
||||
- Failed
|
||||
instance.Process:
|
||||
server.NodeResponse:
|
||||
properties:
|
||||
created:
|
||||
description: Creation time
|
||||
type: integer
|
||||
name:
|
||||
address:
|
||||
type: string
|
||||
status:
|
||||
allOf:
|
||||
- $ref: '#/definitions/instance.InstanceStatus'
|
||||
description: Status
|
||||
type: object
|
||||
server.OpenAIInstance:
|
||||
properties:
|
||||
@@ -88,7 +69,7 @@ info:
|
||||
title: llamactl API
|
||||
version: "1.0"
|
||||
paths:
|
||||
/backends/llama-cpp/devices:
|
||||
/api/v1/backends/llama-cpp/devices:
|
||||
get:
|
||||
description: Returns a list of available devices for the llama server
|
||||
responses:
|
||||
@@ -104,8 +85,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: List available devices for llama server
|
||||
tags:
|
||||
- backends
|
||||
/backends/llama-cpp/help:
|
||||
- Backends
|
||||
/api/v1/backends/llama-cpp/help:
|
||||
get:
|
||||
description: Returns the help text for the llama server command
|
||||
responses:
|
||||
@@ -121,8 +102,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get help for llama server
|
||||
tags:
|
||||
- backends
|
||||
/backends/llama-cpp/parse-command:
|
||||
- Backends
|
||||
/api/v1/backends/llama-cpp/parse-command:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -140,7 +121,7 @@ paths:
|
||||
"200":
|
||||
description: Parsed options
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
"400":
|
||||
description: Invalid request or command
|
||||
schema:
|
||||
@@ -157,8 +138,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Parse llama-server command
|
||||
tags:
|
||||
- backends
|
||||
/backends/llama-cpp/version:
|
||||
- Backends
|
||||
/api/v1/backends/llama-cpp/version:
|
||||
get:
|
||||
description: Returns the version of the llama server command
|
||||
responses:
|
||||
@@ -174,8 +155,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get version of llama server
|
||||
tags:
|
||||
- backends
|
||||
/backends/mlx/parse-command:
|
||||
- Backends
|
||||
/api/v1/backends/mlx/parse-command:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -193,7 +174,7 @@ paths:
|
||||
"200":
|
||||
description: Parsed options
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
"400":
|
||||
description: Invalid request or command
|
||||
schema:
|
||||
@@ -204,8 +185,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Parse mlx_lm.server command
|
||||
tags:
|
||||
- backends
|
||||
/backends/vllm/parse-command:
|
||||
- Backends
|
||||
/api/v1/backends/vllm/parse-command:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -223,7 +204,7 @@ paths:
|
||||
"200":
|
||||
description: Parsed options
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
"400":
|
||||
description: Invalid request or command
|
||||
schema:
|
||||
@@ -234,8 +215,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Parse vllm serve command
|
||||
tags:
|
||||
- backends
|
||||
/instances:
|
||||
- Backends
|
||||
/api/v1/instances:
|
||||
get:
|
||||
description: Returns a list of all instances managed by the server
|
||||
responses:
|
||||
@@ -243,7 +224,7 @@ paths:
|
||||
description: List of instances
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
type: array
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
@@ -253,8 +234,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: List all instances
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}:
|
||||
- Instances
|
||||
/api/v1/instances/{name}:
|
||||
delete:
|
||||
description: Stops and removes a specific instance by name
|
||||
parameters:
|
||||
@@ -278,7 +259,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Delete an instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
get:
|
||||
description: Returns the details of a specific instance by name
|
||||
parameters:
|
||||
@@ -291,7 +272,7 @@ paths:
|
||||
"200":
|
||||
description: Instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -304,7 +285,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get details of a specific instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -320,12 +301,12 @@ paths:
|
||||
name: options
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
responses:
|
||||
"201":
|
||||
description: Created instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid request body
|
||||
schema:
|
||||
@@ -338,7 +319,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Create and start a new instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
put:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -354,12 +335,12 @@ paths:
|
||||
name: options
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
responses:
|
||||
"200":
|
||||
description: Updated instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -372,8 +353,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Update an instance's configuration
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/logs:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/logs:
|
||||
get:
|
||||
description: Returns the logs from a specific instance by name with optional
|
||||
line limit
|
||||
@@ -404,8 +385,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get logs from a specific instance
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/proxy:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/proxy:
|
||||
get:
|
||||
description: Forwards HTTP requests to the llama-server instance running on
|
||||
a specific port
|
||||
@@ -432,9 +413,10 @@ paths:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to a specific instance
|
||||
summary: Proxy requests to a specific instance, does not autostart instance
|
||||
if stopped
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
post:
|
||||
description: Forwards HTTP requests to the llama-server instance running on
|
||||
a specific port
|
||||
@@ -461,10 +443,11 @@ paths:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to a specific instance
|
||||
summary: Proxy requests to a specific instance, does not autostart instance
|
||||
if stopped
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/restart:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/restart:
|
||||
post:
|
||||
description: Restarts a specific instance by name
|
||||
parameters:
|
||||
@@ -477,7 +460,7 @@ paths:
|
||||
"200":
|
||||
description: Restarted instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -490,8 +473,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Restart a running instance
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/start:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/start:
|
||||
post:
|
||||
description: Starts a specific instance by name
|
||||
parameters:
|
||||
@@ -504,7 +487,7 @@ paths:
|
||||
"200":
|
||||
description: Started instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -517,8 +500,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Start a stopped instance
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/stop:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/stop:
|
||||
post:
|
||||
description: Stops a specific instance by name
|
||||
parameters:
|
||||
@@ -531,7 +514,7 @@ paths:
|
||||
"200":
|
||||
description: Stopped instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -544,7 +527,444 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Stop a running instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
/api/v1/nodes:
|
||||
get:
|
||||
description: Returns a map of all nodes configured in the server (node name
|
||||
-> node config)
|
||||
responses:
|
||||
"200":
|
||||
description: Map of nodes
|
||||
schema:
|
||||
additionalProperties:
|
||||
$ref: '#/definitions/server.NodeResponse'
|
||||
type: object
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: List all configured nodes
|
||||
tags:
|
||||
- Nodes
|
||||
/api/v1/nodes/{name}:
|
||||
get:
|
||||
description: Returns the details of a specific node by name
|
||||
parameters:
|
||||
- description: Node Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
responses:
|
||||
"200":
|
||||
description: Node details
|
||||
schema:
|
||||
$ref: '#/definitions/server.NodeResponse'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
type: string
|
||||
"404":
|
||||
description: Node not found
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get details of a specific node
|
||||
tags:
|
||||
- Nodes
|
||||
/api/v1/version:
|
||||
get:
|
||||
description: Returns the version of the llamactl command
|
||||
responses:
|
||||
"200":
|
||||
description: Version information
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get llamactl version
|
||||
tags:
|
||||
- System
|
||||
/llama-cpp/{name}/:
|
||||
get:
|
||||
description: Proxies requests to the llama.cpp UI for the specified instance
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: query
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- text/html
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied HTML response
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp UI for the instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/apply-template:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/completion:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/detokenize:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/embeddings:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/infill:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/metrics:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/props:
|
||||
get:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/reranking:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/slots:
|
||||
get:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/tokenize:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/v1/:
|
||||
post:
|
||||
consumes:
|
||||
@@ -567,7 +987,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: OpenAI-compatible proxy endpoint
|
||||
tags:
|
||||
- openai
|
||||
- OpenAI
|
||||
/v1/models:
|
||||
get:
|
||||
description: Returns a list of instances in a format compatible with OpenAI
|
||||
@@ -585,22 +1005,10 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: List instances in OpenAI-compatible format
|
||||
tags:
|
||||
- openai
|
||||
/version:
|
||||
get:
|
||||
description: Returns the version of the llamactl command
|
||||
responses:
|
||||
"200":
|
||||
description: Version information
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get llamactl version
|
||||
tags:
|
||||
- version
|
||||
- OpenAI
|
||||
securityDefinitions:
|
||||
ApiKeyAuth:
|
||||
in: header
|
||||
name: X-API-Key
|
||||
type: apiKey
|
||||
swagger: "2.0"
|
||||
@@ -26,62 +26,67 @@ Issues specific to Llamactl deployment and operation.
|
||||
|
||||
## Instance Management Issues
|
||||
|
||||
### Model Loading Failures
|
||||
### Instance Fails to Start
|
||||
|
||||
**Problem:** Instance fails to start with model loading errors
|
||||
|
||||
**Common Solutions:**
|
||||
- **llama-server not found:** Ensure `llama-server` binary is in PATH
|
||||
- **Wrong model format:** Ensure model is in GGUF format
|
||||
- **Insufficient memory:** Use smaller model or reduce context size
|
||||
- **Path issues:** Use absolute paths to model files
|
||||
|
||||
### Memory Issues
|
||||
|
||||
**Problem:** Out of memory errors or system becomes unresponsive
|
||||
**Problem:** Instance fails to start or immediately stops
|
||||
|
||||
**Solutions:**
|
||||
1. **Reduce context size:**
|
||||
```json
|
||||
{
|
||||
"n_ctx": 1024
|
||||
}
|
||||
|
||||
1. **Check instance logs** to see the actual error:
|
||||
```bash
|
||||
curl http://localhost:8080/api/v1/instances/{name}/logs
|
||||
# Or check log files directly
|
||||
tail -f ~/.local/share/llamactl/logs/{instance-name}.log
|
||||
```
|
||||
|
||||
2. **Use quantized models:**
|
||||
- Try Q4_K_M instead of higher precision models
|
||||
- Use smaller model variants (7B instead of 13B)
|
||||
2. **Verify backend is installed:**
|
||||
- **llama.cpp**: Ensure `llama-server` is in PATH
|
||||
- **MLX**: Ensure `mlx-lm` Python package is installed
|
||||
- **vLLM**: Ensure `vllm` Python package is installed
|
||||
|
||||
### GPU Configuration
|
||||
3. **Check model path and format:**
|
||||
- Use absolute paths to model files
|
||||
- Verify model format matches backend (GGUF for llama.cpp, etc.)
|
||||
|
||||
**Problem:** GPU not being used effectively
|
||||
4. **Verify backend command configuration:**
|
||||
- Check that the backend `command` is correctly configured in the global config
|
||||
- For virtual environments, specify the full path to the command (e.g., `/path/to/venv/bin/mlx_lm.server`)
|
||||
- See the [Configuration Guide](configuration.md) for backend configuration details
|
||||
- Test the backend directly (see [Backend-Specific Issues](#backend-specific-issues) below)
|
||||
|
||||
**Solutions:**
|
||||
1. **Configure GPU layers:**
|
||||
```json
|
||||
{
|
||||
"n_gpu_layers": 35
|
||||
}
|
||||
```
|
||||
### Backend-Specific Issues
|
||||
|
||||
### Advanced Instance Issues
|
||||
**Problem:** Model loading, memory, GPU, or performance issues
|
||||
|
||||
**Problem:** Complex model loading, performance, or compatibility issues
|
||||
Most model-specific issues (memory, GPU configuration, performance tuning) are backend-specific and should be resolved by consulting the respective backend documentation:
|
||||
|
||||
Since llamactl uses `llama-server` under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:
|
||||
**llama.cpp:**
|
||||
- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp)
|
||||
- [llama-server README](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
|
||||
|
||||
**Resources:**
|
||||
- **llama.cpp Documentation:** [https://github.com/ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp)
|
||||
- **llama.cpp Issues:** [https://github.com/ggml-org/llama.cpp/issues](https://github.com/ggml-org/llama.cpp/issues)
|
||||
- **llama.cpp Discussions:** [https://github.com/ggml-org/llama.cpp/discussions](https://github.com/ggml-org/llama.cpp/discussions)
|
||||
**MLX:**
|
||||
- [MLX-LM GitHub](https://github.com/ml-explore/mlx-lm)
|
||||
- [MLX-LM Server Guide](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md)
|
||||
|
||||
**vLLM:**
|
||||
- [vLLM Documentation](https://docs.vllm.ai/en/stable/)
|
||||
- [OpenAI Compatible Server](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html)
|
||||
- [vllm serve Command](https://docs.vllm.ai/en/stable/cli/serve.html#vllm-serve)
|
||||
|
||||
**Testing backends directly:**
|
||||
|
||||
Testing your model and configuration directly with the backend helps determine if the issue is with llamactl or the backend itself:
|
||||
|
||||
**Testing directly with llama-server:**
|
||||
```bash
|
||||
# Test your model and parameters directly with llama-server
|
||||
llama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35
|
||||
```
|
||||
# llama.cpp
|
||||
llama-server --model /path/to/model.gguf --port 8081
|
||||
|
||||
This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.
|
||||
# MLX
|
||||
mlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --port 8081
|
||||
|
||||
# vLLM
|
||||
vllm serve microsoft/DialoGPT-medium --port 8081
|
||||
```
|
||||
|
||||
## API and Network Issues
|
||||
|
||||
@@ -1,560 +0,0 @@
|
||||
# API Reference
|
||||
|
||||
Complete reference for the Llamactl REST API.
|
||||
|
||||
## Base URL
|
||||
|
||||
All API endpoints are relative to the base URL:
|
||||
|
||||
```
|
||||
http://localhost:8080/api/v1
|
||||
```
|
||||
|
||||
## Authentication
|
||||
|
||||
Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:
|
||||
|
||||
```bash
|
||||
curl -H "Authorization: Bearer <your-api-key>" \
|
||||
http://localhost:8080/api/v1/instances
|
||||
```
|
||||
|
||||
The server supports two types of API keys:
|
||||
- **Management API Keys**: Required for instance management operations (CRUD operations on instances)
|
||||
- **Inference API Keys**: Required for OpenAI-compatible inference endpoints
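For example, assuming authentication is enabled and using placeholder keys, instance management calls and OpenAI-compatible calls would each use their respective key type:

```bash
# Management key for instance management endpoints
curl -H "Authorization: Bearer <management-api-key>" \
  http://localhost:8080/api/v1/instances

# Inference key for OpenAI-compatible endpoints
curl -H "Authorization: Bearer <inference-api-key>" \
  http://localhost:8080/v1/models
```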
|
||||
|
||||
## System Endpoints
|
||||
|
||||
### Get Llamactl Version
|
||||
|
||||
Get the version information of the llamactl server.
|
||||
|
||||
```http
|
||||
GET /api/v1/version
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```
|
||||
Version: 1.0.0
|
||||
Commit: abc123
|
||||
Build Time: 2024-01-15T10:00:00Z
|
||||
```
|
||||
|
||||
### Get Llama Server Help
|
||||
|
||||
Get help text for the llama-server command.
|
||||
|
||||
```http
|
||||
GET /api/v1/server/help
|
||||
```
|
||||
|
||||
**Response:** Plain text help output from `llama-server --help`
|
||||
|
||||
### Get Llama Server Version
|
||||
|
||||
Get version information of the llama-server binary.
|
||||
|
||||
```http
|
||||
GET /api/v1/server/version
|
||||
```
|
||||
|
||||
**Response:** Plain text version output from `llama-server --version`
|
||||
|
||||
### List Available Devices
|
||||
|
||||
List available devices for llama-server.
|
||||
|
||||
```http
|
||||
GET /api/v1/server/devices
|
||||
```
|
||||
|
||||
**Response:** Plain text device list from `llama-server --list-devices`
|
||||
|
||||
## Instances
|
||||
|
||||
### List All Instances
|
||||
|
||||
Get a list of all instances.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Get Instance Details
|
||||
|
||||
Get detailed information about a specific instance.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Create Instance
|
||||
|
||||
Create and start a new instance.
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Request Body:** JSON object with instance configuration. Common fields include:
|
||||
|
||||
- `backend_type`: Backend type (`llama_cpp`, `mlx_lm`, or `vllm`)
|
||||
- `backend_options`: Backend-specific configuration
|
||||
- `auto_restart`: Enable automatic restart on failure
|
||||
- `max_restarts`: Maximum restart attempts
|
||||
- `restart_delay`: Delay between restarts in seconds
|
||||
- `on_demand_start`: Start instance when receiving requests
|
||||
- `idle_timeout`: Idle timeout in minutes
|
||||
- `environment`: Environment variables as key-value pairs
|
||||
- `nodes`: Array containing a single node name to deploy the instance to (for remote deployments)
|
||||
|
||||
See [Managing Instances](managing-instances.md) for complete configuration options.
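For example, a minimal create request for a llama.cpp instance might look like the following (instance name, model path, and API key are placeholders):

```bash
curl -X POST http://localhost:8080/api/v1/instances/my-model \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <management-api-key>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/models/llama-2-7b.gguf"
    },
    "auto_restart": true
  }'
```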
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Update Instance
|
||||
|
||||
Update an existing instance configuration. See [Managing Instances](managing-instances.md) for available configuration options.
|
||||
|
||||
```http
|
||||
PUT /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Request Body:** JSON object with configuration fields to update.
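For example, a partial update that only changes restart and idle-timeout behavior might look like this (values are illustrative):

```bash
curl -X PUT http://localhost:8080/api/v1/instances/my-model \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <management-api-key>" \
  -d '{
    "auto_restart": false,
    "idle_timeout": 30
  }'
```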
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Delete Instance
|
||||
|
||||
Stop and remove an instance.
|
||||
|
||||
```http
|
||||
DELETE /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Response:** `204 No Content`
|
||||
|
||||
## Instance Operations
|
||||
|
||||
### Start Instance
|
||||
|
||||
Start a stopped instance.
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}/start
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
**Error Responses:**
|
||||
- `409 Conflict`: Maximum number of running instances reached
|
||||
- `500 Internal Server Error`: Failed to start instance
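For example, to start a stopped instance (name and key are placeholders):

```bash
curl -X POST -H "Authorization: Bearer <management-api-key>" \
  http://localhost:8080/api/v1/instances/my-model/start
```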
|
||||
|
||||
### Stop Instance
|
||||
|
||||
Stop a running instance.
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}/stop
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "stopped",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Restart Instance
|
||||
|
||||
Restart an instance (stop then start).
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}/restart
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Get Instance Logs
|
||||
|
||||
Retrieve instance logs.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances/{name}/logs
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
- `lines`: Number of lines to return (default: all lines; use `-1` to explicitly request all lines)
|
||||
|
||||
**Response:** Plain text log output
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl "http://localhost:8080/api/v1/instances/my-instance/logs?lines=100"
|
||||
```
|
||||
|
||||
### Proxy to Instance
|
||||
|
||||
Proxy HTTP requests directly to the llama-server instance.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances/{name}/proxy/*
|
||||
POST /api/v1/instances/{name}/proxy/*
|
||||
```
|
||||
|
||||
This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the `/api/v1/instances/{name}/proxy` prefix and forwards the remaining path to the instance.
|
||||
|
||||
**Example - Check Instance Health:**
|
||||
```bash
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model/proxy/health
|
||||
```
|
||||
|
||||
This forwards the request to `http://instance-host:instance-port/health` on the actual llama-server instance.
|
||||
|
||||
**Error Responses:**
|
||||
- `503 Service Unavailable`: Instance is not running
|
||||
|
||||
## OpenAI-Compatible API
|
||||
|
||||
Llamactl provides OpenAI-compatible endpoints for inference operations.
|
||||
|
||||
### List Models
|
||||
|
||||
List all instances in OpenAI-compatible format.
|
||||
|
||||
```http
|
||||
GET /v1/models
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": "llama2-7b",
|
||||
"object": "model",
|
||||
"created": 1705312200,
|
||||
"owned_by": "llamactl"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Chat Completions, Completions, Embeddings
|
||||
|
||||
All OpenAI-compatible inference endpoints are available:
|
||||
|
||||
```http
|
||||
POST /v1/chat/completions
|
||||
POST /v1/completions
|
||||
POST /v1/embeddings
|
||||
POST /v1/rerank
|
||||
POST /v1/reranking
|
||||
```
|
||||
|
||||
**Request Body:** Standard OpenAI format with `model` field specifying the instance name
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"model": "llama2-7b",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, how are you?"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The server routes requests to the appropriate instance based on the `model` field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see [Managing Instances](managing-instances.md).
|
||||
|
||||
**Error Responses:**
|
||||
- `400 Bad Request`: Invalid request body or missing instance name
|
||||
- `503 Service Unavailable`: Instance is not running and on-demand start is disabled
|
||||
- `409 Conflict`: Cannot start instance due to maximum instances limit
|
||||
|
||||
## Instance Status Values
|
||||
|
||||
Instances can have the following status values:
|
||||
- `stopped`: Instance is not running
|
||||
- `running`: Instance is running and ready to accept requests
|
||||
- `failed`: Instance failed to start or crashed
|
||||
|
||||
## Error Responses
|
||||
|
||||
All endpoints may return error responses in the following format:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": "Error message description"
|
||||
}
|
||||
```
|
||||
|
||||
### Common HTTP Status Codes
|
||||
|
||||
- `200`: Success
|
||||
- `201`: Created
|
||||
- `204`: No Content (successful deletion)
|
||||
- `400`: Bad Request (invalid parameters or request body)
|
||||
- `401`: Unauthorized (missing or invalid API key)
|
||||
- `403`: Forbidden (insufficient permissions)
|
||||
- `404`: Not Found (instance not found)
|
||||
- `409`: Conflict (instance already exists, max instances reached)
|
||||
- `500`: Internal Server Error
|
||||
- `503`: Service Unavailable (instance not running)
|
||||
|
||||
## Examples
|
||||
|
||||
### Complete Instance Lifecycle
|
||||
|
||||
```bash
|
||||
# Create and start instance
|
||||
curl -X POST http://localhost:8080/api/v1/instances/my-model \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-api-key" \
|
||||
-d '{
|
||||
"backend_type": "llama_cpp",
|
||||
"backend_options": {
|
||||
"model": "/models/llama-2-7b.gguf",
|
||||
"gpu_layers": 32
|
||||
},
|
||||
"environment": {
|
||||
"CUDA_VISIBLE_DEVICES": "0",
|
||||
"OMP_NUM_THREADS": "8"
|
||||
}
|
||||
}'
|
||||
|
||||
# Check instance status
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model
|
||||
|
||||
# Get instance logs
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
"http://localhost:8080/api/v1/instances/my-model/logs?lines=50"
|
||||
|
||||
# Use OpenAI-compatible chat completions
|
||||
curl -X POST http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-inference-api-key" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hello!"}
|
||||
],
|
||||
"max_tokens": 100
|
||||
}'
|
||||
|
||||
# Stop instance
|
||||
curl -X POST -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model/stop
|
||||
|
||||
# Delete instance
|
||||
curl -X DELETE -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model
|
||||
```
|
||||
|
||||
### Remote Node Instance Example
|
||||
|
||||
```bash
|
||||
# Create instance on specific remote node
|
||||
curl -X POST http://localhost:8080/api/v1/instances/remote-model \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-api-key" \
|
||||
-d '{
|
||||
"backend_type": "llama_cpp",
|
||||
"backend_options": {
|
||||
"model": "/models/llama-2-7b.gguf",
|
||||
"gpu_layers": 32
|
||||
},
|
||||
"nodes": ["worker1"]
|
||||
}'
|
||||
|
||||
# Check status of remote instance
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/remote-model
|
||||
|
||||
# Use remote instance with OpenAI-compatible API
|
||||
curl -X POST http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-inference-api-key" \
|
||||
-d '{
|
||||
"model": "remote-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hello from remote node!"}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
### Using the Proxy Endpoint
|
||||
|
||||
You can also directly proxy requests to the llama-server instance:
|
||||
|
||||
```bash
|
||||
# Direct proxy to instance (bypasses OpenAI compatibility layer)
|
||||
curl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-api-key" \
|
||||
-d '{
|
||||
"prompt": "Hello, world!",
|
||||
"n_predict": 50
|
||||
}'
|
||||
```
|
||||
|
||||
## Backend-Specific Endpoints
|
||||
|
||||
### Parse Commands
|
||||
|
||||
Llamactl provides endpoints to parse command strings from different backends into instance configuration options.
|
||||
|
||||
#### Parse Llama.cpp Command
|
||||
|
||||
Parse a llama-server command string into instance options.
|
||||
|
||||
```http
|
||||
POST /api/v1/backends/llama-cpp/parse-command
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
```json
|
||||
{
|
||||
"command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"backend_type": "llama_cpp",
|
||||
"llama_server_options": {
|
||||
"model": "/path/to/model.gguf",
|
||||
"ctx_size": 2048,
|
||||
"port": 8080
|
||||
}
|
||||
}
|
||||
```
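Assuming the standard `/api/v1` prefix and a management key, the endpoint can be exercised with curl as a quick sketch:

```bash
curl -X POST http://localhost:8080/api/v1/backends/llama-cpp/parse-command \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <management-api-key>" \
  -d '{
    "command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
  }'
```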
|
||||
|
||||
#### Parse MLX-LM Command
|
||||
|
||||
Parse an MLX-LM server command string into instance options.
|
||||
|
||||
```http
|
||||
POST /api/v1/backends/mlx/parse-command
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
```json
|
||||
{
|
||||
"command": "mlx_lm.server --model /path/to/model --port 8080"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"backend_type": "mlx_lm",
|
||||
"mlx_server_options": {
|
||||
"model": "/path/to/model",
|
||||
"port": 8080
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Parse vLLM Command
|
||||
|
||||
Parse a vLLM serve command string into instance options.
|
||||
|
||||
```http
|
||||
POST /api/v1/backends/vllm/parse-command
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
```json
|
||||
{
|
||||
"command": "vllm serve /path/to/model --port 8080"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"backend_type": "vllm",
|
||||
"vllm_server_options": {
|
||||
"model": "/path/to/model",
|
||||
"port": 8080
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Error Responses for Parse Commands:**
|
||||
- `400 Bad Request`: Invalid request body, empty command, or parse error
|
||||
- `500 Internal Server Error`: Encoding error
|
||||
|
||||
## Auto-Generated Documentation
|
||||
|
||||
The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:
|
||||
|
||||
1. Install the swag tool: `go install github.com/swaggo/swag/cmd/swag@latest`
|
||||
2. Generate docs: `swag init -g cmd/server/main.go -o apidocs`
|
||||
|
||||
## Swagger Documentation
|
||||
|
||||
If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:
|
||||
|
||||
```
|
||||
http://localhost:8080/swagger/
|
||||
```
|
||||
|
||||
This provides a complete interactive interface for testing all API endpoints.
|
||||
23
mkdocs.yml
@@ -25,8 +25,8 @@ theme:
|
||||
name: Switch to light mode
|
||||
features:
|
||||
- navigation.tabs
|
||||
- navigation.sections
|
||||
- navigation.expand
|
||||
- navigation.tabs.sticky
|
||||
- toc.integrate
|
||||
- navigation.top
|
||||
- search.highlight
|
||||
- search.share
|
||||
@@ -49,14 +49,12 @@ markdown_extensions:
|
||||
|
||||
nav:
|
||||
- Home: index.md
|
||||
- Getting Started:
|
||||
- Installation: getting-started/installation.md
|
||||
- Quick Start: getting-started/quick-start.md
|
||||
- Configuration: getting-started/configuration.md
|
||||
- User Guide:
|
||||
- Managing Instances: user-guide/managing-instances.md
|
||||
- API Reference: user-guide/api-reference.md
|
||||
- Troubleshooting: user-guide/troubleshooting.md
|
||||
- Installation: installation.md
|
||||
- Quick Start: quick-start.md
|
||||
- Configuration: configuration.md
|
||||
- Managing Instances: managing-instances.md
|
||||
- API Reference: api-reference.md
|
||||
- Troubleshooting: troubleshooting.md
|
||||
|
||||
plugins:
|
||||
- search
|
||||
@@ -66,6 +64,8 @@ plugins:
|
||||
css_dir: css
|
||||
javascript_dir: js
|
||||
canonical_version: null
|
||||
- neoteroi.mkdocsoad:
|
||||
use_pymdownx: true
|
||||
|
||||
hooks:
|
||||
- docs/readme_sync.py
|
||||
@@ -78,3 +78,6 @@ extra:
|
||||
social:
|
||||
- icon: fontawesome/brands/github
|
||||
link: https://github.com/lordmathis/llamactl
|
||||
|
||||
extra_css:
|
||||
- css/css-v1.1.3.css
|
||||
|
||||
@@ -44,7 +44,7 @@ func (h *Handler) stripLlamaCppPrefix(r *http.Request, instName string) {
|
||||
// LlamaCppUIProxy godoc
|
||||
// @Summary Proxy requests to llama.cpp UI for the instance
|
||||
// @Description Proxies requests to the llama.cpp UI for the specified instance
|
||||
// @Tags backends
|
||||
// @Tags Llama.cpp
|
||||
// @Security ApiKeyAuth
|
||||
// @Produce html
|
||||
// @Param name query string true "Instance Name"
|
||||
@@ -83,14 +83,24 @@ func (h *Handler) LlamaCppUIProxy() http.HandlerFunc {
|
||||
// LlamaCppProxy godoc
|
||||
// @Summary Proxy requests to llama.cpp server instance
|
||||
// @Description Proxies requests to the specified llama.cpp server instance, starting it on-demand if configured
|
||||
// @Tags backends
|
||||
// @Tags Llama.cpp
|
||||
// @Security ApiKeyAuth
|
||||
// @Produce json
|
||||
// @Param name query string true "Instance Name"
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} map[string]any "Proxied response"
|
||||
// @Failure 400 {string} string "Invalid instance"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /llama-cpp/{name}/* [post]
|
||||
// @Router /llama-cpp/{name}/props [get]
|
||||
// @Router /llama-cpp/{name}/slots [get]
|
||||
// @Router /llama-cpp/{name}/apply-template [post]
|
||||
// @Router /llama-cpp/{name}/completion [post]
|
||||
// @Router /llama-cpp/{name}/detokenize [post]
|
||||
// @Router /llama-cpp/{name}/embeddings [post]
|
||||
// @Router /llama-cpp/{name}/infill [post]
|
||||
// @Router /llama-cpp/{name}/metrics [post]
|
||||
// @Router /llama-cpp/{name}/props [post]
|
||||
// @Router /llama-cpp/{name}/reranking [post]
|
||||
// @Router /llama-cpp/{name}/tokenize [post]
|
||||
func (h *Handler) LlamaCppProxy() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -150,7 +160,7 @@ func parseHelper(w http.ResponseWriter, r *http.Request, backend interface {
|
||||
// ParseLlamaCommand godoc
|
||||
// @Summary Parse llama-server command
|
||||
// @Description Parses a llama-server command string into instance options
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
@@ -158,7 +168,7 @@ func parseHelper(w http.ResponseWriter, r *http.Request, backend interface {
|
||||
// @Success 200 {object} instance.Options "Parsed options"
|
||||
// @Failure 400 {object} map[string]string "Invalid request or command"
|
||||
// @Failure 500 {object} map[string]string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/parse-command [post]
|
||||
// @Router /api/v1/backends/llama-cpp/parse-command [post]
|
||||
func (h *Handler) ParseLlamaCommand() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
parsedOptions, ok := parseHelper(w, r, &backends.LlamaServerOptions{})
|
||||
@@ -180,14 +190,14 @@ func (h *Handler) ParseLlamaCommand() http.HandlerFunc {
|
||||
// ParseMlxCommand godoc
|
||||
// @Summary Parse mlx_lm.server command
|
||||
// @Description Parses MLX-LM server command string into instance options
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param request body ParseCommandRequest true "Command to parse"
|
||||
// @Success 200 {object} instance.Options "Parsed options"
|
||||
// @Failure 400 {object} map[string]string "Invalid request or command"
|
||||
// @Router /backends/mlx/parse-command [post]
|
||||
// @Router /api/v1/backends/mlx/parse-command [post]
|
||||
func (h *Handler) ParseMlxCommand() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
parsedOptions, ok := parseHelper(w, r, &backends.MlxServerOptions{})
|
||||
@@ -209,14 +219,14 @@ func (h *Handler) ParseMlxCommand() http.HandlerFunc {
|
||||
// ParseVllmCommand godoc
|
||||
// @Summary Parse vllm serve command
|
||||
// @Description Parses a vLLM serve command string into instance options
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param request body ParseCommandRequest true "Command to parse"
|
||||
// @Success 200 {object} instance.Options "Parsed options"
|
||||
// @Failure 400 {object} map[string]string "Invalid request or command"
|
||||
// @Router /backends/vllm/parse-command [post]
|
||||
// @Router /api/v1/backends/vllm/parse-command [post]
|
||||
func (h *Handler) ParseVllmCommand() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
parsedOptions, ok := parseHelper(w, r, &backends.VllmServerOptions{})
|
||||
@@ -251,12 +261,12 @@ func (h *Handler) executeLlamaServerCommand(flag, errorMsg string) http.HandlerF
|
||||
// LlamaServerHelpHandler godoc
|
||||
// @Summary Get help for llama server
|
||||
// @Description Returns the help text for the llama server command
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "Help text"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/help [get]
|
||||
// @Router /api/v1/backends/llama-cpp/help [get]
|
||||
func (h *Handler) LlamaServerHelpHandler() http.HandlerFunc {
|
||||
return h.executeLlamaServerCommand("--help", "Failed to get help")
|
||||
}
|
||||
@@ -264,12 +274,12 @@ func (h *Handler) LlamaServerHelpHandler() http.HandlerFunc {
|
||||
// LlamaServerVersionHandler godoc
|
||||
// @Summary Get version of llama server
|
||||
// @Description Returns the version of the llama server command
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "Version information"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/version [get]
|
||||
// @Router /api/v1/backends/llama-cpp/version [get]
|
||||
func (h *Handler) LlamaServerVersionHandler() http.HandlerFunc {
|
||||
return h.executeLlamaServerCommand("--version", "Failed to get version")
|
||||
}
|
||||
@@ -277,12 +287,12 @@ func (h *Handler) LlamaServerVersionHandler() http.HandlerFunc {
|
||||
// LlamaServerListDevicesHandler godoc
|
||||
// @Summary List available devices for llama server
|
||||
// @Description Returns a list of available devices for the llama server
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "List of devices"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/devices [get]
|
||||
// @Router /api/v1/backends/llama-cpp/devices [get]
|
||||
func (h *Handler) LlamaServerListDevicesHandler() http.HandlerFunc {
|
||||
return h.executeLlamaServerCommand("--list-devices", "Failed to list devices")
|
||||
}
|
||||
|
||||
@@ -16,12 +16,12 @@ import (
|
||||
// ListInstances godoc
|
||||
// @Summary List all instances
|
||||
// @Description Returns a list of all instances managed by the server
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Success 200 {array} instance.Process "List of instances"
|
||||
// @Success 200 {array} instance.Instance "List of instances"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances [get]
|
||||
// @Router /api/v1/instances [get]
|
||||
func (h *Handler) ListInstances() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
instances, err := h.InstanceManager.ListInstances()
|
||||
@@ -37,16 +37,16 @@ func (h *Handler) ListInstances() http.HandlerFunc {
|
||||
// CreateInstance godoc
|
||||
// @Summary Create and start a new instance
|
||||
// @Description Creates a new instance with the provided configuration options
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Param options body instance.Options true "Instance configuration options"
|
||||
// @Success 201 {object} instance.Process "Created instance details"
|
||||
// @Success 201 {object} instance.Instance "Created instance details"
|
||||
// @Failure 400 {string} string "Invalid request body"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [post]
|
||||
// @Router /api/v1/instances/{name} [post]
|
||||
func (h *Handler) CreateInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -75,14 +75,14 @@ func (h *Handler) CreateInstance() http.HandlerFunc {
|
||||
// GetInstance godoc
|
||||
// @Summary Get details of a specific instance
|
||||
// @Description Returns the details of a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Instance details"
|
||||
// @Success 200 {object} instance.Instance "Instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [get]
|
||||
// @Router /api/v1/instances/{name} [get]
|
||||
func (h *Handler) GetInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -105,16 +105,16 @@ func (h *Handler) GetInstance() http.HandlerFunc {
|
||||
// UpdateInstance godoc
|
||||
// @Summary Update an instance's configuration
|
||||
// @Description Updates the configuration of a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Param options body instance.Options true "Instance configuration options"
|
||||
// @Success 200 {object} instance.Process "Updated instance details"
|
||||
// @Success 200 {object} instance.Instance "Updated instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [put]
|
||||
// @Router /api/v1/instances/{name} [put]
|
||||
func (h *Handler) UpdateInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -143,14 +143,14 @@ func (h *Handler) UpdateInstance() http.HandlerFunc {
|
||||
// StartInstance godoc
|
||||
// @Summary Start a stopped instance
|
||||
// @Description Starts a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Started instance details"
|
||||
// @Success 200 {object} instance.Instance "Started instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/start [post]
|
||||
// @Router /api/v1/instances/{name}/start [post]
|
||||
func (h *Handler) StartInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -179,14 +179,14 @@ func (h *Handler) StartInstance() http.HandlerFunc {
|
||||
// StopInstance godoc
|
||||
// @Summary Stop a running instance
|
||||
// @Description Stops a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Stopped instance details"
|
||||
// @Success 200 {object} instance.Instance "Stopped instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/stop [post]
|
||||
// @Router /api/v1/instances/{name}/stop [post]
|
||||
func (h *Handler) StopInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -209,14 +209,14 @@ func (h *Handler) StopInstance() http.HandlerFunc {
|
||||
// RestartInstance godoc
|
||||
// @Summary Restart a running instance
|
||||
// @Description Restarts a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Restarted instance details"
|
||||
// @Success 200 {object} instance.Instance "Restarted instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/restart [post]
|
||||
// @Router /api/v1/instances/{name}/restart [post]
|
||||
func (h *Handler) RestartInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -239,13 +239,13 @@ func (h *Handler) RestartInstance() http.HandlerFunc {
|
||||
// DeleteInstance godoc
|
||||
// @Summary Delete an instance
|
||||
// @Description Stops and removes a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 204 "No Content"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [delete]
|
||||
// @Router /api/v1/instances/{name} [delete]
|
||||
func (h *Handler) DeleteInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -267,7 +267,7 @@ func (h *Handler) DeleteInstance() http.HandlerFunc {
|
||||
// GetInstanceLogs godoc
|
||||
// @Summary Get logs from a specific instance
|
||||
// @Description Returns the logs from a specific instance by name with optional line limit
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Param lines query string false "Number of lines to retrieve (default: all lines)"
|
||||
@@ -275,7 +275,7 @@ func (h *Handler) DeleteInstance() http.HandlerFunc {
|
||||
// @Success 200 {string} string "Instance logs"
|
||||
// @Failure 400 {string} string "Invalid name format or lines parameter"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/logs [get]
|
||||
// @Router /api/v1/instances/{name}/logs [get]
|
||||
func (h *Handler) GetInstanceLogs() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -310,15 +310,15 @@ func (h *Handler) GetInstanceLogs() http.HandlerFunc {
|
||||
// InstanceProxy godoc
|
||||
// @Summary Proxy requests to a specific instance, does not autostart instance if stopped
|
||||
// @Description Forwards HTTP requests to the llama-server instance running on a specific port
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 "Request successfully proxied to instance"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Failure 503 {string} string "Instance is not running"
|
||||
// @Router /instances/{name}/proxy [get]
|
||||
// @Router /instances/{name}/proxy [post]
|
||||
// @Router /api/v1/instances/{name}/proxy [get]
|
||||
// @Router /api/v1/instances/{name}/proxy [post]
|
||||
func (h *Handler) InstanceProxy() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
inst, err := h.getInstance(r)
|
||||
|
||||
@@ -14,12 +14,12 @@ type NodeResponse struct {
|
||||
// ListNodes godoc
|
||||
// @Summary List all configured nodes
|
||||
// @Description Returns a map of all nodes configured in the server (node name -> node config)
|
||||
// @Tags nodes
|
||||
// @Tags Nodes
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Success 200 {object} map[string]NodeResponse "Map of nodes"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /nodes [get]
|
||||
// @Router /api/v1/nodes [get]
|
||||
func (h *Handler) ListNodes() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
// Convert to sanitized response format (map of name -> NodeResponse)
|
||||
@@ -37,7 +37,7 @@ func (h *Handler) ListNodes() http.HandlerFunc {
|
||||
// GetNode godoc
|
||||
// @Summary Get details of a specific node
|
||||
// @Description Returns the details of a specific node by name
|
||||
// @Tags nodes
|
||||
// @Tags Nodes
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Node Name"
|
||||
@@ -45,7 +45,7 @@ func (h *Handler) ListNodes() http.HandlerFunc {
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 404 {string} string "Node not found"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /nodes/{name} [get]
|
||||
// @Router /api/v1/nodes/{name} [get]
|
||||
func (h *Handler) GetNode() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
|
||||
@@ -25,7 +25,7 @@ type OpenAIInstance struct {
|
||||
// OpenAIListInstances godoc
|
||||
// @Summary List instances in OpenAI-compatible format
|
||||
// @Description Returns a list of instances in a format compatible with OpenAI API
|
||||
// @Tags openai
|
||||
// @Tags OpenAI
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Success 200 {object} OpenAIListInstancesResponse "List of OpenAI-compatible instances"
|
||||
@@ -61,7 +61,7 @@ func (h *Handler) OpenAIListInstances() http.HandlerFunc {
|
||||
// OpenAIProxy godoc
|
||||
// @Summary OpenAI-compatible proxy endpoint
|
||||
// @Description Handles all POST requests to /v1/*, routing to the appropriate instance based on the request body. Requires API key authentication via the `Authorization` header.
|
||||
// @Tags openai
|
||||
// @Tags OpenAI
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produces json
|
||||
|
||||
@@ -8,12 +8,12 @@ import (
|
||||
// VersionHandler godoc
|
||||
// @Summary Get llamactl version
|
||||
// @Description Returns the version of the llamactl command
|
||||
// @Tags version
|
||||
// @Tags System
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "Version information"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /version [get]
|
||||
// @Router /api/v1/version [get]
|
||||
func (h *Handler) VersionHandler() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
versionInfo := fmt.Sprintf("Version: %s\nCommit: %s\nBuild Time: %s\n", h.cfg.Version, h.cfg.CommitHash, h.cfg.BuildTime)
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"github.com/go-chi/cors"
|
||||
httpSwagger "github.com/swaggo/http-swagger"
|
||||
|
||||
_ "llamactl/apidocs"
|
||||
_ "llamactl/docs"
|
||||
"llamactl/webui"
|
||||
)
|
||||
|
||||
|
||||