From 55765d2020fabb165bd609ec017c13c5f570f770 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Sun, 21 Sep 2025 21:57:36 +0200 Subject: [PATCH] Add vLLM backend support to documentation and update instance management instructions --- README.md | 26 ++++++++++++- docs/getting-started/configuration.md | 3 ++ docs/getting-started/installation.md | 16 ++++++++ docs/getting-started/quick-start.md | 54 +++++++++++++++++++++------ docs/user-guide/managing-instances.md | 24 ++++++++++-- 5 files changed, 107 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2a24520..31c827c 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ### 🔗 Universal Compatibility - **OpenAI API Compatible**: Drop-in replacement - route requests by model name -- **Multi-Backend Support**: Native support for both llama.cpp and MLX (Apple Silicon optimized) +- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM ### 🌐 User-Friendly Interface - **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools) @@ -31,6 +31,7 @@ # 1. Install backend (one-time setup) # For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start # For MLX on macOS: pip install mlx-lm +# For vLLM: pip install vllm # 2. Download and run llamactl LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') @@ -47,7 +48,7 @@ llamactl ### Create and manage instances via web dashboard: 1. Open http://localhost:8080 2. Click "Create Instance" -3. Choose backend type (llama.cpp or MLX) +3. Choose backend type (llama.cpp, MLX, or vLLM) 4. Set model path and backend-specific options 5. Start or stop the instance @@ -63,6 +64,11 @@ curl -X POST localhost:8080/api/v1/instances/my-mlx-model \ -H "Authorization: Bearer your-key" \ -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}' +# Create vLLM instance +curl -X POST localhost:8080/api/v1/instances/my-vllm-model \ + -H "Authorization: Bearer your-key" \ + -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}}' + # Use with OpenAI SDK curl -X POST localhost:8080/v1/chat/completions \ -H "Authorization: Bearer your-key" \ @@ -121,6 +127,21 @@ source mlx-env/bin/activate pip install mlx-lm ``` +**For vLLM backend:** +You need vLLM installed: + +```bash +# Install via pip (requires Python 3.8+, GPU required) +pip install vllm + +# Or in a virtual environment (recommended) +python -m venv vllm-env +source vllm-env/bin/activate +pip install vllm + +# For production deployments, consider container-based installation +``` + ## Configuration llamactl works out of the box with sensible defaults. 
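+
+If you installed vLLM inside a virtual environment (as recommended above), point llamactl at that environment's `vllm` executable. For example, assuming the `vllm-env` environment created during installation (adjust the path to your setup):
+
+```yaml
+backends:
+  vllm_executable: /path/to/vllm-env/bin/vllm  # illustrative path; point to your own environment's vllm binary
+```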
@@ -135,6 +156,7 @@ server: backends: llama_executable: llama-server # Path to llama-server executable mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable + vllm_executable: vllm # Path to vllm executable instances: port_range: [8000, 9000] # Port range for instances diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md index f8003ef..4100492 100644 --- a/docs/getting-started/configuration.md +++ b/docs/getting-started/configuration.md @@ -22,6 +22,7 @@ server: backends: llama_executable: llama-server # Path to llama-server executable mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable + vllm_executable: vllm # Path to vllm executable instances: port_range: [8000, 9000] # Port range for instances @@ -94,11 +95,13 @@ server: backends: llama_executable: "llama-server" # Path to llama-server executable (default: "llama-server") mlx_lm_executable: "mlx_lm.server" # Path to mlx_lm.server executable (default: "mlx_lm.server") + vllm_executable: "vllm" # Path to vllm executable (default: "vllm") ``` **Environment Variables:** - `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable - `LLAMACTL_MLX_LM_EXECUTABLE` - Path to mlx_lm.server executable +- `LLAMACTL_VLLM_EXECUTABLE` - Path to vllm executable ### Instance Configuration diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index a3ceae6..6f52fff 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -37,6 +37,22 @@ pip install mlx-lm Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.) +**For vLLM backend:** + +vLLM provides high-throughput distributed serving for LLMs. Install vLLM: + +```bash +# Install via pip (requires Python 3.8+, GPU required) +pip install vllm + +# Or in a virtual environment (recommended) +python -m venv vllm-env +source vllm-env/bin/activate +pip install vllm + +# For production deployments, consider container-based installation +``` + ## Installation Methods ### Option 1: Download Binary (Recommended) diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 4de1065..20d8aa8 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -29,8 +29,9 @@ You should see the Llamactl web interface. 1. Click the "Add Instance" button 2. Fill in the instance configuration: - **Name**: Give your instance a descriptive name - - **Model Path**: Path to your Llama.cpp model file - - **Additional Options**: Any extra Llama.cpp parameters + - **Backend Type**: Choose from llama.cpp, MLX, or vLLM + - **Model**: Model path or identifier for your chosen backend + - **Additional Options**: Backend-specific parameters 3. 
Click "Create Instance" @@ -43,17 +44,46 @@ Once created, you can: - **View logs** by clicking the logs button - **Stop** the instance when needed -## Example Configuration +## Example Configurations -Here's a basic example configuration for a Llama 2 model: +Here are basic example configurations for each backend: +**llama.cpp backend:** ```json { "name": "llama2-7b", - "model_path": "/path/to/llama-2-7b-chat.gguf", - "options": { + "backend_type": "llama_cpp", + "backend_options": { + "model": "/path/to/llama-2-7b-chat.gguf", "threads": 4, - "context_size": 2048 + "ctx_size": 2048, + "gpu_layers": 32 + } +} +``` + +**MLX backend (macOS only):** +```json +{ + "name": "mistral-mlx", + "backend_type": "mlx_lm", + "backend_options": { + "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit", + "temp": 0.7, + "max_tokens": 2048 + } +} +``` + +**vLLM backend:** +```json +{ + "name": "dialogpt-vllm", + "backend_type": "vllm", + "backend_options": { + "model": "microsoft/DialoGPT-medium", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 } } ``` @@ -66,12 +96,14 @@ You can also manage instances via the REST API: # List all instances curl http://localhost:8080/api/instances -# Create a new instance -curl -X POST http://localhost:8080/api/instances \ +# Create a new llama.cpp instance +curl -X POST http://localhost:8080/api/instances/my-model \ -H "Content-Type: application/json" \ -d '{ - "name": "my-model", - "model_path": "/path/to/model.gguf", + "backend_type": "llama_cpp", + "backend_options": { + "model": "/path/to/model.gguf" + } }' # Start an instance diff --git a/docs/user-guide/managing-instances.md b/docs/user-guide/managing-instances.md index 186670c..e094d42 100644 --- a/docs/user-guide/managing-instances.md +++ b/docs/user-guide/managing-instances.md @@ -1,6 +1,6 @@ # Managing Instances -Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API. +Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API. ## Overview @@ -42,9 +42,11 @@ Each instance is displayed as a card showing: 3. **Choose Backend Type**: - **llama.cpp**: For GGUF models using llama-server - **MLX**: For MLX-optimized models (macOS only) + - **vLLM**: For distributed serving and high-throughput inference 4. Configure model source: - **For llama.cpp**: GGUF model path or HuggingFace repo - **For MLX**: MLX model path or identifier (e.g., `mlx-community/Mistral-7B-Instruct-v0.3-4bit`) + - **For vLLM**: HuggingFace model identifier (e.g., `microsoft/DialoGPT-medium`) 5. Configure optional instance management settings: - **Auto Restart**: Automatically restart instance on failure - **Max Restarts**: Maximum number of restart attempts @@ -54,6 +56,7 @@ Each instance is displayed as a card showing: 6. Configure backend-specific options: - **llama.cpp**: Threads, context size, GPU layers, port, etc. - **MLX**: Temperature, top-p, adapter path, Python environment, etc. + - **vLLM**: Tensor parallel size, GPU memory utilization, quantization, etc. 7. 
Click **"Create"** to save the instance ### Via API @@ -87,6 +90,20 @@ curl -X POST http://localhost:8080/api/instances/my-mlx-instance \ "max_restarts": 3 }' +# Create vLLM instance +curl -X POST http://localhost:8080/api/instances/my-vllm-instance \ + -H "Content-Type: application/json" \ + -d '{ + "backend_type": "vllm", + "backend_options": { + "model": "microsoft/DialoGPT-medium", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 + }, + "auto_restart": true, + "on_demand_start": true + }' + # Create llama.cpp instance with HuggingFace model curl -X POST http://localhost:8080/api/instances/gemma-3-27b \ -H "Content-Type: application/json" \ @@ -179,16 +196,17 @@ curl -X DELETE http://localhost:8080/api/instances/{name} ## Instance Proxy -Llamactl proxies all requests to the underlying backend instances (llama-server or MLX). +Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM). ```bash # Get instance details curl http://localhost:8080/api/instances/{name}/proxy/ ``` -Both backends provide OpenAI-compatible endpoints. Check the respective documentation: +All backends provide OpenAI-compatible endpoints. Check the respective documentation: - [llama-server docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) - [MLX-LM docs](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md) +- [vLLM docs](https://docs.vllm.ai/en/latest/) ### Instance Health