From 55765d2020fabb165bd609ec017c13c5f570f770 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Sun, 21 Sep 2025 21:57:36 +0200 Subject: [PATCH] Add vLLM backend support to documentation and update instance management instructions --- README.md | 26 ++++++++++++- docs/getting-started/configuration.md | 3 ++ docs/getting-started/installation.md | 16 ++++++++ docs/getting-started/quick-start.md | 54 +++++++++++++++++++++------ docs/user-guide/managing-instances.md | 24 ++++++++++-- 5 files changed, 107 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2a24520..31c827c 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ### 🔗 Universal Compatibility - **OpenAI API Compatible**: Drop-in replacement - route requests by model name -- **Multi-Backend Support**: Native support for both llama.cpp and MLX (Apple Silicon optimized) +- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM ### 🌐 User-Friendly Interface - **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools) @@ -31,6 +31,7 @@ # 1. Install backend (one-time setup) # For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start # For MLX on macOS: pip install mlx-lm +# For vLLM: pip install vllm # 2. Download and run llamactl LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') @@ -47,7 +48,7 @@ llamactl ### Create and manage instances via web dashboard: 1. Open http://localhost:8080 2. Click "Create Instance" -3. Choose backend type (llama.cpp or MLX) +3. Choose backend type (llama.cpp, MLX, or vLLM) 4. Set model path and backend-specific options 5. Start or stop the instance @@ -63,6 +64,11 @@ curl -X POST localhost:8080/api/v1/instances/my-mlx-model \ -H "Authorization: Bearer your-key" \ -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}' +# Create vLLM instance +curl -X POST localhost:8080/api/v1/instances/my-vllm-model \ + -H "Authorization: Bearer your-key" \ + -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}}' + # Use with OpenAI SDK curl -X POST localhost:8080/v1/chat/completions \ -H "Authorization: Bearer your-key" \ @@ -121,6 +127,21 @@ source mlx-env/bin/activate pip install mlx-lm ``` +**For vLLM backend:** +You need vLLM installed: + +```bash +# Install via pip (requires Python 3.8+, GPU required) +pip install vllm + +# Or in a virtual environment (recommended) +python -m venv vllm-env +source vllm-env/bin/activate +pip install vllm + +# For production deployments, consider container-based installation +``` + ## Configuration llamactl works out of the box with sensible defaults. 
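+
+If you installed vLLM inside a virtual environment (as recommended above), point llamactl at that environment's `vllm` executable. For example, assuming the `vllm-env` environment created during installation (adjust the path to your setup):
+
+```yaml
+backends:
+  vllm_executable: /path/to/vllm-env/bin/vllm  # illustrative path; point to your own environment's vllm binary
+```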
@@ -135,6 +156,7 @@ server: backends: llama_executable: llama-server # Path to llama-server executable mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable + vllm_executable: vllm # Path to vllm executable instances: port_range: [8000, 9000] # Port range for instances diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md index f8003ef..4100492 100644 --- a/docs/getting-started/configuration.md +++ b/docs/getting-started/configuration.md @@ -22,6 +22,7 @@ server: backends: llama_executable: llama-server # Path to llama-server executable mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable + vllm_executable: vllm # Path to vllm executable instances: port_range: [8000, 9000] # Port range for instances @@ -94,11 +95,13 @@ server: backends: llama_executable: "llama-server" # Path to llama-server executable (default: "llama-server") mlx_lm_executable: "mlx_lm.server" # Path to mlx_lm.server executable (default: "mlx_lm.server") + vllm_executable: "vllm" # Path to vllm executable (default: "vllm") ``` **Environment Variables:** - `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable - `LLAMACTL_MLX_LM_EXECUTABLE` - Path to mlx_lm.server executable +- `LLAMACTL_VLLM_EXECUTABLE` - Path to vllm executable ### Instance Configuration diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index a3ceae6..6f52fff 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -37,6 +37,22 @@ pip install mlx-lm Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.) +**For vLLM backend:** + +vLLM provides high-throughput distributed serving for LLMs. Install vLLM: + +```bash +# Install via pip (requires Python 3.8+, GPU required) +pip install vllm + +# Or in a virtual environment (recommended) +python -m venv vllm-env +source vllm-env/bin/activate +pip install vllm + +# For production deployments, consider container-based installation +``` + ## Installation Methods ### Option 1: Download Binary (Recommended) diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 4de1065..20d8aa8 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -29,8 +29,9 @@ You should see the Llamactl web interface. 1. Click the "Add Instance" button 2. Fill in the instance configuration: - **Name**: Give your instance a descriptive name - - **Model Path**: Path to your Llama.cpp model file - - **Additional Options**: Any extra Llama.cpp parameters + - **Backend Type**: Choose from llama.cpp, MLX, or vLLM + - **Model**: Model path or identifier for your chosen backend + - **Additional Options**: Backend-specific parameters 3. 
Click "Create Instance" @@ -43,17 +44,46 @@ Once created, you can: - **View logs** by clicking the logs button - **Stop** the instance when needed -## Example Configuration +## Example Configurations -Here's a basic example configuration for a Llama 2 model: +Here are basic example configurations for each backend: +**llama.cpp backend:** ```json { "name": "llama2-7b", - "model_path": "/path/to/llama-2-7b-chat.gguf", - "options": { + "backend_type": "llama_cpp", + "backend_options": { + "model": "/path/to/llama-2-7b-chat.gguf", "threads": 4, - "context_size": 2048 + "ctx_size": 2048, + "gpu_layers": 32 + } +} +``` + +**MLX backend (macOS only):** +```json +{ + "name": "mistral-mlx", + "backend_type": "mlx_lm", + "backend_options": { + "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit", + "temp": 0.7, + "max_tokens": 2048 + } +} +``` + +**vLLM backend:** +```json +{ + "name": "dialogpt-vllm", + "backend_type": "vllm", + "backend_options": { + "model": "microsoft/DialoGPT-medium", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 } } ``` @@ -66,12 +96,14 @@ You can also manage instances via the REST API: # List all instances curl http://localhost:8080/api/instances -# Create a new instance -curl -X POST http://localhost:8080/api/instances \ +# Create a new llama.cpp instance +curl -X POST http://localhost:8080/api/instances/my-model \ -H "Content-Type: application/json" \ -d '{ - "name": "my-model", - "model_path": "/path/to/model.gguf", + "backend_type": "llama_cpp", + "backend_options": { + "model": "/path/to/model.gguf" + } }' # Start an instance diff --git a/docs/user-guide/managing-instances.md b/docs/user-guide/managing-instances.md index 186670c..e094d42 100644 --- a/docs/user-guide/managing-instances.md +++ b/docs/user-guide/managing-instances.md @@ -1,6 +1,6 @@ # Managing Instances -Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API. +Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API. ## Overview @@ -42,9 +42,11 @@ Each instance is displayed as a card showing: 3. **Choose Backend Type**: - **llama.cpp**: For GGUF models using llama-server - **MLX**: For MLX-optimized models (macOS only) + - **vLLM**: For distributed serving and high-throughput inference 4. Configure model source: - **For llama.cpp**: GGUF model path or HuggingFace repo - **For MLX**: MLX model path or identifier (e.g., `mlx-community/Mistral-7B-Instruct-v0.3-4bit`) + - **For vLLM**: HuggingFace model identifier (e.g., `microsoft/DialoGPT-medium`) 5. Configure optional instance management settings: - **Auto Restart**: Automatically restart instance on failure - **Max Restarts**: Maximum number of restart attempts @@ -54,6 +56,7 @@ Each instance is displayed as a card showing: 6. Configure backend-specific options: - **llama.cpp**: Threads, context size, GPU layers, port, etc. - **MLX**: Temperature, top-p, adapter path, Python environment, etc. + - **vLLM**: Tensor parallel size, GPU memory utilization, quantization, etc. 7. 
Click **"Create"** to save the instance ### Via API @@ -87,6 +90,20 @@ curl -X POST http://localhost:8080/api/instances/my-mlx-instance \ "max_restarts": 3 }' +# Create vLLM instance +curl -X POST http://localhost:8080/api/instances/my-vllm-instance \ + -H "Content-Type: application/json" \ + -d '{ + "backend_type": "vllm", + "backend_options": { + "model": "microsoft/DialoGPT-medium", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 + }, + "auto_restart": true, + "on_demand_start": true + }' + # Create llama.cpp instance with HuggingFace model curl -X POST http://localhost:8080/api/instances/gemma-3-27b \ -H "Content-Type: application/json" \ @@ -179,16 +196,17 @@ curl -X DELETE http://localhost:8080/api/instances/{name} ## Instance Proxy -Llamactl proxies all requests to the underlying backend instances (llama-server or MLX). +Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM). ```bash # Get instance details curl http://localhost:8080/api/instances/{name}/proxy/ ``` -Both backends provide OpenAI-compatible endpoints. Check the respective documentation: +All backends provide OpenAI-compatible endpoints. Check the respective documentation: - [llama-server docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) - [MLX-LM docs](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md) +- [vLLM docs](https://docs.vllm.ai/en/latest/) ### Instance Health