diff --git a/docs/installation.md b/docs/installation.md
index 9442877..1e4f4ae 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -42,15 +42,10 @@ Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc
 vLLM provides high-throughput distributed serving for LLMs. Install vLLM:
 
 ```bash
-# Install via pip (requires Python 3.8+, GPU required)
-pip install vllm
-
-# Or in a virtual environment (recommended)
+# Install in a virtual environment
 python -m venv vllm-env
 source vllm-env/bin/activate
 pip install vllm
-
-# For production deployments, consider container-based installation
 ```
 
 ## Installation Methods
diff --git a/docs/managing-instances.md b/docs/managing-instances.md
index c298b15..4ac9477 100644
--- a/docs/managing-instances.md
+++ b/docs/managing-instances.md
@@ -78,7 +78,8 @@ curl -X POST http://localhost:8080/api/instances/my-llama-instance \
       "threads": 8,
       "ctx_size": 4096,
       "gpu_layers": 32
-    }
+    },
+    "nodes": ["main"]
   }'
 
 # Create MLX instance (macOS only)
@@ -93,7 +94,8 @@ curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
       "max_tokens": 2048
     },
     "auto_restart": true,
-    "max_restarts": 3
+    "max_restarts": 3,
+    "nodes": ["main"]
   }'
 
 # Create vLLM instance
@@ -112,7 +114,8 @@ curl -X POST http://localhost:8080/api/instances/my-vllm-instance \
       "CUDA_VISIBLE_DEVICES": "0,1",
       "NCCL_DEBUG": "INFO",
       "PYTHONPATH": "/custom/path"
-    }
+    },
+    "nodes": ["main"]
   }'
 
 # Create llama.cpp instance with HuggingFace model
@@ -124,7 +127,8 @@ curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
       "hf_repo": "unsloth/gemma-3-27b-it-GGUF",
       "hf_file": "gemma-3-27b-it-GGUF.gguf",
       "gpu_layers": 32
-    }
+    },
+    "nodes": ["main"]
   }'
 
 # Create instance on specific remote node
@@ -138,6 +142,18 @@ curl -X POST http://localhost:8080/api/instances/remote-llama \
     },
     "nodes": ["worker1"]
   }'
+
+# Create instance on multiple nodes for high availability
+curl -X POST http://localhost:8080/api/instances/multi-node-llama \
+  -H "Content-Type: application/json" \
+  -d '{
+    "backend_type": "llama_cpp",
+    "backend_options": {
+      "model": "/models/llama-7b.gguf",
+      "gpu_layers": 32
+    },
+    "nodes": ["worker1", "worker2", "worker3"]
+  }'
 ```
 
 ## Start Instance
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 3fc562e..15311c0 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -28,13 +28,17 @@ You should see the Llamactl web interface.
 1. Click the "Add Instance" button
 
 2. Fill in the instance configuration:
-   - **Name**: Give your instance a descriptive name
-   - **Backend Type**: Choose from llama.cpp, MLX, or vLLM
-   - **Model**: Model path or huggingface repo
-   - **Additional Options**: Backend-specific parameters
+   - **Name**: Give your instance a descriptive name
+   - **Node**: Select which node to deploy the instance to (defaults to "main" for single-node setups)
+   - **Backend Type**: Choose from llama.cpp, MLX, or vLLM
+   - **Model**: Model path or huggingface repo
+   - **Additional Options**: Backend-specific parameters
 
-!!! tip "Auto-Assignment"
-    Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.
+   !!! tip "Auto-Assignment"
+       Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.
+
+   !!! note "Remote Node Deployment"
+       If you have configured remote nodes in your configuration file, you can select which node to deploy the instance to. This allows you to distribute instances across multiple machines. See the [Configuration](configuration.md#remote-node-configuration) guide for details on setting up remote nodes.
 
 3. Click "Create Instance"
 
@@ -61,7 +65,8 @@ Here are basic example configurations for each backend:
     "threads": 4,
     "ctx_size": 2048,
     "gpu_layers": 32
-  }
+  },
+  "nodes": ["main"]
 }
 ```
 
@@ -74,7 +79,8 @@ Here are basic example configurations for each backend:
     "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
     "temp": 0.7,
     "max_tokens": 2048
-  }
+  },
+  "nodes": ["main"]
 }
 ```
 
@@ -87,7 +93,21 @@ Here are basic example configurations for each backend:
     "model": "microsoft/DialoGPT-medium",
     "tensor_parallel_size": 2,
     "gpu_memory_utilization": 0.9
-  }
+  },
+  "nodes": ["main"]
+}
+```
+
+**Multi-node deployment example:**
+```json
+{
+  "name": "distributed-model",
+  "backend_type": "llama_cpp",
+  "backend_options": {
+    "model": "/path/to/model.gguf",
+    "gpu_layers": 32
+  },
+  "nodes": ["worker1", "worker2"]
 }
 ```
 