Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-05 16:44:22 +00:00)

Merge pull request #73 from lordmathis/refactor/docs

refactor: Update docs structure and improve content clarity
@@ -86,7 +86,7 @@ go install github.com/swaggo/swag/cmd/swag@latest

# Update Swagger comments in pkg/server/handlers.go
# Then regenerate docs
swag init -g cmd/server/main.go -o apidocs
swag init -g cmd/server/main.go
```

## Pull Request Guidelines

README.md (204 changed lines)
@@ -4,133 +4,32 @@

**Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.**

## Features

### 🚀 Easy Model Management
- **Multiple Model Serving**: Run different models simultaneously (7B for speed, 70B for quality)
- **On-Demand Instance Start**: Automatically launch instances upon receiving API requests
- **State Persistence**: Ensure instances remain intact across server restarts

### 🔗 Universal Compatibility
- **OpenAI API Compatible**: Drop-in replacement - route requests by instance name
- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
- **Docker Support**: Run backends in containers

### 🌐 User-Friendly Interface
- **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
- **API Key Authentication**: Separate keys for management vs inference access

### ⚡ Smart Operations
- **Instance Monitoring**: Health checks, auto-restart, log management
- **Smart Resource Management**: Idle timeout, LRU eviction, and configurable instance limits
- **Environment Variables**: Set custom environment variables per instance for advanced configuration

### 🔗 Remote Instance Deployment
- **Remote Node Support**: Deploy instances on remote hosts
- **Central Management**: Manage remote instances from a single dashboard
- **Seamless Routing**: Automatic request routing to remote instances
📚 **[Full Documentation →](https://llamactl.org)**

## Features

**🚀 Easy Model Management**
- **Multiple Models Simultaneously**: Run different models at the same time (7B for speed, 70B for quality)
- **Smart Resource Management**: Automatic idle timeout, LRU eviction, and configurable instance limits
- **Web Dashboard**: Modern React UI for managing instances, monitoring health, and viewing logs

**🔗 Flexible Integration**
- **OpenAI API Compatible**: Drop-in replacement - route requests to different models by instance name
- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
- **Docker Ready**: Run backends in containers with full GPU support

**🌐 Distributed Deployment**
- **Remote Instances**: Deploy instances on remote hosts
- **Central Management**: Manage everything from a single dashboard with automatic routing

## Quick Start

```bash
# 1. Install backend (one-time setup)
# For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start
# For MLX on macOS: pip install mlx-lm
# For vLLM: pip install vllm
# Or use Docker - no local installation required

# 2. Download and run llamactl
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-linux-amd64.tar.gz | tar -xz
sudo mv llamactl /usr/local/bin/

# 3. Start the server
llamactl
# Access dashboard at http://localhost:8080
```

## Usage

### Create and manage instances via web dashboard:
1. Open http://localhost:8080
2. Click "Create Instance"
3. Choose backend type (llama.cpp, MLX, or vLLM)
4. Set model path and backend-specific options
5. Configure environment variables if needed (optional)
6. Start or stop the instance

### Or use the REST API:
```bash
# Create llama.cpp instance
curl -X POST localhost:8080/api/v1/instances/my-7b-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "llama_cpp", "backend_options": {"model": "/path/to/model.gguf", "gpu_layers": 32}}'

# Create MLX instance (macOS)
curl -X POST localhost:8080/api/v1/instances/my-mlx-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}'

# Create vLLM instance with environment variables
curl -X POST localhost:8080/api/v1/instances/my-vllm-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}, "environment": {"CUDA_VISIBLE_DEVICES": "0,1", "NCCL_DEBUG": "INFO"}}'

# Use with OpenAI SDK
curl -X POST localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer your-key" \
  -d '{"model": "my-7b-model", "messages": [{"role": "user", "content": "Hello!"}]}'
```

## Installation

### Option 1: Download Binary (Recommended)

```bash
# Linux/macOS - Get latest version and download
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
sudo mv llamactl /usr/local/bin/

# Or download manually from the releases page:
# https://github.com/lordmathis/llamactl/releases/latest

# Windows - Download from releases page
```

### Option 2: Docker (No local backend installation required)

```bash
# Clone repository and build Docker images
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
mkdir -p data/llamacpp data/vllm models

# Build and start llamactl with llama.cpp CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d

# Build and start llamactl with vLLM CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-vllm -d

# Build from source using multi-stage build
docker build -f docker/Dockerfile.source -t llamactl:source .
```

**Features:** CUDA support, automatic latest release installation, no backend dependencies.
**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.).

For detailed Docker setup and configuration, see the [Installation Guide](docs/getting-started/installation.md).

### Option 3: Build from Source
Requires Go 1.24+ and Node.js 22+
```bash
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
cd webui && npm ci && npm run build && cd ..
go build -o llamactl ./cmd/server
```
1. Install a backend (llama.cpp, MLX, or vLLM) - see [Prerequisites](#prerequisites) below
2. [Download llamactl](#installation) for your platform
3. Run `llamactl` and open http://localhost:8080
4. Create an instance and start inferencing!

## Prerequisites

@@ -175,9 +74,9 @@ pip install vllm
# Or use Docker - no local installation required
```

## Backend Docker Support
### Docker Support

llamactl can run backends in Docker containers:
llamactl can run backends in Docker containers, eliminating the need for local backend installation:

```yaml
backends:
@@ -189,9 +88,58 @@ backends:
    enabled: true
```

**Requirements:** Docker installed and running. For GPU support: nvidia-docker2 (Linux) or Docker Desktop with GPU support.
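
Before enabling a CUDA-based backend container, it can help to confirm that Docker can actually see the GPU. A minimal check (the CUDA image tag below is only an example, not something llamactl requires):

```bash
# Should print the GPU table if nvidia-docker2 / Docker GPU support is set up correctly
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```
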
## Installation

For detailed Docker configuration options, see the [Configuration Guide](docs/getting-started/configuration.md).
### Option 1: Download Binary (Recommended)

```bash
# Linux/macOS - Get latest version and download
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
sudo mv llamactl /usr/local/bin/

# Or download manually from the releases page:
# https://github.com/lordmathis/llamactl/releases/latest

# Windows - Download from releases page
```

### Option 2: Docker (No local backend installation required)

```bash
# Clone repository and build Docker images
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
mkdir -p data/llamacpp data/vllm models

# Build and start llamactl with llama.cpp CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d

# Build and start llamactl with vLLM CUDA backend
docker-compose -f docker/docker-compose.yml up llamactl-vllm -d

# Build from source using multi-stage build
docker build -f docker/Dockerfile.source -t llamactl:source .
```

**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.).

### Option 3: Build from Source
Requires Go 1.24+ and Node.js 22+
```bash
git clone https://github.com/lordmathis/llamactl.git
cd llamactl
cd webui && npm ci && npm run build && cd ..
go build -o llamactl ./cmd/server
```

## Usage

1. Open http://localhost:8080
2. Click "Create Instance"
3. Choose backend type (llama.cpp, MLX, or vLLM)
4. Configure your model and options (ports and API keys are auto-assigned)
5. Start the instance and use it with any OpenAI-compatible client

## Configuration

@@ -213,7 +161,7 @@ backends:
    docker:
      enabled: false
      image: "ghcr.io/ggml-org/llama.cpp:server"
      args: ["run", "--rm", "--network", "host", "--gpus", "all"]
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "-v", "~/.local/share/llamactl/llama.cpp:/root/.cache/llama.cpp"]
      environment: {}  # Environment variables for the container

  vllm:
@@ -223,7 +171,7 @@ backends:
    docker:
      enabled: false
      image: "vllm/vllm-openai:latest"
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g", "-v", "~/.local/share/llamactl/huggingface:/root/.cache/huggingface"]
      environment: {}  # Environment variables for the container

  mlx:

@@ -22,6 +22,9 @@ var buildTime string = "unknown"
// @license.name MIT License
// @license.url https://opensource.org/license/mit/
// @basePath /api/v1
// @securityDefinitions.apikey ApiKeyAuth
// @in header
// @name X-API-Key
func main() {

// --version flag to print the version

@@ -33,7 +33,7 @@ RUN go mod download
# Copy source code
COPY cmd/ ./cmd/
COPY pkg/ ./pkg/
COPY apidocs/ ./apidocs/
COPY docs/ ./docs/
COPY webui/webui.go ./webui/

# Copy built webui from webui-builder

@@ -1,5 +1,6 @@
mkdocs-material==9.5.3
mkdocs==1.5.3
pymdown-extensions==10.7
mkdocs-git-revision-date-localized-plugin==1.2.4
mike==2.0.0
mkdocs-material==9.6.22
mkdocs==1.6.1
pymdown-extensions==10.16.1
mkdocs-git-revision-date-localized-plugin==1.4.7
mike==2.1.3
neoteroi-mkdocs==1.1.3

docs/api-reference.md (new file, 1 line)
@@ -0,0 +1 @@
[OAD(swagger.yaml)]
@@ -80,7 +80,7 @@ nodes: # Node configuration for multi-node deployment

### Configuration File Locations

Configuration files are searched in the following locations (in order of precedence):
Configuration files are searched in the following locations (in order of precedence, first found is used):

**Linux:**
- `./llamactl.yaml` or `./config.yaml` (current directory)

docs/css/css-v1.1.3.css (new file, 1814 lines)
File diff suppressed because it is too large.
@@ -1,190 +0,0 @@
# Quick Start

This guide will help you get Llamactl up and running in just a few minutes.

## Step 1: Start Llamactl

Start the Llamactl server:

```bash
llamactl
```

By default, Llamactl will start on `http://localhost:8080`.

## Step 2: Access the Web UI

Open your web browser and navigate to:

```
http://localhost:8080
```

Login with the management API key. By default it is generated during server startup. Copy it from the terminal output.

You should see the Llamactl web interface.

## Step 3: Create Your First Instance

1. Click the "Add Instance" button
2. Fill in the instance configuration:
    - **Name**: Give your instance a descriptive name
    - **Backend Type**: Choose from llama.cpp, MLX, or vLLM
    - **Model**: Model path or identifier for your chosen backend
    - **Additional Options**: Backend-specific parameters

3. Click "Create Instance"

## Step 4: Start Your Instance

Once created, you can:

- **Start** the instance by clicking the start button
- **Monitor** its status in real-time
- **View logs** by clicking the logs button
- **Stop** the instance when needed

## Example Configurations

Here are basic example configurations for each backend:

**llama.cpp backend:**
```json
{
  "name": "llama2-7b",
  "backend_type": "llama_cpp",
  "backend_options": {
    "model": "/path/to/llama-2-7b-chat.gguf",
    "threads": 4,
    "ctx_size": 2048,
    "gpu_layers": 32
  }
}
```

**MLX backend (macOS only):**
```json
{
  "name": "mistral-mlx",
  "backend_type": "mlx_lm",
  "backend_options": {
    "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "temp": 0.7,
    "max_tokens": 2048
  }
}
```

**vLLM backend:**
```json
{
  "name": "dialogpt-vllm",
  "backend_type": "vllm",
  "backend_options": {
    "model": "microsoft/DialoGPT-medium",
    "tensor_parallel_size": 2,
    "gpu_memory_utilization": 0.9
  }
}
```

## Docker Support

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a `docker` section to that backend in your YAML configuration file (e.g. `config.yaml`) as shown below:

```yaml
backends:
  vllm:
    command: "vllm"
    args: ["serve"]
    docker:
      enabled: true
      image: "vllm/vllm-openai:latest"
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
```

## Using the API

You can also manage instances via the REST API:

```bash
# List all instances
curl http://localhost:8080/api/instances

# Create a new llama.cpp instance
curl -X POST http://localhost:8080/api/instances/my-model \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/path/to/model.gguf"
    }
  }'

# Start an instance
curl -X POST http://localhost:8080/api/instances/my-model/start
```

## OpenAI Compatible API

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

### Chat Completions

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-model",
    "messages": [
      {
        "role": "user",
        "content": "Hello! Can you help me write a Python function?"
      }
    ],
    "max_tokens": 150,
    "temperature": 0.7
  }'
```

### Using with Python OpenAI Client

You can also use the official OpenAI Python client:

```python
from openai import OpenAI

# Point the client to your Llamactl server
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="not-needed"  # Llamactl doesn't require API keys by default
)

# Create a chat completion
response = client.chat.completions.create(
    model="my-model",  # Use the name of your instance
    messages=[
        {"role": "user", "content": "Explain quantum computing in simple terms"}
    ],
    max_tokens=200,
    temperature=0.7
)

print(response.choices[0].message.content)
```

### List Available Models

Get a list of running instances (models) in OpenAI-compatible format:

```bash
curl http://localhost:8080/v1/models
```

## Next Steps

- Manage instances [Managing Instances](../user-guide/managing-instances.md)
- Explore the [API Reference](../user-guide/api-reference.md)
- Configure advanced settings in the [Configuration](configuration.md) guide

Binary file not shown. Before: 69 KiB, After: 66 KiB (screenshot updated).
Binary file not shown. Before: 31 KiB, After: 45 KiB (screenshot updated).
@@ -14,20 +14,20 @@ Welcome to the Llamactl documentation!

## Quick Links

- [Installation Guide](getting-started/installation.md) - Get Llamactl up and running
- [Configuration Guide](getting-started/configuration.md) - Detailed configuration options
- [Quick Start](getting-started/quick-start.md) - Your first steps with Llamactl
- [Managing Instances](user-guide/managing-instances.md) - Instance lifecycle management
- [API Reference](user-guide/api-reference.md) - Complete API documentation
- [Installation Guide](installation.md) - Get Llamactl up and running
- [Configuration Guide](configuration.md) - Detailed configuration options
- [Quick Start](quick-start.md) - Your first steps with Llamactl
- [Managing Instances](managing-instances.md) - Instance lifecycle management
- [API Reference](api-reference.md) - Complete API documentation

## Getting Help

If you need help or have questions:

- Check the [Troubleshooting](user-guide/troubleshooting.md) guide
- Check the [Troubleshooting](troubleshooting.md) guide
- Visit the [GitHub repository](https://github.com/lordmathis/llamactl)
- Review the [Configuration Guide](getting-started/configuration.md) for advanced settings
- Review the [Configuration Guide](configuration.md) for advanced settings

## License

@@ -42,15 +42,10 @@ Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc
vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

```bash
# Install via pip (requires Python 3.8+, GPU required)
pip install vllm

# Or in a virtual environment (recommended)
# Install in a virtual environment
python -m venv vllm-env
source vllm-env/bin/activate
pip install vllm

# For production deployments, consider container-based installation
```

## Installation Methods
@@ -82,7 +77,7 @@ llamactl provides Dockerfiles for creating Docker images with backends pre-insta

**Note:** These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at [llama.cpp Docker docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md). For vLLM, check [vLLM docs](https://docs.vllm.ai/en/v0.6.5/serving/deploying_with_docker.html).

#### Using Docker Compose
**Using Docker Compose**

```bash
# Clone the repository
@@ -103,9 +98,9 @@ Access the dashboard at:
- llamactl with llama.cpp: http://localhost:8080
- llamactl with vLLM: http://localhost:8081

#### Using Docker Build and Run
**Using Docker Build and Run**

**llamactl with llama.cpp CUDA:**
1. llamactl with llama.cpp CUDA:
```bash
docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .
docker run -d \
@@ -116,7 +111,7 @@ docker run -d \
  llamactl:llamacpp-cuda
```

**llamactl with vLLM CUDA:**
2. llamactl with vLLM CUDA:
```bash
docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .
docker run -d \
@@ -127,7 +122,7 @@ docker run -d \
  llamactl:vllm-cuda
```

**llamactl built from source:**
3. llamactl built from source:
```bash
docker build -f docker/Dockerfile.source -t llamactl:source .
docker run -d \
@@ -9,13 +9,17 @@ Llamactl provides two ways to manage instances:
- **Web UI**: Accessible at `http://localhost:8080` with an intuitive dashboard
- **REST API**: Programmatic access for automation and integration

### Authentication

If authentication is enabled:
Llamactl uses a **Management API Key** to authenticate requests to the management API (creating, starting, stopping instances). All curl examples below use `<token>` as a placeholder - replace this with your actual Management API Key.

By default, authentication is required. If you don't configure a management API key in your configuration file, llamactl will auto-generate one and print it to the terminal on startup. See the [Configuration](configuration.md) guide for details.

For Web UI access:
1. Navigate to the web UI
2. Enter your credentials
2. Enter your Management API Key
3. Bearer token is stored for the session

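As a convenience when following the curl examples below, you can keep the Management API Key in a shell variable and reuse it in each request. This is only a sketch; the variable name is arbitrary and the key value comes from your startup output or configuration file:

```bash
# Store the Management API Key once, then reuse it in the examples below
export LLAMACTL_MANAGEMENT_KEY="sk-management-..."  # placeholder value

# Example: list all instances using the stored key
curl http://localhost:8080/api/v1/instances \
  -H "Authorization: Bearer ${LLAMACTL_MANAGEMENT_KEY}"
```
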
### Theme Support
@@ -33,9 +37,9 @@ Each instance is displayed as a card showing:

## Create Instance

### Via Web UI
**Via Web UI**

1. Click the **"Create Instance"** button on the dashboard
2. Enter a unique **Name** for your instance (only required field)
@@ -59,14 +63,19 @@ Each instance is displayed as a card showing:
   - **llama.cpp**: Threads, context size, GPU layers, port, etc.
   - **MLX**: Temperature, top-p, adapter path, Python environment, etc.
   - **vLLM**: Tensor parallel size, GPU memory utilization, quantization, etc.

!!! tip "Auto-Assignment"
    Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.

8. Click **"Create"** to save the instance

### Via API
**Via API**

```bash
# Create llama.cpp instance with local model file
curl -X POST http://localhost:8080/api/instances/my-llama-instance \
curl -X POST http://localhost:8080/api/v1/instances/my-llama-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
@@ -74,12 +83,14 @@ curl -X POST http://localhost:8080/api/instances/my-llama-instance \
      "threads": 8,
      "ctx_size": 4096,
      "gpu_layers": 32
    }
    },
    "nodes": ["main"]
  }'

# Create MLX instance (macOS only)
curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
curl -X POST http://localhost:8080/api/v1/instances/my-mlx-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "mlx_lm",
    "backend_options": {
@@ -89,12 +100,14 @@ curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
      "max_tokens": 2048
    },
    "auto_restart": true,
    "max_restarts": 3
    "max_restarts": 3,
    "nodes": ["main"]
  }'

# Create vLLM instance
curl -X POST http://localhost:8080/api/instances/my-vllm-instance \
curl -X POST http://localhost:8080/api/v1/instances/my-vllm-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
@@ -108,24 +121,28 @@ curl -X POST http://localhost:8080/api/instances/my-vllm-instance \
      "CUDA_VISIBLE_DEVICES": "0,1",
      "NCCL_DEBUG": "INFO",
      "PYTHONPATH": "/custom/path"
    }
    },
    "nodes": ["main"]
  }'

# Create llama.cpp instance with HuggingFace model
curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
curl -X POST http://localhost:8080/api/v1/instances/gemma-3-27b \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "hf_repo": "unsloth/gemma-3-27b-it-GGUF",
      "hf_file": "gemma-3-27b-it-GGUF.gguf",
      "gpu_layers": 32
    }
    },
    "nodes": ["main"]
  }'

# Create instance on specific remote node
curl -X POST http://localhost:8080/api/instances/remote-llama \
curl -X POST http://localhost:8080/api/v1/instances/remote-llama \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
@@ -134,46 +151,62 @@ curl -X POST http://localhost:8080/api/instances/remote-llama \
    },
    "nodes": ["worker1"]
  }'

# Create instance on multiple nodes for high availability
curl -X POST http://localhost:8080/api/v1/instances/multi-node-llama \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/models/llama-7b.gguf",
      "gpu_layers": 32
    },
    "nodes": ["worker1", "worker2", "worker3"]
  }'
```

## Start Instance

### Via Web UI
**Via Web UI**
1. Click the **"Start"** button on an instance card
2. Watch the status change to "Unknown"
3. Monitor progress in the logs
4. Instance status changes to "Ready" when ready

### Via API
**Via API**
```bash
curl -X POST http://localhost:8080/api/instances/{name}/start
curl -X POST http://localhost:8080/api/v1/instances/{name}/start \
  -H "Authorization: Bearer <token>"
```

## Stop Instance

### Via Web UI
**Via Web UI**
1. Click the **"Stop"** button on an instance card
2. Instance gracefully shuts down

### Via API
**Via API**
```bash
curl -X POST http://localhost:8080/api/instances/{name}/stop
curl -X POST http://localhost:8080/api/v1/instances/{name}/stop \
  -H "Authorization: Bearer <token>"
```

## Edit Instance

### Via Web UI
**Via Web UI**
1. Click the **"Edit"** button on an instance card
2. Modify settings in the configuration dialog
3. Changes require instance restart to take effect
4. Click **"Update & Restart"** to apply changes

### Via API
**Via API**
Modify instance settings:

```bash
curl -X PUT http://localhost:8080/api/instances/{name} \
curl -X PUT http://localhost:8080/api/v1/instances/{name} \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_options": {
      "threads": 8,
@@ -188,29 +221,31 @@ curl -X PUT http://localhost:8080/api/instances/{name} \

## View Logs

### Via Web UI
**Via Web UI**

1. Click the **"Logs"** button on any instance card
2. Real-time log viewer opens

### Via API
**Via API**
Check instance status in real-time:

```bash
# Get instance details
curl http://localhost:8080/api/instances/{name}/logs
# Get instance logs
curl http://localhost:8080/api/v1/instances/{name}/logs \
  -H "Authorization: Bearer <token>"
```

## Delete Instance

### Via Web UI
**Via Web UI**
1. Click the **"Delete"** button on an instance card
2. Only stopped instances can be deleted
3. Confirm deletion in the dialog

### Via API
**Via API**
```bash
curl -X DELETE http://localhost:8080/api/instances/{name}
curl -X DELETE http://localhost:8080/api/v1/instances/{name} \
  -H "Authorization: Bearer <token>"
```

## Instance Proxy
@@ -218,8 +253,9 @@ curl -X DELETE http://localhost:8080/api/instances/{name}
Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

```bash
# Get instance details
curl http://localhost:8080/api/instances/{name}/proxy/
# Proxy requests to the instance
curl http://localhost:8080/api/v1/instances/{name}/proxy/ \
  -H "Authorization: Bearer <token>"
```

All backends provide OpenAI-compatible endpoints. Check the respective documentation:
@@ -229,15 +265,16 @@ All backends provide OpenAI-compatible endpoints. Check the respective documenta

### Instance Health

#### Via Web UI
**Via Web UI**

1. The health status badge is displayed on each instance card

#### Via API
**Via API**

Check the health status of your instances:

```bash
curl http://localhost:8080/api/instances/{name}/proxy/health
curl http://localhost:8080/api/v1/instances/{name}/proxy/health \
  -H "Authorization: Bearer <token>"
```

docs/quick-start.md (new file, 263 lines)
@@ -0,0 +1,263 @@
# Quick Start

This guide will help you get Llamactl up and running in just a few minutes.

**Before you begin:** Ensure you have at least one backend installed (llama.cpp, MLX, or vLLM). See the [Installation Guide](installation.md#prerequisites) for backend setup.

## Core Concepts

Before you start, let's clarify a few key terms:

- **Instance**: A running backend server that serves a specific model. Each instance has a unique name and runs independently.
- **Backend**: The inference engine that actually runs the model (llama.cpp, MLX, or vLLM). You need at least one backend installed before creating instances.
- **Node**: In multi-machine setups, a node represents one machine. Most users will just use the default "main" node for single-machine deployments.
- **Proxy Architecture**: Llamactl acts as a proxy in front of your instances. You make requests to llamactl (e.g., `http://localhost:8080/v1/chat/completions`), and it routes them to the appropriate backend instance. This means you don't need to track individual instance ports or endpoints. A minimal example of this routing is sketched after this list.

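As a minimal sketch of this proxy behaviour (the instance names `my-llama` and `my-mlx` are hypothetical, and authentication is covered in the next section), note that both requests below go to the same llamactl address; only the `model` field selects the instance:

```bash
# llamactl routes each request to the instance whose name matches "model"
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer <inference-api-key>" \
  -d '{"model": "my-llama", "messages": [{"role": "user", "content": "Hello!"}]}'

curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer <inference-api-key>" \
  -d '{"model": "my-mlx", "messages": [{"role": "user", "content": "Hello!"}]}'
```
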
## Authentication

Llamactl uses two types of API keys:

- **Management API Key**: Used to authenticate with the Llamactl management API (creating, starting, stopping instances).
- **Inference API Key**: Used to authenticate requests to the OpenAI-compatible endpoints (`/v1/chat/completions`, `/v1/completions`, etc.).

By default, authentication is required. If you don't configure these keys in your configuration file, llamactl will auto-generate them and print them to the terminal on startup. You can also configure custom keys or disable authentication entirely in the [Configuration](configuration.md) guide.

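A minimal sketch of where each key is used (the key values below are placeholders matching the startup output format):

```bash
# Management API Key: management endpoints under /api/v1
curl http://localhost:8080/api/v1/instances \
  -H "Authorization: Bearer sk-management-..."

# Inference API Key: OpenAI-compatible endpoints under /v1
curl http://localhost:8080/v1/models \
  -H "Authorization: Bearer sk-inference-..."
```
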
## Start Llamactl

Start the Llamactl server:

```bash
llamactl
```

```
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚠️  MANAGEMENT AUTHENTICATION REQUIRED
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔑 Generated Management API Key:

sk-management-...

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚠️  INFERENCE AUTHENTICATION REQUIRED
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔑 Generated Inference API Key:

sk-inference-...

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚠️  IMPORTANT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• These keys are auto-generated and will change on restart
• For production, add explicit keys to your configuration
• Copy these keys before they disappear from the terminal
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Llamactl server listening on 0.0.0.0:8080
```

Copy the **Management** and **Inference** API Keys from the terminal - you'll need them to access the web UI and make inference requests.

By default, Llamactl will start on `http://localhost:8080`.

## Access the Web UI

Open your web browser and navigate to:

```
http://localhost:8080
```

Login with the management API key from the terminal output.

You should see the Llamactl web interface.

## Create Your First Instance

1. Click the "Add Instance" button
2. Fill in the instance configuration:
    - **Name**: Give your instance a descriptive name
    - **Node**: Select which node to deploy the instance to (defaults to "main" for single-node setups)
    - **Backend Type**: Choose from llama.cpp, MLX, or vLLM
    - **Model**: Model path or huggingface repo
    - **Additional Options**: Backend-specific parameters

!!! tip "Auto-Assignment"
    Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.

!!! note "Remote Node Deployment"
    If you have configured remote nodes in your configuration file, you can select which node to deploy the instance to. This allows you to distribute instances across multiple machines. See the [Configuration](configuration.md#remote-node-configuration) guide for details on setting up remote nodes.

3. Click "Create Instance"

## Start Your Instance

Once created, you can:

- **Start** the instance by clicking the start button
- **Monitor** its status in real-time
- **View logs** by clicking the logs button
- **Stop** the instance when needed

## Example Configurations

Here are basic example configurations for each backend:

**llama.cpp backend:**
```json
{
  "name": "llama2-7b",
  "backend_type": "llama_cpp",
  "backend_options": {
    "model": "/path/to/llama-2-7b-chat.gguf",
    "threads": 4,
    "ctx_size": 2048,
    "gpu_layers": 32
  },
  "nodes": ["main"]
}
```

**MLX backend (macOS only):**
```json
{
  "name": "mistral-mlx",
  "backend_type": "mlx_lm",
  "backend_options": {
    "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "temp": 0.7,
    "max_tokens": 2048
  },
  "nodes": ["main"]
}
```

**vLLM backend:**
```json
{
  "name": "dialogpt-vllm",
  "backend_type": "vllm",
  "backend_options": {
    "model": "microsoft/DialoGPT-medium",
    "tensor_parallel_size": 2,
    "gpu_memory_utilization": 0.9
  },
  "nodes": ["main"]
}
```

**Remote node deployment example:**
```json
{
  "name": "distributed-model",
  "backend_type": "llama_cpp",
  "backend_options": {
    "model": "/path/to/model.gguf",
    "gpu_layers": 32
  },
  "nodes": ["worker1"]
}
```

## Docker Support

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a `docker` section to that backend in your YAML configuration file (e.g. `config.yaml`) as shown below:

```yaml
backends:
  vllm:
    command: "vllm"
    args: ["serve"]
    docker:
      enabled: true
      image: "vllm/vllm-openai:latest"
      args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
```

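If Docker is enabled for a backend as shown above, each started instance runs that backend inside a container. As a quick sanity check (exact container names are managed by llamactl and may differ, so treat this as a sketch rather than guaranteed output), you can look for the configured image among running containers:

```bash
# After starting a vLLM instance with the Docker config above,
# a container based on the configured image should be running.
docker ps --filter "ancestor=vllm/vllm-openai:latest"
```
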
## Using the API

You can also manage instances via the REST API:

```bash
# List all instances
curl http://localhost:8080/api/v1/instances

# Create a new llama.cpp instance
curl -X POST http://localhost:8080/api/v1/instances/my-model \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/path/to/model.gguf"
    }
  }'

# Start an instance
curl -X POST http://localhost:8080/api/v1/instances/my-model/start
```

## OpenAI Compatible API

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

### Chat Completions

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-model",
    "messages": [
      {
        "role": "user",
        "content": "Hello! Can you help me write a Python function?"
      }
    ],
    "max_tokens": 150,
    "temperature": 0.7
  }'
```

### Using with Python OpenAI Client

You can also use the official OpenAI Python client:

```python
from openai import OpenAI

# Point the client to your Llamactl server
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="your-inference-api-key"  # Use the inference API key from terminal or config
)

# Create a chat completion
response = client.chat.completions.create(
    model="my-model",  # Use the name of your instance
    messages=[
        {"role": "user", "content": "Explain quantum computing in simple terms"}
    ],
    max_tokens=200,
    temperature=0.7
)

print(response.choices[0].message.content)
```

!!! note "API Key"
    If you disabled authentication in your config, you can use any value for `api_key` (e.g., `"not-needed"`). Otherwise, use the inference API key shown in the terminal output on startup.

### List Available Models

Get a list of running instances (models) in OpenAI-compatible format:

```bash
curl http://localhost:8080/v1/models
```

## Next Steps

- Learn how to manage instances in [Managing Instances](managing-instances.md)
- Explore the [API Reference](api-reference.md)
- Configure advanced settings in the [Configuration](configuration.md) guide

File diff suppressed because it is too large.
@@ -1,25 +1,23 @@
|
||||
basePath: /api/v1
|
||||
definitions:
|
||||
backends.BackendType:
|
||||
enum:
|
||||
- llama_cpp
|
||||
- mlx_lm
|
||||
- vllm
|
||||
type: string
|
||||
x-enum-varnames:
|
||||
- BackendTypeLlamaCpp
|
||||
- BackendTypeMlxLm
|
||||
- BackendTypeVllm
|
||||
instance.CreateInstanceOptions:
|
||||
instance.Instance:
|
||||
properties:
|
||||
created:
|
||||
description: Unix timestamp when the instance was created
|
||||
type: integer
|
||||
name:
|
||||
type: string
|
||||
type: object
|
||||
instance.Options:
|
||||
properties:
|
||||
auto_restart:
|
||||
description: Auto restart
|
||||
type: boolean
|
||||
backend_options:
|
||||
additionalProperties: {}
|
||||
environment:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Environment variables
|
||||
type: object
|
||||
backend_type:
|
||||
$ref: '#/definitions/backends.BackendType'
|
||||
idle_timeout:
|
||||
description: Idle timeout
|
||||
type: integer
|
||||
@@ -32,27 +30,10 @@ definitions:
|
||||
description: seconds
|
||||
type: integer
|
||||
type: object
|
||||
instance.InstanceStatus:
|
||||
enum:
|
||||
- 0
|
||||
- 1
|
||||
- 2
|
||||
type: integer
|
||||
x-enum-varnames:
|
||||
- Stopped
|
||||
- Running
|
||||
- Failed
|
||||
instance.Process:
|
||||
server.NodeResponse:
|
||||
properties:
|
||||
created:
|
||||
description: Creation time
|
||||
type: integer
|
||||
name:
|
||||
address:
|
||||
type: string
|
||||
status:
|
||||
allOf:
|
||||
- $ref: '#/definitions/instance.InstanceStatus'
|
||||
description: Status
|
||||
type: object
|
||||
server.OpenAIInstance:
|
||||
properties:
|
||||
@@ -88,7 +69,7 @@ info:
|
||||
title: llamactl API
|
||||
version: "1.0"
|
||||
paths:
|
||||
/backends/llama-cpp/devices:
|
||||
/api/v1/backends/llama-cpp/devices:
|
||||
get:
|
||||
description: Returns a list of available devices for the llama server
|
||||
responses:
|
||||
@@ -104,8 +85,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: List available devices for llama server
|
||||
tags:
|
||||
- backends
|
||||
/backends/llama-cpp/help:
|
||||
- Backends
|
||||
/api/v1/backends/llama-cpp/help:
|
||||
get:
|
||||
description: Returns the help text for the llama server command
|
||||
responses:
|
||||
@@ -121,8 +102,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get help for llama server
|
||||
tags:
|
||||
- backends
|
||||
/backends/llama-cpp/parse-command:
|
||||
- Backends
|
||||
/api/v1/backends/llama-cpp/parse-command:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -140,7 +121,7 @@ paths:
|
||||
"200":
|
||||
description: Parsed options
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
"400":
|
||||
description: Invalid request or command
|
||||
schema:
|
||||
@@ -157,8 +138,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Parse llama-server command
|
||||
tags:
|
||||
- backends
|
||||
/backends/llama-cpp/version:
|
||||
- Backends
|
||||
/api/v1/backends/llama-cpp/version:
|
||||
get:
|
||||
description: Returns the version of the llama server command
|
||||
responses:
|
||||
@@ -174,8 +155,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get version of llama server
|
||||
tags:
|
||||
- backends
|
||||
/backends/mlx/parse-command:
|
||||
- Backends
|
||||
/api/v1/backends/mlx/parse-command:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -193,7 +174,7 @@ paths:
|
||||
"200":
|
||||
description: Parsed options
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
"400":
|
||||
description: Invalid request or command
|
||||
schema:
|
||||
@@ -204,8 +185,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Parse mlx_lm.server command
|
||||
tags:
|
||||
- backends
|
||||
/backends/vllm/parse-command:
|
||||
- Backends
|
||||
/api/v1/backends/vllm/parse-command:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -223,7 +204,7 @@ paths:
|
||||
"200":
|
||||
description: Parsed options
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
"400":
|
||||
description: Invalid request or command
|
||||
schema:
|
||||
@@ -234,8 +215,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Parse vllm serve command
|
||||
tags:
|
||||
- backends
|
||||
/instances:
|
||||
- Backends
|
||||
/api/v1/instances:
|
||||
get:
|
||||
description: Returns a list of all instances managed by the server
|
||||
responses:
|
||||
@@ -243,7 +224,7 @@ paths:
|
||||
description: List of instances
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
type: array
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
@@ -253,8 +234,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: List all instances
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}:
|
||||
- Instances
|
||||
/api/v1/instances/{name}:
|
||||
delete:
|
||||
description: Stops and removes a specific instance by name
|
||||
parameters:
|
||||
@@ -278,7 +259,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Delete an instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
get:
|
||||
description: Returns the details of a specific instance by name
|
||||
parameters:
|
||||
@@ -291,7 +272,7 @@ paths:
|
||||
"200":
|
||||
description: Instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -304,7 +285,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get details of a specific instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -320,12 +301,12 @@ paths:
|
||||
name: options
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
responses:
|
||||
"201":
|
||||
description: Created instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid request body
|
||||
schema:
|
||||
@@ -338,7 +319,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Create and start a new instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
put:
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -354,12 +335,12 @@ paths:
|
||||
name: options
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/instance.CreateInstanceOptions'
|
||||
$ref: '#/definitions/instance.Options'
|
||||
responses:
|
||||
"200":
|
||||
description: Updated instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -372,8 +353,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Update an instance's configuration
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/logs:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/logs:
|
||||
get:
|
||||
description: Returns the logs from a specific instance by name with optional
|
||||
line limit
|
||||
@@ -404,8 +385,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get logs from a specific instance
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/proxy:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/proxy:
|
||||
get:
|
||||
description: Forwards HTTP requests to the llama-server instance running on
|
||||
a specific port
|
||||
@@ -432,9 +413,10 @@ paths:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to a specific instance
|
||||
summary: Proxy requests to a specific instance, does not autostart instance
|
||||
if stopped
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
post:
|
||||
description: Forwards HTTP requests to the llama-server instance running on
|
||||
a specific port
|
||||
@@ -461,10 +443,11 @@ paths:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to a specific instance
|
||||
summary: Proxy requests to a specific instance, does not autostart instance
|
||||
if stopped
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/restart:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/restart:
|
||||
post:
|
||||
description: Restarts a specific instance by name
|
||||
parameters:
|
||||
@@ -477,7 +460,7 @@ paths:
|
||||
"200":
|
||||
description: Restarted instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -490,8 +473,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Restart a running instance
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/start:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/start:
|
||||
post:
|
||||
description: Starts a specific instance by name
|
||||
parameters:
|
||||
@@ -504,7 +487,7 @@ paths:
|
||||
"200":
|
||||
description: Started instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -517,8 +500,8 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Start a stopped instance
|
||||
tags:
|
||||
- instances
|
||||
/instances/{name}/stop:
|
||||
- Instances
|
||||
/api/v1/instances/{name}/stop:
|
||||
post:
|
||||
description: Stops a specific instance by name
|
||||
parameters:
|
||||
@@ -531,7 +514,7 @@ paths:
|
||||
"200":
|
||||
description: Stopped instance details
|
||||
schema:
|
||||
$ref: '#/definitions/instance.Process'
|
||||
$ref: '#/definitions/instance.Instance'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
@@ -544,7 +527,444 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: Stop a running instance
|
||||
tags:
|
||||
- instances
|
||||
- Instances
|
||||
/api/v1/nodes:
|
||||
get:
|
||||
description: Returns a map of all nodes configured in the server (node name
|
||||
-> node config)
|
||||
responses:
|
||||
"200":
|
||||
description: Map of nodes
|
||||
schema:
|
||||
additionalProperties:
|
||||
$ref: '#/definitions/server.NodeResponse'
|
||||
type: object
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: List all configured nodes
|
||||
tags:
|
||||
- Nodes
|
||||
/api/v1/nodes/{name}:
|
||||
get:
|
||||
description: Returns the details of a specific node by name
|
||||
parameters:
|
||||
- description: Node Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
responses:
|
||||
"200":
|
||||
description: Node details
|
||||
schema:
|
||||
$ref: '#/definitions/server.NodeResponse'
|
||||
"400":
|
||||
description: Invalid name format
|
||||
schema:
|
||||
type: string
|
||||
"404":
|
||||
description: Node not found
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get details of a specific node
|
||||
tags:
|
||||
- Nodes
|
||||
/api/v1/version:
|
||||
get:
|
||||
description: Returns the version of the llamactl command
|
||||
responses:
|
||||
"200":
|
||||
description: Version information
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get llamactl version
|
||||
tags:
|
||||
- System
|
||||
/llama-cpp/{name}/:
|
||||
get:
|
||||
description: Proxies requests to the llama.cpp UI for the specified instance
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: query
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- text/html
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied HTML response
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp UI for the instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/apply-template:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/completion:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/detokenize:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/embeddings:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/infill:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/metrics:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/props:
|
||||
get:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/reranking:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/slots:
|
||||
get:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/llama-cpp/{name}/tokenize:
|
||||
post:
|
||||
description: Proxies requests to the specified llama.cpp server instance, starting
|
||||
it on-demand if configured
|
||||
parameters:
|
||||
- description: Instance Name
|
||||
in: path
|
||||
name: name
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: Proxied response
|
||||
schema:
|
||||
additionalProperties: true
|
||||
type: object
|
||||
"400":
|
||||
description: Invalid instance
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Proxy requests to llama.cpp server instance
|
||||
tags:
|
||||
- Llama.cpp
|
||||
/v1/:
|
||||
post:
|
||||
consumes:
|
||||
@@ -567,7 +987,7 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: OpenAI-compatible proxy endpoint
|
||||
tags:
|
||||
- openai
|
||||
- OpenAI
|
||||
/v1/models:
|
||||
get:
|
||||
description: Returns a list of instances in a format compatible with OpenAI
|
||||
@@ -585,22 +1005,10 @@ paths:
|
||||
- ApiKeyAuth: []
|
||||
summary: List instances in OpenAI-compatible format
|
||||
tags:
|
||||
- openai
|
||||
/version:
|
||||
get:
|
||||
description: Returns the version of the llamactl command
|
||||
responses:
|
||||
"200":
|
||||
description: Version information
|
||||
schema:
|
||||
type: string
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
type: string
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
summary: Get llamactl version
|
||||
tags:
|
||||
- version
|
||||
- OpenAI
|
||||
securityDefinitions:
|
||||
ApiKeyAuth:
|
||||
in: header
|
||||
name: X-API-Key
|
||||
type: apiKey
|
||||
swagger: "2.0"
|
||||
@@ -26,62 +26,67 @@ Issues specific to Llamactl deployment and operation.
|
||||
|
||||
## Instance Management Issues
|
||||
|
||||
### Model Loading Failures
|
||||
### Instance Fails to Start
|
||||
|
||||
**Problem:** Instance fails to start with model loading errors
|
||||
|
||||
**Common Solutions:**
|
||||
- **llama-server not found:** Ensure `llama-server` binary is in PATH
|
||||
- **Wrong model format:** Ensure model is in GGUF format
|
||||
- **Insufficient memory:** Use smaller model or reduce context size
|
||||
- **Path issues:** Use absolute paths to model files
|
||||
|
||||
### Memory Issues
|
||||
|
||||
**Problem:** Out of memory errors or system becomes unresponsive
|
||||
**Problem:** Instance fails to start or immediately stops
|
||||
|
||||
**Solutions:**
|
||||
1. **Reduce context size:**
|
||||
```json
|
||||
{
|
||||
"n_ctx": 1024
|
||||
}
|
||||
|
||||
1. **Check instance logs** to see the actual error:
|
||||
```bash
|
||||
curl http://localhost:8080/api/v1/instances/{name}/logs
|
||||
# Or check log files directly
|
||||
tail -f ~/.local/share/llamactl/logs/{instance-name}.log
|
||||
```
|
||||
|
||||
2. **Use quantized models:**
|
||||
- Try Q4_K_M instead of higher precision models
|
||||
- Use smaller model variants (7B instead of 13B)
|
||||
2. **Verify backend is installed:**
|
||||
- **llama.cpp**: Ensure `llama-server` is in PATH
|
||||
- **MLX**: Ensure `mlx-lm` Python package is installed
|
||||
- **vLLM**: Ensure `vllm` Python package is installed
|
||||
|
||||
### GPU Configuration
|
||||
3. **Check model path and format:**
|
||||
- Use absolute paths to model files
|
||||
- Verify model format matches backend (GGUF for llama.cpp, etc.)
|
||||
|
||||
**Problem:** GPU not being used effectively
|
||||
4. **Verify backend command configuration:**
|
||||
- Check that the backend `command` is correctly configured in the global config
|
||||
- For virtual environments, specify the full path to the command (e.g., `/path/to/venv/bin/mlx_lm.server`)
|
||||
- See the [Configuration Guide](configuration.md) for backend configuration details
|
||||
- Test the backend directly (see [Backend-Specific Issues](#backend-specific-issues) below)
|
||||
|
||||
**Solutions:**
|
||||
1. **Configure GPU layers:**
|
||||
```json
|
||||
{
|
||||
"n_gpu_layers": 35
|
||||
}
|
||||
```
|
||||
### Backend-Specific Issues
|
||||
|
||||
### Advanced Instance Issues
|
||||
**Problem:** Model loading, memory, GPU, or performance issues
|
||||
|
||||
**Problem:** Complex model loading, performance, or compatibility issues
|
||||
Most model-specific issues (memory, GPU configuration, performance tuning) are backend-specific and should be resolved by consulting the respective backend documentation:
|
||||
|
||||
Since llamactl uses `llama-server` under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:
|
||||
**llama.cpp:**
|
||||
- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp)
|
||||
- [llama-server README](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
|
||||
|
||||
**Resources:**
|
||||
- **llama.cpp Documentation:** [https://github.com/ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp)
|
||||
- **llama.cpp Issues:** [https://github.com/ggml-org/llama.cpp/issues](https://github.com/ggml-org/llama.cpp/issues)
|
||||
- **llama.cpp Discussions:** [https://github.com/ggml-org/llama.cpp/discussions](https://github.com/ggml-org/llama.cpp/discussions)
|
||||
**MLX:**
|
||||
- [MLX-LM GitHub](https://github.com/ml-explore/mlx-lm)
|
||||
- [MLX-LM Server Guide](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md)
|
||||
|
||||
**vLLM:**
|
||||
- [vLLM Documentation](https://docs.vllm.ai/en/stable/)
|
||||
- [OpenAI Compatible Server](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html)
|
||||
- [vllm serve Command](https://docs.vllm.ai/en/stable/cli/serve.html#vllm-serve)
|
||||
|
||||
**Testing backends directly:**
|
||||
|
||||
Testing your model and configuration directly with the backend helps determine if the issue is with llamactl or the backend itself:
|
||||
|
||||
**Testing directly with llama-server:**
|
||||
```bash
|
||||
# Test your model and parameters directly with llama-server
|
||||
llama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35
|
||||
```
|
||||
# llama.cpp
|
||||
llama-server --model /path/to/model.gguf --port 8081
|
||||
|
||||
This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.
|
||||
# MLX
|
||||
mlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --port 8081
|
||||
|
||||
# vLLM
|
||||
vllm serve microsoft/DialoGPT-medium --port 8081
|
||||
```
|
||||
|
||||
## API and Network Issues
|
||||
|
||||
@@ -1,560 +0,0 @@
|
||||
# API Reference
|
||||
|
||||
Complete reference for the Llamactl REST API.
|
||||
|
||||
## Base URL
|
||||
|
||||
All API endpoints are relative to the base URL:
|
||||
|
||||
```
|
||||
http://localhost:8080/api/v1
|
||||
```
|
||||
|
||||
## Authentication
|
||||
|
||||
Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:
|
||||
|
||||
```bash
|
||||
curl -H "Authorization: Bearer <your-api-key>" \
|
||||
http://localhost:8080/api/v1/instances
|
||||
```
|
||||
|
||||
The server supports two types of API keys:
|
||||
- **Management API Keys**: Required for instance management operations (CRUD operations on instances)
|
||||
- **Inference API Keys**: Required for OpenAI-compatible inference endpoints
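For example, assuming authentication is enabled and using placeholder keys, instance management calls and OpenAI-compatible calls would each use their respective key type:

```bash
# Management key for instance management endpoints
curl -H "Authorization: Bearer <management-api-key>" \
  http://localhost:8080/api/v1/instances

# Inference key for OpenAI-compatible endpoints
curl -H "Authorization: Bearer <inference-api-key>" \
  http://localhost:8080/v1/models
```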
|
||||
|
||||
## System Endpoints
|
||||
|
||||
### Get Llamactl Version
|
||||
|
||||
Get the version information of the llamactl server.
|
||||
|
||||
```http
|
||||
GET /api/v1/version
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```
|
||||
Version: 1.0.0
|
||||
Commit: abc123
|
||||
Build Time: 2024-01-15T10:00:00Z
|
||||
```
|
||||
|
||||
### Get Llama Server Help
|
||||
|
||||
Get help text for the llama-server command.
|
||||
|
||||
```http
|
||||
GET /api/v1/server/help
|
||||
```
|
||||
|
||||
**Response:** Plain text help output from `llama-server --help`
|
||||
|
||||
### Get Llama Server Version
|
||||
|
||||
Get version information of the llama-server binary.
|
||||
|
||||
```http
|
||||
GET /api/v1/server/version
|
||||
```
|
||||
|
||||
**Response:** Plain text version output from `llama-server --version`
|
||||
|
||||
### List Available Devices
|
||||
|
||||
List available devices for llama-server.
|
||||
|
||||
```http
|
||||
GET /api/v1/server/devices
|
||||
```
|
||||
|
||||
**Response:** Plain text device list from `llama-server --list-devices`
|
||||
|
||||
## Instances
|
||||
|
||||
### List All Instances
|
||||
|
||||
Get a list of all instances.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Get Instance Details
|
||||
|
||||
Get detailed information about a specific instance.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Create Instance
|
||||
|
||||
Create and start a new instance.
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Request Body:** JSON object with instance configuration. Common fields include:
|
||||
|
||||
- `backend_type`: Backend type (`llama_cpp`, `mlx_lm`, or `vllm`)
|
||||
- `backend_options`: Backend-specific configuration
|
||||
- `auto_restart`: Enable automatic restart on failure
|
||||
- `max_restarts`: Maximum restart attempts
|
||||
- `restart_delay`: Delay between restarts in seconds
|
||||
- `on_demand_start`: Start instance when receiving requests
|
||||
- `idle_timeout`: Idle timeout in minutes
|
||||
- `environment`: Environment variables as key-value pairs
|
||||
- `nodes`: Array containing a single node name to deploy the instance to (for remote deployments)
|
||||
|
||||
See [Managing Instances](managing-instances.md) for complete configuration options.
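For example, a minimal create request for a llama.cpp instance might look like the following (instance name, model path, and API key are placeholders):

```bash
curl -X POST http://localhost:8080/api/v1/instances/my-model \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <management-api-key>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/models/llama-2-7b.gguf"
    },
    "auto_restart": true
  }'
```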
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Update Instance
|
||||
|
||||
Update an existing instance configuration. See [Managing Instances](managing-instances.md) for available configuration options.
|
||||
|
||||
```http
|
||||
PUT /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Request Body:** JSON object with configuration fields to update.
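For example, a partial update that only changes restart and idle-timeout behavior might look like this (values are illustrative):

```bash
curl -X PUT http://localhost:8080/api/v1/instances/my-model \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <management-api-key>" \
  -d '{
    "auto_restart": false,
    "idle_timeout": 30
  }'
```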
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Delete Instance
|
||||
|
||||
Stop and remove an instance.
|
||||
|
||||
```http
|
||||
DELETE /api/v1/instances/{name}
|
||||
```
|
||||
|
||||
**Response:** `204 No Content`
|
||||
|
||||
## Instance Operations
|
||||
|
||||
### Start Instance
|
||||
|
||||
Start a stopped instance.
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}/start
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
**Error Responses:**
|
||||
- `409 Conflict`: Maximum number of running instances reached
|
||||
- `500 Internal Server Error`: Failed to start instance
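For example, to start a stopped instance (name and key are placeholders):

```bash
curl -X POST -H "Authorization: Bearer <management-api-key>" \
  http://localhost:8080/api/v1/instances/my-model/start
```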
|
||||
|
||||
### Stop Instance
|
||||
|
||||
Stop a running instance.
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}/stop
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "stopped",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Restart Instance
|
||||
|
||||
Restart an instance (stop then start).
|
||||
|
||||
```http
|
||||
POST /api/v1/instances/{name}/restart
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"name": "llama2-7b",
|
||||
"status": "running",
|
||||
"created": 1705312200
|
||||
}
|
||||
```
|
||||
|
||||
### Get Instance Logs
|
||||
|
||||
Retrieve instance logs.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances/{name}/logs
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
- `lines`: Number of lines to return (default: all lines; use `-1` to explicitly request all lines)
|
||||
|
||||
**Response:** Plain text log output
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl "http://localhost:8080/api/v1/instances/my-instance/logs?lines=100"
|
||||
```
|
||||
|
||||
### Proxy to Instance
|
||||
|
||||
Proxy HTTP requests directly to the llama-server instance.
|
||||
|
||||
```http
|
||||
GET /api/v1/instances/{name}/proxy/*
|
||||
POST /api/v1/instances/{name}/proxy/*
|
||||
```
|
||||
|
||||
This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the `/api/v1/instances/{name}/proxy` prefix and forwards the remaining path to the instance.
|
||||
|
||||
**Example - Check Instance Health:**
|
||||
```bash
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model/proxy/health
|
||||
```
|
||||
|
||||
This forwards the request to `http://instance-host:instance-port/health` on the actual llama-server instance.
|
||||
|
||||
**Error Responses:**
|
||||
- `503 Service Unavailable`: Instance is not running
|
||||
|
||||
## OpenAI-Compatible API
|
||||
|
||||
Llamactl provides OpenAI-compatible endpoints for inference operations.
|
||||
|
||||
### List Models
|
||||
|
||||
List all instances in OpenAI-compatible format.
|
||||
|
||||
```http
|
||||
GET /v1/models
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": "llama2-7b",
|
||||
"object": "model",
|
||||
"created": 1705312200,
|
||||
"owned_by": "llamactl"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Chat Completions, Completions, Embeddings
|
||||
|
||||
All OpenAI-compatible inference endpoints are available:
|
||||
|
||||
```http
|
||||
POST /v1/chat/completions
|
||||
POST /v1/completions
|
||||
POST /v1/embeddings
|
||||
POST /v1/rerank
|
||||
POST /v1/reranking
|
||||
```
|
||||
|
||||
**Request Body:** Standard OpenAI format with `model` field specifying the instance name
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"model": "llama2-7b",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, how are you?"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The server routes requests to the appropriate instance based on the `model` field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see [Managing Instances](managing-instances.md).
|
||||
|
||||
**Error Responses:**
|
||||
- `400 Bad Request`: Invalid request body or missing instance name
|
||||
- `503 Service Unavailable`: Instance is not running and on-demand start is disabled
|
||||
- `409 Conflict`: Cannot start instance due to maximum instances limit
|
||||
|
||||
## Instance Status Values
|
||||
|
||||
Instances can have the following status values:
|
||||
- `stopped`: Instance is not running
|
||||
- `running`: Instance is running and ready to accept requests
|
||||
- `failed`: Instance failed to start or crashed
|
||||
|
||||
## Error Responses
|
||||
|
||||
All endpoints may return error responses in the following format:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": "Error message description"
|
||||
}
|
||||
```
|
||||
|
||||
### Common HTTP Status Codes
|
||||
|
||||
- `200`: Success
|
||||
- `201`: Created
|
||||
- `204`: No Content (successful deletion)
|
||||
- `400`: Bad Request (invalid parameters or request body)
|
||||
- `401`: Unauthorized (missing or invalid API key)
|
||||
- `403`: Forbidden (insufficient permissions)
|
||||
- `404`: Not Found (instance not found)
|
||||
- `409`: Conflict (instance already exists, max instances reached)
|
||||
- `500`: Internal Server Error
|
||||
- `503`: Service Unavailable (instance not running)
|
||||
|
||||
## Examples
|
||||
|
||||
### Complete Instance Lifecycle
|
||||
|
||||
```bash
|
||||
# Create and start instance
|
||||
curl -X POST http://localhost:8080/api/v1/instances/my-model \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-api-key" \
|
||||
-d '{
|
||||
"backend_type": "llama_cpp",
|
||||
"backend_options": {
|
||||
"model": "/models/llama-2-7b.gguf",
|
||||
"gpu_layers": 32
|
||||
},
|
||||
"environment": {
|
||||
"CUDA_VISIBLE_DEVICES": "0",
|
||||
"OMP_NUM_THREADS": "8"
|
||||
}
|
||||
}'
|
||||
|
||||
# Check instance status
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model
|
||||
|
||||
# Get instance logs
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
"http://localhost:8080/api/v1/instances/my-model/logs?lines=50"
|
||||
|
||||
# Use OpenAI-compatible chat completions
|
||||
curl -X POST http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-inference-api-key" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hello!"}
|
||||
],
|
||||
"max_tokens": 100
|
||||
}'
|
||||
|
||||
# Stop instance
|
||||
curl -X POST -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model/stop
|
||||
|
||||
# Delete instance
|
||||
curl -X DELETE -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/my-model
|
||||
```
|
||||
|
||||
### Remote Node Instance Example
|
||||
|
||||
```bash
|
||||
# Create instance on specific remote node
|
||||
curl -X POST http://localhost:8080/api/v1/instances/remote-model \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-api-key" \
|
||||
-d '{
|
||||
"backend_type": "llama_cpp",
|
||||
"backend_options": {
|
||||
"model": "/models/llama-2-7b.gguf",
|
||||
"gpu_layers": 32
|
||||
},
|
||||
"nodes": ["worker1"]
|
||||
}'
|
||||
|
||||
# Check status of remote instance
|
||||
curl -H "Authorization: Bearer your-api-key" \
|
||||
http://localhost:8080/api/v1/instances/remote-model
|
||||
|
||||
# Use remote instance with OpenAI-compatible API
|
||||
curl -X POST http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-inference-api-key" \
|
||||
-d '{
|
||||
"model": "remote-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hello from remote node!"}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
### Using the Proxy Endpoint
|
||||
|
||||
You can also directly proxy requests to the llama-server instance:
|
||||
|
||||
```bash
|
||||
# Direct proxy to instance (bypasses OpenAI compatibility layer)
|
||||
curl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer your-api-key" \
|
||||
-d '{
|
||||
"prompt": "Hello, world!",
|
||||
"n_predict": 50
|
||||
}'
|
||||
```
|
||||
|
||||
## Backend-Specific Endpoints
|
||||
|
||||
### Parse Commands
|
||||
|
||||
Llamactl provides endpoints to parse command strings from different backends into instance configuration options.
|
||||
|
||||
#### Parse Llama.cpp Command
|
||||
|
||||
Parse a llama-server command string into instance options.
|
||||
|
||||
```http
|
||||
POST /api/v1/backends/llama-cpp/parse-command
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
```json
|
||||
{
|
||||
"command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"backend_type": "llama_cpp",
|
||||
"llama_server_options": {
|
||||
"model": "/path/to/model.gguf",
|
||||
"ctx_size": 2048,
|
||||
"port": 8080
|
||||
}
|
||||
}
|
||||
```
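Assuming the standard `/api/v1` prefix and a management key, the endpoint can be exercised with curl as a quick sketch:

```bash
curl -X POST http://localhost:8080/api/v1/backends/llama-cpp/parse-command \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <management-api-key>" \
  -d '{
    "command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
  }'
```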
|
||||
|
||||
#### Parse MLX-LM Command
|
||||
|
||||
Parse an MLX-LM server command string into instance options.
|
||||
|
||||
```http
|
||||
POST /api/v1/backends/mlx/parse-command
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
```json
|
||||
{
|
||||
"command": "mlx_lm.server --model /path/to/model --port 8080"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"backend_type": "mlx_lm",
|
||||
"mlx_server_options": {
|
||||
"model": "/path/to/model",
|
||||
"port": 8080
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Parse vLLM Command
|
||||
|
||||
Parse a vLLM serve command string into instance options.
|
||||
|
||||
```http
|
||||
POST /api/v1/backends/vllm/parse-command
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
```json
|
||||
{
|
||||
"command": "vllm serve /path/to/model --port 8080"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"backend_type": "vllm",
|
||||
"vllm_server_options": {
|
||||
"model": "/path/to/model",
|
||||
"port": 8080
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Error Responses for Parse Commands:**
|
||||
- `400 Bad Request`: Invalid request body, empty command, or parse error
|
||||
- `500 Internal Server Error`: Encoding error
|
||||
|
||||
## Auto-Generated Documentation
|
||||
|
||||
The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:
|
||||
|
||||
1. Install the swag tool: `go install github.com/swaggo/swag/cmd/swag@latest`
|
||||
2. Generate docs: `swag init -g cmd/server/main.go -o apidocs`
|
||||
|
||||
## Swagger Documentation
|
||||
|
||||
If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:
|
||||
|
||||
```
|
||||
http://localhost:8080/swagger/
|
||||
```
|
||||
|
||||
This provides a complete interactive interface for testing all API endpoints.
|
||||
23
mkdocs.yml
@@ -25,8 +25,8 @@ theme:
|
||||
name: Switch to light mode
|
||||
features:
|
||||
- navigation.tabs
|
||||
- navigation.sections
|
||||
- navigation.expand
|
||||
- navigation.tabs.sticky
|
||||
- toc.integrate
|
||||
- navigation.top
|
||||
- search.highlight
|
||||
- search.share
|
||||
@@ -49,14 +49,12 @@ markdown_extensions:
|
||||
|
||||
nav:
|
||||
- Home: index.md
|
||||
- Getting Started:
|
||||
- Installation: getting-started/installation.md
|
||||
- Quick Start: getting-started/quick-start.md
|
||||
- Configuration: getting-started/configuration.md
|
||||
- User Guide:
|
||||
- Managing Instances: user-guide/managing-instances.md
|
||||
- API Reference: user-guide/api-reference.md
|
||||
- Troubleshooting: user-guide/troubleshooting.md
|
||||
- Installation: installation.md
|
||||
- Quick Start: quick-start.md
|
||||
- Configuration: configuration.md
|
||||
- Managing Instances: managing-instances.md
|
||||
- API Reference: api-reference.md
|
||||
- Troubleshooting: troubleshooting.md
|
||||
|
||||
plugins:
|
||||
- search
|
||||
@@ -66,6 +64,8 @@ plugins:
|
||||
css_dir: css
|
||||
javascript_dir: js
|
||||
canonical_version: null
|
||||
- neoteroi.mkdocsoad:
|
||||
use_pymdownx: true
|
||||
|
||||
hooks:
|
||||
- docs/readme_sync.py
|
||||
@@ -78,3 +78,6 @@ extra:
|
||||
social:
|
||||
- icon: fontawesome/brands/github
|
||||
link: https://github.com/lordmathis/llamactl
|
||||
|
||||
extra_css:
|
||||
- css/css-v1.1.3.css
|
||||
|
||||
@@ -44,7 +44,7 @@ func (h *Handler) stripLlamaCppPrefix(r *http.Request, instName string) {
|
||||
// LlamaCppUIProxy godoc
|
||||
// @Summary Proxy requests to llama.cpp UI for the instance
|
||||
// @Description Proxies requests to the llama.cpp UI for the specified instance
|
||||
// @Tags backends
|
||||
// @Tags Llama.cpp
|
||||
// @Security ApiKeyAuth
|
||||
// @Produce html
|
||||
// @Param name query string true "Instance Name"
|
||||
@@ -83,14 +83,24 @@ func (h *Handler) LlamaCppUIProxy() http.HandlerFunc {
|
||||
// LlamaCppProxy godoc
|
||||
// @Summary Proxy requests to llama.cpp server instance
|
||||
// @Description Proxies requests to the specified llama.cpp server instance, starting it on-demand if configured
|
||||
// @Tags backends
|
||||
// @Tags Llama.cpp
|
||||
// @Security ApiKeyAuth
|
||||
// @Produce json
|
||||
// @Param name query string true "Instance Name"
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} map[string]any "Proxied response"
|
||||
// @Failure 400 {string} string "Invalid instance"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /llama-cpp/{name}/* [post]
|
||||
// @Router /llama-cpp/{name}/props [get]
|
||||
// @Router /llama-cpp/{name}/slots [get]
|
||||
// @Router /llama-cpp/{name}/apply-template [post]
|
||||
// @Router /llama-cpp/{name}/completion [post]
|
||||
// @Router /llama-cpp/{name}/detokenize [post]
|
||||
// @Router /llama-cpp/{name}/embeddings [post]
|
||||
// @Router /llama-cpp/{name}/infill [post]
|
||||
// @Router /llama-cpp/{name}/metrics [post]
|
||||
// @Router /llama-cpp/{name}/props [post]
|
||||
// @Router /llama-cpp/{name}/reranking [post]
|
||||
// @Router /llama-cpp/{name}/tokenize [post]
|
||||
func (h *Handler) LlamaCppProxy() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -150,7 +160,7 @@ func parseHelper(w http.ResponseWriter, r *http.Request, backend interface {
|
||||
// ParseLlamaCommand godoc
|
||||
// @Summary Parse llama-server command
|
||||
// @Description Parses a llama-server command string into instance options
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
@@ -158,7 +168,7 @@ func parseHelper(w http.ResponseWriter, r *http.Request, backend interface {
|
||||
// @Success 200 {object} instance.Options "Parsed options"
|
||||
// @Failure 400 {object} map[string]string "Invalid request or command"
|
||||
// @Failure 500 {object} map[string]string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/parse-command [post]
|
||||
// @Router /api/v1/backends/llama-cpp/parse-command [post]
|
||||
func (h *Handler) ParseLlamaCommand() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
parsedOptions, ok := parseHelper(w, r, &backends.LlamaServerOptions{})
|
||||
@@ -180,14 +190,14 @@ func (h *Handler) ParseLlamaCommand() http.HandlerFunc {
|
||||
// ParseMlxCommand godoc
|
||||
// @Summary Parse mlx_lm.server command
|
||||
// @Description Parses MLX-LM server command string into instance options
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param request body ParseCommandRequest true "Command to parse"
|
||||
// @Success 200 {object} instance.Options "Parsed options"
|
||||
// @Failure 400 {object} map[string]string "Invalid request or command"
|
||||
// @Router /backends/mlx/parse-command [post]
|
||||
// @Router /api/v1/backends/mlx/parse-command [post]
|
||||
func (h *Handler) ParseMlxCommand() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
parsedOptions, ok := parseHelper(w, r, &backends.MlxServerOptions{})
|
||||
@@ -209,14 +219,14 @@ func (h *Handler) ParseMlxCommand() http.HandlerFunc {
|
||||
// ParseVllmCommand godoc
|
||||
// @Summary Parse vllm serve command
|
||||
// @Description Parses a vLLM serve command string into instance options
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param request body ParseCommandRequest true "Command to parse"
|
||||
// @Success 200 {object} instance.Options "Parsed options"
|
||||
// @Failure 400 {object} map[string]string "Invalid request or command"
|
||||
// @Router /backends/vllm/parse-command [post]
|
||||
// @Router /api/v1/backends/vllm/parse-command [post]
|
||||
func (h *Handler) ParseVllmCommand() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
parsedOptions, ok := parseHelper(w, r, &backends.VllmServerOptions{})
|
||||
@@ -251,12 +261,12 @@ func (h *Handler) executeLlamaServerCommand(flag, errorMsg string) http.HandlerF
|
||||
// LlamaServerHelpHandler godoc
|
||||
// @Summary Get help for llama server
|
||||
// @Description Returns the help text for the llama server command
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "Help text"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/help [get]
|
||||
// @Router /api/v1/backends/llama-cpp/help [get]
|
||||
func (h *Handler) LlamaServerHelpHandler() http.HandlerFunc {
|
||||
return h.executeLlamaServerCommand("--help", "Failed to get help")
|
||||
}
|
||||
@@ -264,12 +274,12 @@ func (h *Handler) LlamaServerHelpHandler() http.HandlerFunc {
|
||||
// LlamaServerVersionHandler godoc
|
||||
// @Summary Get version of llama server
|
||||
// @Description Returns the version of the llama server command
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "Version information"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/version [get]
|
||||
// @Router /api/v1/backends/llama-cpp/version [get]
|
||||
func (h *Handler) LlamaServerVersionHandler() http.HandlerFunc {
|
||||
return h.executeLlamaServerCommand("--version", "Failed to get version")
|
||||
}
|
||||
@@ -277,12 +287,12 @@ func (h *Handler) LlamaServerVersionHandler() http.HandlerFunc {
|
||||
// LlamaServerListDevicesHandler godoc
|
||||
// @Summary List available devices for llama server
|
||||
// @Description Returns a list of available devices for the llama server
|
||||
// @Tags backends
|
||||
// @Tags Backends
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "List of devices"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /backends/llama-cpp/devices [get]
|
||||
// @Router /api/v1/backends/llama-cpp/devices [get]
|
||||
func (h *Handler) LlamaServerListDevicesHandler() http.HandlerFunc {
|
||||
return h.executeLlamaServerCommand("--list-devices", "Failed to list devices")
|
||||
}
|
||||
|
||||
@@ -16,12 +16,12 @@ import (
|
||||
// ListInstances godoc
|
||||
// @Summary List all instances
|
||||
// @Description Returns a list of all instances managed by the server
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Success 200 {array} instance.Process "List of instances"
|
||||
// @Success 200 {array} instance.Instance "List of instances"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances [get]
|
||||
// @Router /api/v1/instances [get]
|
||||
func (h *Handler) ListInstances() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
instances, err := h.InstanceManager.ListInstances()
|
||||
@@ -37,16 +37,16 @@ func (h *Handler) ListInstances() http.HandlerFunc {
|
||||
// CreateInstance godoc
|
||||
// @Summary Create and start a new instance
|
||||
// @Description Creates a new instance with the provided configuration options
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Param options body instance.Options true "Instance configuration options"
|
||||
// @Success 201 {object} instance.Process "Created instance details"
|
||||
// @Success 201 {object} instance.Instance "Created instance details"
|
||||
// @Failure 400 {string} string "Invalid request body"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [post]
|
||||
// @Router /api/v1/instances/{name} [post]
|
||||
func (h *Handler) CreateInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -75,14 +75,14 @@ func (h *Handler) CreateInstance() http.HandlerFunc {
|
||||
// GetInstance godoc
|
||||
// @Summary Get details of a specific instance
|
||||
// @Description Returns the details of a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Instance details"
|
||||
// @Success 200 {object} instance.Instance "Instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [get]
|
||||
// @Router /api/v1/instances/{name} [get]
|
||||
func (h *Handler) GetInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -105,16 +105,16 @@ func (h *Handler) GetInstance() http.HandlerFunc {
|
||||
// UpdateInstance godoc
|
||||
// @Summary Update an instance's configuration
|
||||
// @Description Updates the configuration of a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Param options body instance.Options true "Instance configuration options"
|
||||
// @Success 200 {object} instance.Process "Updated instance details"
|
||||
// @Success 200 {object} instance.Instance "Updated instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [put]
|
||||
// @Router /api/v1/instances/{name} [put]
|
||||
func (h *Handler) UpdateInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -143,14 +143,14 @@ func (h *Handler) UpdateInstance() http.HandlerFunc {
|
||||
// StartInstance godoc
|
||||
// @Summary Start a stopped instance
|
||||
// @Description Starts a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Started instance details"
|
||||
// @Success 200 {object} instance.Instance "Started instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/start [post]
|
||||
// @Router /api/v1/instances/{name}/start [post]
|
||||
func (h *Handler) StartInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -179,14 +179,14 @@ func (h *Handler) StartInstance() http.HandlerFunc {
|
||||
// StopInstance godoc
|
||||
// @Summary Stop a running instance
|
||||
// @Description Stops a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Stopped instance details"
|
||||
// @Success 200 {object} instance.Instance "Stopped instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/stop [post]
|
||||
// @Router /api/v1/instances/{name}/stop [post]
|
||||
func (h *Handler) StopInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -209,14 +209,14 @@ func (h *Handler) StopInstance() http.HandlerFunc {
|
||||
// RestartInstance godoc
|
||||
// @Summary Restart a running instance
|
||||
// @Description Restarts a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 {object} instance.Process "Restarted instance details"
|
||||
// @Success 200 {object} instance.Instance "Restarted instance details"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/restart [post]
|
||||
// @Router /api/v1/instances/{name}/restart [post]
|
||||
func (h *Handler) RestartInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -239,13 +239,13 @@ func (h *Handler) RestartInstance() http.HandlerFunc {
|
||||
// DeleteInstance godoc
|
||||
// @Summary Delete an instance
|
||||
// @Description Stops and removes a specific instance by name
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 204 "No Content"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name} [delete]
|
||||
// @Router /api/v1/instances/{name} [delete]
|
||||
func (h *Handler) DeleteInstance() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -267,7 +267,7 @@ func (h *Handler) DeleteInstance() http.HandlerFunc {
|
||||
// GetInstanceLogs godoc
|
||||
// @Summary Get logs from a specific instance
|
||||
// @Description Returns the logs from a specific instance by name with optional line limit
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Param lines query string false "Number of lines to retrieve (default: all lines)"
|
||||
@@ -275,7 +275,7 @@ func (h *Handler) DeleteInstance() http.HandlerFunc {
|
||||
// @Success 200 {string} string "Instance logs"
|
||||
// @Failure 400 {string} string "Invalid name format or lines parameter"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /instances/{name}/logs [get]
|
||||
// @Router /api/v1/instances/{name}/logs [get]
|
||||
func (h *Handler) GetInstanceLogs() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
@@ -310,15 +310,15 @@ func (h *Handler) GetInstanceLogs() http.HandlerFunc {
|
||||
// InstanceProxy godoc
|
||||
// @Summary Proxy requests to a specific instance, does not autostart instance if stopped
|
||||
// @Description Forwards HTTP requests to the llama-server instance running on a specific port
|
||||
// @Tags instances
|
||||
// @Tags Instances
|
||||
// @Security ApiKeyAuth
|
||||
// @Param name path string true "Instance Name"
|
||||
// @Success 200 "Request successfully proxied to instance"
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Failure 503 {string} string "Instance is not running"
|
||||
// @Router /instances/{name}/proxy [get]
|
||||
// @Router /instances/{name}/proxy [post]
|
||||
// @Router /api/v1/instances/{name}/proxy [get]
|
||||
// @Router /api/v1/instances/{name}/proxy [post]
|
||||
func (h *Handler) InstanceProxy() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
inst, err := h.getInstance(r)
|
||||
|
||||
@@ -14,12 +14,12 @@ type NodeResponse struct {
|
||||
// ListNodes godoc
|
||||
// @Summary List all configured nodes
|
||||
// @Description Returns a map of all nodes configured in the server (node name -> node config)
|
||||
// @Tags nodes
|
||||
// @Tags Nodes
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Success 200 {object} map[string]NodeResponse "Map of nodes"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /nodes [get]
|
||||
// @Router /api/v1/nodes [get]
|
||||
func (h *Handler) ListNodes() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
// Convert to sanitized response format (map of name -> NodeResponse)
|
||||
@@ -37,7 +37,7 @@ func (h *Handler) ListNodes() http.HandlerFunc {
|
||||
// GetNode godoc
|
||||
// @Summary Get details of a specific node
|
||||
// @Description Returns the details of a specific node by name
|
||||
// @Tags nodes
|
||||
// @Tags Nodes
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Param name path string true "Node Name"
|
||||
@@ -45,7 +45,7 @@ func (h *Handler) ListNodes() http.HandlerFunc {
|
||||
// @Failure 400 {string} string "Invalid name format"
|
||||
// @Failure 404 {string} string "Node not found"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /nodes/{name} [get]
|
||||
// @Router /api/v1/nodes/{name} [get]
|
||||
func (h *Handler) GetNode() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
|
||||
@@ -25,7 +25,7 @@ type OpenAIInstance struct {
|
||||
// OpenAIListInstances godoc
|
||||
// @Summary List instances in OpenAI-compatible format
|
||||
// @Description Returns a list of instances in a format compatible with OpenAI API
|
||||
// @Tags openai
|
||||
// @Tags OpenAI
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces json
|
||||
// @Success 200 {object} OpenAIListInstancesResponse "List of OpenAI-compatible instances"
|
||||
@@ -61,7 +61,7 @@ func (h *Handler) OpenAIListInstances() http.HandlerFunc {
|
||||
// OpenAIProxy godoc
|
||||
// @Summary OpenAI-compatible proxy endpoint
|
||||
// @Description Handles all POST requests to /v1/*, routing to the appropriate instance based on the request body. Requires API key authentication via the `Authorization` header.
|
||||
// @Tags openai
|
||||
// @Tags OpenAI
|
||||
// @Security ApiKeyAuth
|
||||
// @Accept json
|
||||
// @Produces json
|
||||
|
||||
@@ -8,12 +8,12 @@ import (
|
||||
// VersionHandler godoc
|
||||
// @Summary Get llamactl version
|
||||
// @Description Returns the version of the llamactl command
|
||||
// @Tags version
|
||||
// @Tags System
|
||||
// @Security ApiKeyAuth
|
||||
// @Produces text/plain
|
||||
// @Success 200 {string} string "Version information"
|
||||
// @Failure 500 {string} string "Internal Server Error"
|
||||
// @Router /version [get]
|
||||
// @Router /api/v1/version [get]
|
||||
func (h *Handler) VersionHandler() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
versionInfo := fmt.Sprintf("Version: %s\nCommit: %s\nBuild Time: %s\n", h.cfg.Version, h.cfg.CommitHash, h.cfg.BuildTime)
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"github.com/go-chi/cors"
|
||||
httpSwagger "github.com/swaggo/http-swagger"
|
||||
|
||||
_ "llamactl/apidocs"
|
||||
_ "llamactl/docs"
|
||||
"llamactl/webui"
|
||||
)
|
||||
|
||||
|
||||