Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-06 17:14:28 +00:00)

Compare commits (28 commits):
1892dc8315, 997bd1b063, fa43f9e967, db9eebeb8b, bd062f8ca0, 8ebdb1a183, 7272212081,
035e184789, d15976e7aa, 4fa75d9801, 0e1bc8a352, 1e5e86d2c3, 25d3d70707, e54cfd006d,
7d39e7ee86, 222d913b4a, 03a7a5d139, e50660c379, 5906d89f8d, cb2d95139f, 889a8707e7,
070c91787d, 169ee422ec, bb0176b7f5, 291ec7995f, b940b38e46, 92cb57e816, 0ecd55c354
.dockerignore (new file, 45 lines)
@@ -0,0 +1,45 @@
# Git and version control
.git/
.gitignore

# Documentation
*.md
docs/

# Development files
.vscode/
.idea/

# Build artifacts
webui/node_modules/
webui/dist/
webui/.next/
*.log
*.tmp

# Data directories
data/
models/
logs/

# Test files
*_test.go
**/*_test.go

# CI/CD
.github/

# Local configuration
llamactl.yaml
config.yaml
.env
.env.local

# OS files
.DS_Store
Thumbs.db

# Backup files
*.bak
*.backup
*~
README.md (30 changed lines)
@@ -95,7 +95,30 @@ sudo mv llamactl /usr/local/bin/
 # Windows - Download from releases page
 ```

-### Option 2: Build from Source
+### Option 2: Docker (No local backend installation required)
+
+```bash
+# Clone repository and build Docker images
+git clone https://github.com/lordmathis/llamactl.git
+cd llamactl
+mkdir -p data/llamacpp data/vllm models
+
+# Build and start llamactl with llama.cpp CUDA backend
+docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d
+
+# Build and start llamactl with vLLM CUDA backend
+docker-compose -f docker/docker-compose.yml up llamactl-vllm -d
+
+# Build from source using multi-stage build
+docker build -f docker/Dockerfile.source -t llamactl:source .
+```
+
+**Features:** CUDA support, automatic latest release installation, no backend dependencies.
+**Note:** Dockerfiles are configured for CUDA. Adapt base images for other platforms (CPU, ROCm, etc.).
+
+For detailed Docker setup and configuration, see the [Installation Guide](docs/getting-started/installation.md).
+
+### Option 3: Build from Source
 Requires Go 1.24+ and Node.js 22+
 ```bash
 git clone https://github.com/lordmathis/llamactl.git
@@ -147,9 +170,9 @@ pip install vllm
 # Or use Docker - no local installation required
 ```

-## Docker Support
+## Backend Docker Support

-llamactl supports running backends in Docker containers - perfect for production deployments without local backend installation. Simply enable Docker in your configuration:
+llamactl can run backends in Docker containers:

 ```yaml
 backends:
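The YAML block in this hunk is cut off by the diff context; the full `backends:` section appears in the configuration hunks further down. As a hedged sketch based on those hunks, enabling the Docker runtime for the llama.cpp backend appears to amount to flipping the `enabled` flag while keeping the documented defaults:

```yaml
backends:
  llama-cpp:
    command: "llama-server"
    docker:
      enabled: true   # documented default is false; true runs the backend via `docker run`
      image: "ghcr.io/ggml-org/llama.cpp:server"
      args: ["run", "--rm", "--network", "host", "--gpus", "all"]
```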
@@ -174,6 +197,7 @@ server:
   host: "0.0.0.0" # Server host to bind to
   port: 8080 # Server port to bind to
   allowed_origins: ["*"] # Allowed CORS origins (default: all)
+  allowed_headers: ["*"] # Allowed CORS headers (default: all)
   enable_swagger: false # Enable Swagger UI for API docs

 backends:
docker/Dockerfile.llamacpp (new file, 23 lines)
@@ -0,0 +1,23 @@
FROM ghcr.io/ggml-org/llama.cpp:server-cuda

# Install curl for downloading llamactl
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*

# Download and install the latest llamactl release
RUN LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') && \
    curl -L "https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-linux-amd64.tar.gz" | tar -xz && \
    mv llamactl /usr/local/bin/ && \
    chmod +x /usr/local/bin/llamactl

# Set working directory
RUN mkdir -p /data
WORKDIR /data

# Expose the default llamactl port
EXPOSE 8080

ENV LLAMACTL_LLAMACPP_COMMAND=/app/llama-server
ENV LD_LIBRARY_PATH="/app:/usr/local/lib:/usr/lib"

# Set llamactl as the entrypoint
ENTRYPOINT ["llamactl"]
docker/Dockerfile.source (new file, 64 lines)
@@ -0,0 +1,64 @@
# WebUI build stage
FROM node:20-alpine AS webui-builder

WORKDIR /webui

# Copy webui package files
COPY webui/package*.json ./

# Install dependencies
RUN npm ci

# Copy webui source
COPY webui/ ./

# Build webui
RUN npm run build

# Go build stage
FROM golang:1.24-alpine AS builder

# Install build dependencies
RUN apk add --no-cache git ca-certificates

# Set working directory
WORKDIR /build

# Copy go mod files
COPY go.mod go.sum ./

# Download dependencies
RUN go mod download

# Copy source code
COPY cmd/ ./cmd/
COPY pkg/ ./pkg/
COPY apidocs/ ./apidocs/
COPY webui/webui.go ./webui/

# Copy built webui from webui-builder
COPY --from=webui-builder /webui/dist ./webui/dist

# Build the application
RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -ldflags="-w -s" -o llamactl ./cmd/server

# Final stage
FROM alpine:latest

# Install runtime dependencies
RUN apk --no-cache add ca-certificates

# Create data directory
RUN mkdir -p /data

# Set working directory
WORKDIR /data

# Copy binary from builder
COPY --from=builder /build/llamactl /usr/local/bin/llamactl

# Expose the default port
EXPOSE 8080

# Set llamactl as the entrypoint
ENTRYPOINT ["llamactl"]
docker/Dockerfile.vllm (new file, 20 lines)
@@ -0,0 +1,20 @@
FROM vllm/vllm-openai:latest

# Install curl for downloading llamactl
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*

# Download and install the latest llamactl release
RUN LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') && \
    curl -L "https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-linux-amd64.tar.gz" | tar -xz && \
    mv llamactl /usr/local/bin/ && \
    chmod +x /usr/local/bin/llamactl

# Set working directory
RUN mkdir -p /data
WORKDIR /data

# Expose the default llamactl port
EXPOSE 8080

# Set llamactl as the entrypoint
ENTRYPOINT ["llamactl"]
docker/docker-compose.yml (new file, 56 lines)
@@ -0,0 +1,56 @@
version: '3.8'

services:
  llamactl-llamacpp:
    build:
      context: ..
      dockerfile: docker/Dockerfile.llamacpp
    image: llamactl:llamacpp-cuda
    container_name: llamactl-llamacpp
    ports:
      - "8080:8080"
    volumes:
      - ./data/llamacpp:/data
      - ./models:/models # Mount models directory
      - ~/.cache/llama.cpp:/root/.cache/llama.cpp # Llama.cpp cache
    environment:
      # Set data directory for persistence
      - LLAMACTL_DATA_DIR=/data
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_LLAMACPP_DOCKER_ENABLED=false
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped

  llamactl-vllm:
    build:
      context: ..
      dockerfile: docker/Dockerfile.vllm
    image: llamactl:vllm-cuda
    container_name: llamactl-vllm
    ports:
      - "8081:8080" # Use different port to avoid conflicts
    volumes:
      - ./data/vllm:/data
      - ./models:/models # Mount models directory
      - ~/.cache/huggingface:/root/.cache/huggingface # HuggingFace cache
    environment:
      # Set data directory for persistence
      - LLAMACTL_DATA_DIR=/data
      # Enable Docker mode for nested containers (if needed)
      - LLAMACTL_VLLM_DOCKER_ENABLED=false
      # vLLM specific environment variables
      - CUDA_VISIBLE_DEVICES=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
Configuration guide
@@ -17,33 +17,37 @@ server:
   host: "0.0.0.0" # Server host to bind to
   port: 8080 # Server port to bind to
   allowed_origins: ["*"] # Allowed CORS origins (default: all)
+  allowed_headers: ["*"] # Allowed CORS headers (default: all)
   enable_swagger: false # Enable Swagger UI for API docs

 backends:
   llama-cpp:
     command: "llama-server"
     args: []
     environment: {} # Environment variables for the backend process
     docker:
       enabled: false
       image: "ghcr.io/ggml-org/llama.cpp:server"
       args: ["run", "--rm", "--network", "host", "--gpus", "all"]
       environment: {}
+    response_headers: {} # Additional response headers to send with responses

   vllm:
     command: "vllm"
     args: ["serve"]
     environment: {} # Environment variables for the backend process
     docker:
       enabled: false
       image: "vllm/vllm-openai:latest"
       args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
       environment: {}
+    response_headers: {} # Additional response headers to send with responses

   mlx:
     command: "mlx_lm.server"
     args: []
     environment: {} # Environment variables for the backend process
+    response_headers: {} # Additional response headers to send with responses

 instances:
   port_range: [8000, 9000] # Port range for instances

@@ -101,6 +105,7 @@ server:
   host: "0.0.0.0" # Server host to bind to (default: "0.0.0.0")
   port: 8080 # Server port to bind to (default: 8080)
   allowed_origins: ["*"] # CORS allowed origins (default: ["*"])
+  allowed_headers: ["*"] # CORS allowed headers (default: ["*"])
   enable_swagger: false # Enable Swagger UI (default: false)
 ```

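Since `allowed_headers` now defaults to `["*"]`, a locked-down variant of the server block is worth sketching. This is only an illustration: the header names are the examples given in the Go field comment (Accept, Authorization, Content-Type, X-CSRF-Token) and the origin is a placeholder, not values taken from the docs:

```yaml
server:
  host: "0.0.0.0"
  port: 8080
  allowed_origins: ["https://dashboard.example.com"]   # placeholder origin
  allowed_headers: ["Accept", "Authorization", "Content-Type", "X-CSRF-Token"]
  enable_swagger: false
```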
@@ -116,40 +121,46 @@ backends:
   llama-cpp:
     command: "llama-server"
     args: []
     environment: {} # Environment variables for the backend process
     docker:
       enabled: false # Enable Docker runtime (default: false)
       image: "ghcr.io/ggml-org/llama.cpp:server"
       args: ["run", "--rm", "--network", "host", "--gpus", "all"]
       environment: {}
+    response_headers: {} # Additional response headers to send with responses

   vllm:
     command: "vllm"
     args: ["serve"]
     environment: {} # Environment variables for the backend process
     docker:
-      enabled: false
+      enabled: false # Enable Docker runtime (default: false)
       image: "vllm/vllm-openai:latest"
       args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
       environment: {}
+    response_headers: {} # Additional response headers to send with responses

   mlx:
     command: "mlx_lm.server"
     args: []
     environment: {} # Environment variables for the backend process
     # MLX does not support Docker
+    response_headers: {} # Additional response headers to send with responses
 ```

 **Backend Configuration Fields:**
 - `command`: Executable name/path for the backend
 - `args`: Default arguments prepended to all instances
 - `environment`: Environment variables for the backend process (optional)
+- `response_headers`: Additional response headers to send with responses (optional)
 - `docker`: Docker-specific configuration (optional)
   - `enabled`: Boolean flag to enable Docker runtime
   - `image`: Docker image to use
   - `args`: Additional arguments passed to `docker run`
   - `environment`: Environment variables for the container (optional)
+
+> If llamactl is behind an NGINX proxy, `X-Accel-Buffering: no` response header may be required for NGINX to properly stream the responses without buffering.
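Tying that note to the new field: a minimal sketch of a per-backend `response_headers` entry. Only the `X-Accel-Buffering: no` value comes from the note itself; expanding the `{}` placeholder into a block map is ordinary YAML:

```yaml
backends:
  llama-cpp:
    command: "llama-server"
    response_headers:
      X-Accel-Buffering: "no"   # keeps NGINX from buffering streamed responses
```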

 **Environment Variables:**

 **LlamaCpp Backend:**

@@ -160,6 +171,7 @@ backends:
 - `LLAMACTL_LLAMACPP_DOCKER_IMAGE` - Docker image to use
 - `LLAMACTL_LLAMACPP_DOCKER_ARGS` - Space-separated Docker arguments
 - `LLAMACTL_LLAMACPP_DOCKER_ENV` - Docker environment variables in format "KEY1=value1,KEY2=value2"
+- `LLAMACTL_LLAMACPP_RESPONSE_HEADERS` - Response headers in format "KEY1=value1;KEY2=value2"

 **VLLM Backend:**
 - `LLAMACTL_VLLM_COMMAND` - VLLM executable command

@@ -169,11 +181,13 @@ backends:
 - `LLAMACTL_VLLM_DOCKER_IMAGE` - Docker image to use
 - `LLAMACTL_VLLM_DOCKER_ARGS` - Space-separated Docker arguments
 - `LLAMACTL_VLLM_DOCKER_ENV` - Docker environment variables in format "KEY1=value1,KEY2=value2"
+- `LLAMACTL_VLLM_RESPONSE_HEADERS` - Response headers in format "KEY1=value1;KEY2=value2"

 **MLX Backend:**
 - `LLAMACTL_MLX_COMMAND` - MLX executable command
 - `LLAMACTL_MLX_ARGS` - Space-separated default arguments
 - `LLAMACTL_MLX_ENV` - Environment variables in format "KEY1=value1,KEY2=value2"
+- `LLAMACTL_MLX_RESPONSE_HEADERS` - Response headers in format "KEY1=value1;KEY2=value2"

 ### Instance Configuration

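Note the separator difference in the variables added above: the `*_RESPONSE_HEADERS` variables use `;` between `KEY=value` pairs, while the `*_ENV` and `*_DOCKER_ENV` variables use `,`. A hypothetical `environment:` entry for the docker-compose file shown earlier (the specific keys and header values are illustrative, not taken from the docs):

```yaml
environment:
  # comma-separated environment variables for the backend container (illustrative values)
  - LLAMACTL_LLAMACPP_DOCKER_ENV=CUDA_VISIBLE_DEVICES=0,HF_HOME=/models/.hf
  # semicolon-separated response headers (X-Accel-Buffering taken from the NGINX note above)
  - LLAMACTL_LLAMACPP_RESPONSE_HEADERS=X-Accel-Buffering=no;Cache-Control=no-store
```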
Installation guide
@@ -71,7 +71,72 @@ sudo mv llamactl /usr/local/bin/
 # Windows - Download from releases page
 ```

-### Option 2: Build from Source
+### Option 2: Docker
+
+llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.
+
+**Available Dockerfiles (CUDA):**
+- **llamactl with llama.cpp CUDA**: `docker/Dockerfile.llamacpp` (based on `ghcr.io/ggml-org/llama.cpp:server-cuda`)
+- **llamactl with vLLM CUDA**: `docker/Dockerfile.vllm` (based on `vllm/vllm-openai:latest`)
+- **llamactl built from source**: `docker/Dockerfile.source` (multi-stage build with webui)
+
+**Note:** These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at [llama.cpp Docker docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md). For vLLM, check [vLLM docs](https://docs.vllm.ai/en/v0.6.5/serving/deploying_with_docker.html).
+
+#### Using Docker Compose
+
+```bash
+# Clone the repository
+git clone https://github.com/lordmathis/llamactl.git
+cd llamactl
+
+# Create directories for data and models
+mkdir -p data/llamacpp data/vllm models
+
+# Start llamactl with llama.cpp backend
+docker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d
+
+# Or start llamactl with vLLM backend
+docker-compose -f docker/docker-compose.yml up llamactl-vllm -d
+```
+
+Access the dashboard at:
+- llamactl with llama.cpp: http://localhost:8080
+- llamactl with vLLM: http://localhost:8081
+
+#### Using Docker Build and Run
+
+**llamactl with llama.cpp CUDA:**
+```bash
+docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .
+docker run -d \
+  --name llamactl-llamacpp \
+  --gpus all \
+  -p 8080:8080 \
+  -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \
+  llamactl:llamacpp-cuda
+```
+
+**llamactl with vLLM CUDA:**
+```bash
+docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .
+docker run -d \
+  --name llamactl-vllm \
+  --gpus all \
+  -p 8080:8080 \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  llamactl:vllm-cuda
+```
+
+**llamactl built from source:**
+```bash
+docker build -f docker/Dockerfile.source -t llamactl:source .
+docker run -d \
+  --name llamactl \
+  -p 8080:8080 \
+  llamactl:source
+```
+
+### Option 3: Build from Source
+
 Requirements:
 - Go 1.24 or later
@@ -13,10 +13,11 @@ import (

 // BackendSettings contains structured backend configuration
 type BackendSettings struct {
     Command     string            `yaml:"command"`
     Args        []string          `yaml:"args"`
     Environment map[string]string `yaml:"environment,omitempty"`
     Docker      *DockerSettings   `yaml:"docker,omitempty"`
+    ResponseHeaders map[string]string `yaml:"response_headers,omitempty"`
 }

 // DockerSettings contains Docker-specific configuration

@@ -56,8 +57,14 @@ type ServerConfig struct {
     // Allowed origins for CORS (e.g., "http://localhost:3000")
     AllowedOrigins []string `yaml:"allowed_origins"`

+    // Allowed headers for CORS (e.g., "Accept", "Authorization", "Content-Type", "X-CSRF-Token")
+    AllowedHeaders []string `yaml:"allowed_headers"`
+
     // Enable Swagger UI for API documentation
     EnableSwagger bool `yaml:"enable_swagger"`
+
+    // Response headers to send with responses
+    ResponseHeaders map[string]string `yaml:"response_headers,omitempty"`
 }

 // InstancesConfig contains instance management configuration

@@ -132,6 +139,7 @@ func LoadConfig(configPath string) (AppConfig, error) {
             Host:           "0.0.0.0",
             Port:           8080,
             AllowedOrigins: []string{"*"}, // Default to allow all origins
+            AllowedHeaders: []string{"*"}, // Default to allow all headers
             EnableSwagger:  false,
         },
         Backends: BackendConfig{

@@ -337,6 +345,12 @@ func loadEnvVars(cfg *AppConfig) {
         }
         parseEnvVars(llamaDockerEnv, cfg.Backends.LlamaCpp.Docker.Environment)
     }
+    if llamaEnv := os.Getenv("LLAMACTL_LLAMACPP_RESPONSE_HEADERS"); llamaEnv != "" {
+        if cfg.Backends.LlamaCpp.ResponseHeaders == nil {
+            cfg.Backends.LlamaCpp.ResponseHeaders = make(map[string]string)
+        }
+        parseHeaders(llamaEnv, cfg.Backends.LlamaCpp.ResponseHeaders)
+    }

     // vLLM backend
     if vllmCmd := os.Getenv("LLAMACTL_VLLM_COMMAND"); vllmCmd != "" {

@@ -380,6 +394,12 @@ func loadEnvVars(cfg *AppConfig) {
         }
         parseEnvVars(vllmDockerEnv, cfg.Backends.VLLM.Docker.Environment)
     }
+    if llamaEnv := os.Getenv("LLAMACTL_VLLM_RESPONSE_HEADERS"); llamaEnv != "" {
+        if cfg.Backends.VLLM.ResponseHeaders == nil {
+            cfg.Backends.VLLM.ResponseHeaders = make(map[string]string)
+        }
+        parseHeaders(llamaEnv, cfg.Backends.VLLM.ResponseHeaders)
+    }

     // MLX backend
     if mlxCmd := os.Getenv("LLAMACTL_MLX_COMMAND"); mlxCmd != "" {

@@ -394,6 +414,12 @@ func loadEnvVars(cfg *AppConfig) {
         }
         parseEnvVars(mlxEnv, cfg.Backends.MLX.Environment)
     }
+    if llamaEnv := os.Getenv("LLAMACTL_MLX_RESPONSE_HEADERS"); llamaEnv != "" {
+        if cfg.Backends.MLX.ResponseHeaders == nil {
+            cfg.Backends.MLX.ResponseHeaders = make(map[string]string)
+        }
+        parseHeaders(llamaEnv, cfg.Backends.MLX.ResponseHeaders)
+    }

     // Instance defaults
     if autoRestart := os.Getenv("LLAMACTL_DEFAULT_AUTO_RESTART"); autoRestart != "" {

@@ -481,6 +507,19 @@ func parseEnvVars(envString string, envMap map[string]string) {
     }
 }

+// parseHeaders parses HTTP headers in format "KEY1=value1;KEY2=value2"
+// and populates the provided environment map
+func parseHeaders(envString string, envMap map[string]string) {
+    if envString == "" {
+        return
+    }
+    for _, envPair := range strings.Split(envString, ";") {
+        if parts := strings.SplitN(strings.TrimSpace(envPair), "=", 2); len(parts) == 2 {
+            envMap[parts[0]] = parts[1]
+        }
+    }
+}
+
 // getDefaultDataDirectory returns platform-specific default data directory
 func getDefaultDataDirectory() string {
     switch runtime.GOOS {
@@ -198,6 +198,15 @@ func (i *Process) GetProxy() (*httputil.ReverseProxy, error) {

     proxy := httputil.NewSingleHostReverseProxy(targetURL)

+    var responseHeaders map[string]string
+    switch i.options.BackendType {
+    case backends.BackendTypeLlamaCpp:
+        responseHeaders = i.globalBackendSettings.LlamaCpp.ResponseHeaders
+    case backends.BackendTypeVllm:
+        responseHeaders = i.globalBackendSettings.VLLM.ResponseHeaders
+    case backends.BackendTypeMlxLm:
+        responseHeaders = i.globalBackendSettings.MLX.ResponseHeaders
+    }
     proxy.ModifyResponse = func(resp *http.Response) error {
         // Remove CORS headers from llama-server response to avoid conflicts
         // llamactl will add its own CORS headers

@@ -207,6 +216,10 @@ func (i *Process) GetProxy() (*httputil.ReverseProxy, error) {
         resp.Header.Del("Access-Control-Allow-Credentials")
         resp.Header.Del("Access-Control-Max-Age")
         resp.Header.Del("Access-Control-Expose-Headers")
+
+        for key, value := range responseHeaders {
+            resp.Header.Set(key, value)
+        }
         return nil
     }

@@ -5,6 +5,7 @@ import (
     "fmt"
     "log"
     "net/http"
+    "os"
     "os/exec"
     "runtime"
     "syscall"

@@ -384,7 +385,11 @@ func (i *Process) buildCommand() (*exec.Cmd, error) {

     // Create the exec.Cmd
     cmd := exec.CommandContext(i.ctx, command, args...)
-    cmd.Env = []string{}
+
+    // Start with host environment variables
+    cmd.Env = os.Environ()
+
+    // Add/override with backend-specific environment variables
     for k, v := range env {
         cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
     }
@@ -263,19 +263,32 @@ func (im *instanceManager) loadInstance(name, path string) error {
 }

 // autoStartInstances starts instances that were running when persisted and have auto-restart enabled
+// For instances with auto-restart disabled, it sets their status to Stopped
 func (im *instanceManager) autoStartInstances() {
     im.mu.RLock()
     var instancesToStart []*instance.Process
+    var instancesToStop []*instance.Process
     for _, inst := range im.instances {
         if inst.IsRunning() && // Was running when persisted
             inst.GetOptions() != nil &&
-            inst.GetOptions().AutoRestart != nil &&
-            *inst.GetOptions().AutoRestart {
-            instancesToStart = append(instancesToStart, inst)
+            inst.GetOptions().AutoRestart != nil {
+            if *inst.GetOptions().AutoRestart {
+                instancesToStart = append(instancesToStart, inst)
+            } else {
+                // Instance was running but auto-restart is disabled, mark as stopped
+                instancesToStop = append(instancesToStop, inst)
+            }
         }
     }
     im.mu.RUnlock()

+    // Stop instances that have auto-restart disabled
+    for _, inst := range instancesToStop {
+        log.Printf("Instance %s was running but auto-restart is disabled, setting status to stopped", inst.Name)
+        inst.SetStatus(instance.Stopped)
+    }
+
+    // Start instances that have auto-restart enabled
     for _, inst := range instancesToStart {
         log.Printf("Auto-starting instance %s", inst.Name)
         // Reset running state before starting (since Start() expects stopped instance)
@@ -209,3 +209,66 @@ func createTestManager() manager.InstanceManager {
     }
     return manager.NewInstanceManager(backendConfig, cfg)
 }
+
+func TestAutoRestartDisabledInstanceStatus(t *testing.T) {
+    tempDir := t.TempDir()
+
+    backendConfig := config.BackendConfig{
+        LlamaCpp: config.BackendSettings{
+            Command: "llama-server",
+        },
+    }
+
+    cfg := config.InstancesConfig{
+        PortRange:            [2]int{8000, 9000},
+        InstancesDir:         tempDir,
+        MaxInstances:         10,
+        TimeoutCheckInterval: 5,
+    }
+
+    // Create first manager and instance with auto-restart disabled
+    manager1 := manager.NewInstanceManager(backendConfig, cfg)
+
+    autoRestart := false
+    options := &instance.CreateInstanceOptions{
+        BackendType: backends.BackendTypeLlamaCpp,
+        AutoRestart: &autoRestart,
+        LlamaServerOptions: &llamacpp.LlamaServerOptions{
+            Model: "/path/to/model.gguf",
+            Port:  8080,
+        },
+    }
+
+    inst, err := manager1.CreateInstance("test-instance", options)
+    if err != nil {
+        t.Fatalf("CreateInstance failed: %v", err)
+    }
+
+    // Simulate instance being in running state when persisted
+    // (this would happen if the instance was running when llamactl was stopped)
+    inst.SetStatus(instance.Running)
+
+    // Shutdown first manager
+    manager1.Shutdown()
+
+    // Create second manager (simulating restart of llamactl)
+    manager2 := manager.NewInstanceManager(backendConfig, cfg)
+
+    // Get the loaded instance
+    loadedInst, err := manager2.GetInstance("test-instance")
+    if err != nil {
+        t.Fatalf("GetInstance failed: %v", err)
+    }
+
+    // The instance should be marked as Stopped, not Running
+    // because auto-restart is disabled
+    if loadedInst.IsRunning() {
+        t.Errorf("Expected instance with auto-restart disabled to be stopped after manager restart, but it was running")
+    }
+
+    if loadedInst.GetStatus() != instance.Stopped {
+        t.Errorf("Expected instance status to be Stopped, got %v", loadedInst.GetStatus())
+    }
+
+    manager2.Shutdown()
+}
@@ -131,11 +131,16 @@ func (h *Handler) ListInstances() http.HandlerFunc {
             return
         }

-        w.Header().Set("Content-Type", "application/json")
-        if err := json.NewEncoder(w).Encode(instances); err != nil {
+        // Marshal to bytes first to set Content-Length header
+        data, err := json.Marshal(instances)
+        if err != nil {
             http.Error(w, "Failed to encode instances: "+err.Error(), http.StatusInternalServerError)
             return
         }
+
+        w.Header().Set("Content-Type", "application/json")
+        w.Header().Set("Content-Length", strconv.Itoa(len(data)))
+        w.Write(data)
     }
 }

@@ -202,7 +207,7 @@ func (h *Handler) GetInstance() http.HandlerFunc {

         inst, err := h.InstanceManager.GetInstance(name)
         if err != nil {
-            http.Error(w, "Failed to get instance: "+err.Error(), http.StatusInternalServerError)
+            http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest)
             return
         }

@@ -475,29 +480,15 @@ func (h *Handler) ProxyToInstance() http.HandlerFunc {

         // Strip the "/api/v1/instances/<name>/proxy" prefix from the request URL
         prefix := fmt.Sprintf("/api/v1/instances/%s/proxy", name)
-        proxyPath := r.URL.Path[len(prefix):]
-
-        // Ensure the proxy path starts with "/"
-        if !strings.HasPrefix(proxyPath, "/") {
-            proxyPath = "/" + proxyPath
-        }
+        r.URL.Path = strings.TrimPrefix(r.URL.Path, prefix)

         // Update the last request time for the instance
         inst.UpdateLastRequestTime()

-        // Modify the request to remove the proxy prefix
-        originalPath := r.URL.Path
-        r.URL.Path = proxyPath
-
         // Set forwarded headers
         r.Header.Set("X-Forwarded-Host", r.Header.Get("Host"))
         r.Header.Set("X-Forwarded-Proto", "http")

-        // Restore original path for logging purposes
-        defer func() {
-            r.URL.Path = originalPath
-        }()
-
         // Forward the request using the cached proxy
         proxy.ServeHTTP(w, r)
     }

@@ -580,12 +571,13 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc {
         // Route to the appropriate inst based on instance name
         inst, err := h.InstanceManager.GetInstance(modelName)
         if err != nil {
-            http.Error(w, "Failed to get instance: "+err.Error(), http.StatusInternalServerError)
+            http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest)
             return
         }

         if !inst.IsRunning() {
-            allowOnDemand := inst.GetOptions() != nil && inst.GetOptions().OnDemandStart != nil && *inst.GetOptions().OnDemandStart
+            options := inst.GetOptions()
+            allowOnDemand := options != nil && options.OnDemandStart != nil && *options.OnDemandStart
             if !allowOnDemand {
                 http.Error(w, "Instance is not running", http.StatusServiceUnavailable)
                 return

@@ -634,6 +626,84 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc {
     }
 }

+func (h *Handler) LlamaCppProxy(onDemandStart bool) http.HandlerFunc {
+    return func(w http.ResponseWriter, r *http.Request) {
+
+        // Get the instance name from the URL parameter
+        name := chi.URLParam(r, "name")
+        if name == "" {
+            http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+            return
+        }
+
+        // Route to the appropriate inst based on instance name
+        inst, err := h.InstanceManager.GetInstance(name)
+        if err != nil {
+            http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest)
+            return
+        }
+
+        options := inst.GetOptions()
+        if options == nil {
+            http.Error(w, "Cannot obtain Instance's options", http.StatusInternalServerError)
+            return
+        }
+
+        if options.BackendType != backends.BackendTypeLlamaCpp {
+            http.Error(w, "Instance is not a llama.cpp server.", http.StatusBadRequest)
+            return
+        }
+
+        if !inst.IsRunning() {
+
+            if !(onDemandStart && options.OnDemandStart != nil && *options.OnDemandStart) {
+                http.Error(w, "Instance is not running", http.StatusServiceUnavailable)
+                return
+            }
+
+            if h.InstanceManager.IsMaxRunningInstancesReached() {
+                if h.cfg.Instances.EnableLRUEviction {
+                    err := h.InstanceManager.EvictLRUInstance()
+                    if err != nil {
+                        http.Error(w, "Cannot start Instance, failed to evict instance "+err.Error(), http.StatusInternalServerError)
+                        return
+                    }
+                } else {
+                    http.Error(w, "Cannot start Instance, maximum number of instances reached", http.StatusConflict)
+                    return
+                }
+            }
+
+            // If on-demand start is enabled, start the instance
+            if _, err := h.InstanceManager.StartInstance(name); err != nil {
+                http.Error(w, "Failed to start instance: "+err.Error(), http.StatusInternalServerError)
+                return
+            }
+
+            // Wait for the instance to become healthy before proceeding
+            if err := inst.WaitForHealthy(h.cfg.Instances.OnDemandStartTimeout); err != nil { // 2 minutes timeout
+                http.Error(w, "Instance failed to become healthy: "+err.Error(), http.StatusServiceUnavailable)
+                return
+            }
+        }
+
+        proxy, err := inst.GetProxy()
+        if err != nil {
+            http.Error(w, "Failed to get proxy: "+err.Error(), http.StatusInternalServerError)
+            return
+        }
+
+        // Strip the "/llama-cpp/<name>" prefix from the request URL
+        prefix := fmt.Sprintf("/llama-cpp/%s", name)
+        r.URL.Path = strings.TrimPrefix(r.URL.Path, prefix)
+
+        // Update the last request time for the instance
+        inst.UpdateLastRequestTime()
+
+        proxy.ServeHTTP(w, r)
+    }
+}
+
 // ParseCommandRequest represents the request body for command parsing
 type ParseCommandRequest struct {
     Command string `json:"command"`
@@ -20,7 +20,7 @@ func SetupRouter(handler *Handler) *chi.Mux {
     r.Use(cors.Handler(cors.Options{
         AllowedOrigins:   handler.cfg.Server.AllowedOrigins,
         AllowedMethods:   []string{"GET", "POST", "PUT", "DELETE", "OPTIONS"},
-        AllowedHeaders:   []string{"Accept", "Authorization", "Content-Type", "X-CSRF-Token"},
+        AllowedHeaders:   handler.cfg.Server.AllowedHeaders,
         ExposedHeaders:   []string{"Link"},
         AllowCredentials: false,
         MaxAge:           300,

@@ -103,6 +103,51 @@ func SetupRouter(handler *Handler) *chi.Mux {

     })

+    r.Route("/llama-cpp/{name}", func(r chi.Router) {
+
+        // Public Routes
+        // Allow llama-cpp server to serve its own WebUI if it is running.
+        // Don't auto start the server since it can be accessed without an API key
+        r.Get("/", handler.LlamaCppProxy(false))
+
+        // Private Routes
+        r.Group(func(r chi.Router) {
+
+            if authMiddleware != nil && handler.cfg.Auth.RequireInferenceAuth {
+                r.Use(authMiddleware.AuthMiddleware(KeyTypeInference))
+            }
+
+            // This handler auto start the server if it's not running
+            llamaCppHandler := handler.LlamaCppProxy(true)
+
+            // llama.cpp server specific proxy endpoints
+            r.Get("/props", llamaCppHandler)
+            // /slots endpoint is secured (see: https://github.com/ggml-org/llama.cpp/pull/15630)
+            r.Get("/slots", llamaCppHandler)
+            r.Post("/apply-template", llamaCppHandler)
+            r.Post("/completion", llamaCppHandler)
+            r.Post("/detokenize", llamaCppHandler)
+            r.Post("/embeddings", llamaCppHandler)
+            r.Post("/infill", llamaCppHandler)
+            r.Post("/metrics", llamaCppHandler)
+            r.Post("/props", llamaCppHandler)
+            r.Post("/reranking", llamaCppHandler)
+            r.Post("/tokenize", llamaCppHandler)
+
+            // OpenAI-compatible proxy endpoint
+            // Handles all POST requests to /v1/*, including:
+            // - /v1/completions
+            // - /v1/chat/completions
+            // - /v1/embeddings
+            // - /v1/rerank
+            // - /v1/reranking
+            // llamaCppHandler is used here because some users of llama.cpp endpoints depend
+            // on "model" field being optional, and handler.OpenAIProxy requires it.
+            r.Post("/v1/*", llamaCppHandler)
+        })
+
+    })
+
     // Serve WebUI files
     if err := webui.SetupWebUI(r); err != nil {
         fmt.Printf("Failed to set up WebUI: %v\n", err)
@@ -1,4 +1,4 @@
-import { type ReactNode, createContext, useContext, useState, useEffect, useCallback } from 'react'
+import { type ReactNode, createContext, useCallback, useContext, useEffect, useState } from 'react'

 interface AuthContextState {
   isAuthenticated: boolean

@@ -62,7 +62,7 @@ export const AuthProvider = ({ children }: AuthProviderProps) => {
   // Validate API key by making a test request
   const validateApiKey = async (key: string): Promise<boolean> => {
     try {
-      const response = await fetch('/api/v1/instances', {
+      const response = await fetch(document.baseURI + 'api/v1/instances', {
         headers: {
           'Authorization': `Bearer ${key}`,
           'Content-Type': 'application/json'
@@ -1,5 +1,5 @@
-import { describe, it, expect, vi, beforeEach } from 'vitest'
 import { instancesApi } from '@/lib/api'
+import { beforeEach, describe, expect, it, vi } from 'vitest'

 // Mock fetch globally
 const mockFetch = vi.fn()

@@ -11,11 +11,13 @@ describe('API Error Handling', () => {
   })

   it('converts HTTP errors to meaningful messages', async () => {
-    mockFetch.mockResolvedValue({
+    const mockResponse = {
       ok: false,
       status: 409,
-      text: () => Promise.resolve('Instance already exists')
-    })
+      text: () => Promise.resolve('Instance already exists'),
+      clone: function() { return this }
+    }
+    mockFetch.mockResolvedValue(mockResponse)

     await expect(instancesApi.create('existing', {}))
       .rejects

@@ -23,11 +25,13 @@ describe('API Error Handling', () => {
   })

   it('handles empty error responses gracefully', async () => {
-    mockFetch.mockResolvedValue({
+    const mockResponse = {
       ok: false,
       status: 500,
-      text: () => Promise.resolve('')
-    })
+      text: () => Promise.resolve(''),
+      clone: function() { return this }
+    }
+    mockFetch.mockResolvedValue(mockResponse)

     await expect(instancesApi.list())
       .rejects

@@ -53,7 +57,9 @@ describe('API Error Handling', () => {
     await instancesApi.getLogs('test-instance', 100)

     expect(mockFetch).toHaveBeenCalledWith(
-      '/api/v1/instances/test-instance/logs?lines=100',
+      expect.stringMatching(
+        /^https?:\/\/[^/]+\/api\/v1\/instances\/test-instance\/logs\?lines=100$/
+      ),
       expect.any(Object)
     )
   })
@@ -1,7 +1,10 @@
 import type { CreateInstanceOptions, Instance } from "@/types/instance";
 import { handleApiError } from "./errorUtils";

-const API_BASE = "/api/v1";
+// Adding baseURI as a prefix to support being served behind a subpath
+// e.g. when llamactl's `/` is served behind a reverse proxy at `/proxy/...`
+// the baseURI will be `/proxy/` and the API calls will be made to `/proxy/api/v1/<endpoint>`
+export const API_BASE = document.baseURI + "api/v1";

 // Base API call function with error handling
 async function apiCall<T>(

@@ -46,11 +49,8 @@ async function apiCall<T>(
     } else {
       // Handle empty responses for JSON endpoints
       const contentLength = response.headers.get('content-length');
-      if (contentLength === '0' || contentLength === null) {
-        const text = await response.text();
-        if (text.trim() === '') {
-          return {} as T; // Return empty object for empty JSON responses
-        }
-      }
+      if (contentLength === '0') {
+        return {} as T; // Return empty object for empty JSON responses
+      }
       const data = await response.json() as T;
       return data;
@@ -26,7 +26,8 @@ export async function handleApiError(response: Response): Promise<void> {
   }

   if (!response.ok) {
-    const errorMessage = await parseErrorResponse(response)
+    // Clone the response before reading to avoid consuming the body stream
+    const errorMessage = await parseErrorResponse(response.clone())
     throw new Error(errorMessage)
   }
 }
@@ -21,4 +21,6 @@ export default defineConfig({
     setupFiles: ['./src/test/setup.ts'],
     css: true,
   },
+  // ensures relative asset paths to support being served behind a subpath
+  base: "./"
 })