From 131b1b407d413be09cc7a50fcf5e18cdf8f4fc0b Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Sun, 31 Aug 2025 16:21:18 +0200
Subject: [PATCH] Update api-reference

---
 docs/user-guide/api-reference.md | 516 ++++++++++++++-----------------
 1 file changed, 232 insertions(+), 284 deletions(-)

diff --git a/docs/user-guide/api-reference.md b/docs/user-guide/api-reference.md
index fcd88f3..1152ebe 100644
--- a/docs/user-guide/api-reference.md
+++ b/docs/user-guide/api-reference.md
@@ -7,18 +7,69 @@ Complete reference for the Llamactl REST API.
 
 All API endpoints are relative to the base URL:
 
 ```
-http://localhost:8080/api
+http://localhost:8080/api/v1
 ```
 
 ## Authentication
 
-If authentication is enabled, include the JWT token in the Authorization header:
+Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:
 
 ```bash
-curl -H "Authorization: Bearer <token>" \
-  http://localhost:8080/api/instances
+curl -H "Authorization: Bearer <api-key>" \
+  http://localhost:8080/api/v1/instances
 ```
 
+The server supports two types of API keys:
+- **Management API Keys**: Required for instance management operations (CRUD operations on instances)
+- **Inference API Keys**: Required for OpenAI-compatible inference endpoints
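+
+For example, a management key authorizes the instance management endpoints, while an inference key authorizes the OpenAI-compatible endpoints described below (a sketch; the placeholder keys stand in for whatever values you configured):
+
+```bash
+# Management API key: list instances
+curl -H "Authorization: Bearer your-management-api-key" \
+  http://localhost:8080/api/v1/instances
+
+# Inference API key: OpenAI-compatible chat completion
+curl -X POST http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer your-inference-api-key" \
+  -d '{"model": "my-model", "messages": [{"role": "user", "content": "Hello"}]}'
+```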
+
+## System Endpoints
+
+### Get Llamactl Version
+
+Get the version information of the llamactl server.
+
+```http
+GET /api/v1/version
+```
+
+**Response:**
+```
+Version: 1.0.0
+Commit: abc123
+Build Time: 2024-01-15T10:00:00Z
+```
+
+### Get Llama Server Help
+
+Get help text for the llama-server command.
+
+```http
+GET /api/v1/server/help
+```
+
+**Response:** Plain text help output from `llama-server --help`
+
+### Get Llama Server Version
+
+Get version information of the llama-server binary.
+
+```http
+GET /api/v1/server/version
+```
+
+**Response:** Plain text version output from `llama-server --version`
+
+### List Available Devices
+
+List available devices for llama-server.
+
+```http
+GET /api/v1/server/devices
+```
+
+**Response:** Plain text device list from `llama-server --list-devices`
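+
+These endpoints return plain text rather than JSON, so raw `curl` output is directly readable (a sketch; assumes a management API key is configured):
+
+```bash
+# Check which llamactl build is running
+curl -H "Authorization: Bearer your-management-api-key" \
+  http://localhost:8080/api/v1/version
+
+# See which devices llama-server can use
+curl -H "Authorization: Bearer your-management-api-key" \
+  http://localhost:8080/api/v1/server/devices
+```
+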
 ## Instances
 
 ### List All Instances
 
 Get a list of all instances.
 
 ```http
-GET /api/instances
+GET /api/v1/instances
 ```
 
 **Response:**
 ```json
-{
-  "instances": [
-    {
-      "name": "llama2-7b",
-      "status": "running",
-      "model_path": "/models/llama-2-7b.gguf",
-      "port": 8081,
-      "created_at": "2024-01-15T10:30:00Z",
-      "updated_at": "2024-01-15T12:45:00Z"
-    }
-  ]
-}
+[
+  {
+    "name": "llama2-7b",
+    "status": "running",
+    "created": 1705312200
+  }
+]
 ```
 
 ### Get Instance Details
 
 Get detailed information about a specific instance.
 
 ```http
-GET /api/instances/{name}
+GET /api/v1/instances/{name}
 ```
 
 **Response:**
 ```json
 {
   "name": "llama2-7b",
   "status": "running",
-  "model_path": "/models/llama-2-7b.gguf",
-  "port": 8081,
-  "pid": 12345,
-  "options": {
-    "threads": 4,
-    "context_size": 2048,
-    "gpu_layers": 0
-  },
-  "stats": {
-    "memory_usage": 4294967296,
-    "cpu_usage": 25.5,
-    "uptime": 3600
-  },
-  "created_at": "2024-01-15T10:30:00Z",
-  "updated_at": "2024-01-15T12:45:00Z"
+  "created": 1705312200
 }
 ```
 
 ### Create Instance
 
-Create a new instance.
+Create and start a new instance.
 
 ```http
-POST /api/instances
+POST /api/v1/instances/{name}
 ```
 
-**Request Body:**
-```json
-{
-  "name": "my-instance",
-  "model_path": "/path/to/model.gguf",
-  "port": 8081,
-  "options": {
-    "threads": 4,
-    "context_size": 2048,
-    "gpu_layers": 0
-  }
-}
-```
+**Request Body:** JSON object with instance configuration. See [Managing Instances](managing-instances.md) for available configuration options.
 
 **Response:**
 ```json
 {
-  "message": "Instance created successfully",
-  "instance": {
-    "name": "my-instance",
-    "status": "stopped",
-    "model_path": "/path/to/model.gguf",
-    "port": 8081,
-    "created_at": "2024-01-15T14:30:00Z"
-  }
+  "name": "llama2-7b",
+  "status": "running",
+  "created": 1705312200
 }
 ```
 
 ### Update Instance
 
-Update an existing instance configuration.
+Update an existing instance configuration. See [Managing Instances](managing-instances.md) for available configuration options.
 
 ```http
-PUT /api/instances/{name}
+PUT /api/v1/instances/{name}
 ```
 
-**Request Body:**
+**Request Body:** JSON object with configuration fields to update.
+
+**Response:**
 ```json
 {
-  "options": {
-    "threads": 8,
-    "context_size": 4096
-  }
+  "name": "llama2-7b",
+  "status": "running",
+  "created": 1705312200
 }
 ```
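+
+For example, to point an existing instance at a different model file (a sketch; the `model` field follows the lifecycle example at the end of this page, and the full set of fields is listed in [Managing Instances](managing-instances.md)):
+
+```bash
+curl -X PUT http://localhost:8080/api/v1/instances/my-model \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer your-api-key" \
+  -d '{"model": "/models/llama-2-13b.gguf"}'
+```
+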
 ### Delete Instance
 
-Delete an instance (must be stopped first).
+Stop and remove an instance.
 
 ```http
-DELETE /api/instances/{name}
+DELETE /api/v1/instances/{name}
 ```
 
-**Response:**
-```json
-{
-  "message": "Instance deleted successfully"
-}
-```
+**Response:** `204 No Content`
 
 ## Instance Operations
 
 ### Start Instance
 
 Start a stopped instance.
 
 ```http
-POST /api/instances/{name}/start
+POST /api/v1/instances/{name}/start
 ```
 
 **Response:**
 ```json
 {
-  "message": "Instance start initiated",
-  "status": "starting"
+  "name": "llama2-7b",
+  "status": "starting",
+  "created": 1705312200
 }
 ```
 
+**Error Responses:**
+- `409 Conflict`: Maximum number of running instances reached
+- `500 Internal Server Error`: Failed to start instance
+
 ### Stop Instance
 
 Stop a running instance.
 
 ```http
-POST /api/instances/{name}/stop
-```
-
-**Request Body (Optional):**
-```json
-{
-  "force": false,
-  "timeout": 30
-}
+POST /api/v1/instances/{name}/stop
 ```
 
 **Response:**
 ```json
 {
-  "message": "Instance stop initiated",
-  "status": "stopping"
+  "name": "llama2-7b",
+  "status": "stopping",
+  "created": 1705312200
 }
 ```
 
 ### Restart Instance
 
 Restart an instance (stop then start).
 
 ```http
-POST /api/instances/{name}/restart
-```
-
-### Get Instance Health
-
-Check instance health status.
-
-```http
-GET /api/instances/{name}/health
+POST /api/v1/instances/{name}/restart
 ```
 
 **Response:**
 ```json
 {
-  "status": "healthy",
-  "checks": {
-    "process": "running",
-    "port": "open",
-    "response": "ok"
-  },
-  "last_check": "2024-01-15T14:30:00Z"
+  "name": "llama2-7b",
+  "status": "restarting",
+  "created": 1705312200
 }
 ```
 
 ### Get Instance Logs
 
 Retrieve instance logs.
 
 ```http
-GET /api/instances/{name}/logs
+GET /api/v1/instances/{name}/logs
 ```
 
 **Query Parameters:**
-- `lines`: Number of lines to return (default: 100)
-- `follow`: Stream logs (boolean)
-- `level`: Filter by log level (debug, info, warn, error)
+- `lines`: Number of lines to return; `-1` (the default) returns all lines
+
+**Response:** Plain text log output
+
+**Example:**
+```bash
+curl "http://localhost:8080/api/v1/instances/my-instance/logs?lines=100"
+```
+
+### Proxy to Instance
+
+Proxy HTTP requests directly to the llama-server instance.
+
+```http
+GET /api/v1/instances/{name}/proxy/*
+POST /api/v1/instances/{name}/proxy/*
+```
+
+This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the `/api/v1/instances/{name}/proxy` prefix and forwards the remaining path to the instance.
+
+**Example - Check Instance Health:**
+```bash
+curl -H "Authorization: Bearer your-api-key" \
+  http://localhost:8080/api/v1/instances/my-model/proxy/health
+```
+
+This forwards the request to `http://instance-host:instance-port/health` on the actual llama-server instance.
+
+**Error Responses:**
+- `503 Service Unavailable`: Instance is not running
+
+## OpenAI-Compatible API
+
+Llamactl provides OpenAI-compatible endpoints for inference operations.
+
+### List Models
+
+List all instances in OpenAI-compatible format.
+
+```http
+GET /v1/models
+```
 
 **Response:**
 ```json
 {
-  "logs": [
+  "object": "list",
+  "data": [
     {
-      "timestamp": "2024-01-15T14:30:00Z",
-      "level": "info",
-      "message": "Model loaded successfully"
+      "id": "llama2-7b",
+      "object": "model",
+      "created": 1705312200,
+      "owned_by": "llamactl"
     }
   ]
 }
 ```
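+
+For example (assumes an inference API key; every llamactl instance is exposed as one model entry):
+
+```bash
+curl -H "Authorization: Bearer your-inference-api-key" \
+  http://localhost:8080/v1/models
+```
+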
"Content-Type: application/json" \ + -H "Authorization: Bearer your-inference-api-key" \ + -d '{ + "model": "my-model", + "messages": [ + {"role": "user", "content": "Hello!"} + ], + "max_tokens": 100 + }' # Stop instance -curl -X POST http://localhost:8080/api/instances/example/stop +curl -X POST -H "Authorization: Bearer your-api-key" \ + http://localhost:8080/api/v1/instances/my-model/stop # Delete instance -curl -X DELETE http://localhost:8080/api/instances/example +curl -X DELETE -H "Authorization: Bearer your-api-key" \ + http://localhost:8080/api/v1/instances/my-model ``` + +### Using the Proxy Endpoint + +You can also directly proxy requests to the llama-server instance: + +```bash +# Direct proxy to instance (bypasses OpenAI compatibility layer) +curl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "prompt": "Hello, world!", + "n_predict": 50 + }' +``` + +## Swagger Documentation + +If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at: + +``` +http://localhost:8080/swagger/ +``` + +This provides a complete interactive interface for testing all API endpoints.