Merge pull request #34 from lordmathis/feat/vllm-backend
feat: Implement vLLM backend

README.md (26 changed lines)
@@ -13,7 +13,7 @@

### 🔗 Universal Compatibility

- **OpenAI API Compatible**: Drop-in replacement - route requests by model name
- **Multi-Backend Support**: Native support for both llama.cpp and MLX (Apple Silicon optimized)
- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM

### 🌐 User-Friendly Interface

- **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools)
@@ -31,6 +31,7 @@
# 1. Install backend (one-time setup)
# For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start
# For MLX on macOS: pip install mlx-lm
# For vLLM: pip install vllm

# 2. Download and run llamactl
LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
@@ -47,7 +48,7 @@ llamactl
### Create and manage instances via web dashboard:
1. Open http://localhost:8080
2. Click "Create Instance"
3. Choose backend type (llama.cpp or MLX)
3. Choose backend type (llama.cpp, MLX, or vLLM)
4. Set model path and backend-specific options
5. Start or stop the instance

@@ -63,6 +64,11 @@ curl -X POST localhost:8080/api/v1/instances/my-mlx-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}'

# Create vLLM instance
curl -X POST localhost:8080/api/v1/instances/my-vllm-model \
  -H "Authorization: Bearer your-key" \
  -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}}'

# Use with OpenAI SDK
curl -X POST localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer your-key" \
@@ -121,6 +127,21 @@ source mlx-env/bin/activate
pip install mlx-lm
```

**For vLLM backend:**
You need vLLM installed:

```bash
# Install via pip (requires Python 3.8+, GPU required)
pip install vllm

# Or in a virtual environment (recommended)
python -m venv vllm-env
source vllm-env/bin/activate
pip install vllm

# For production deployments, consider container-based installation
```

## Configuration

llamactl works out of the box with sensible defaults.
@@ -135,6 +156,7 @@ server:
backends:
  llama_executable: llama-server    # Path to llama-server executable
  mlx_lm_executable: mlx_lm.server  # Path to mlx_lm.server executable
  vllm_executable: vllm             # Path to vllm executable

instances:
  port_range: [8000, 9000]  # Port range for instances

apidocs/docs.go (675 changed lines)
@@ -19,6 +19,159 @@ const docTemplate = `{
    "host": "{{.Host}}",
    "basePath": "{{.BasePath}}",
    "paths": {
        "/backends/llama-cpp/parse-command": {
            "post": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Parses a llama-server command string into instance options",
                "consumes": [
                    "application/json"
                ],
                "produces": [
                    "application/json"
                ],
                "tags": [
                    "backends"
                ],
                "summary": "Parse llama-server command",
                "parameters": [
                    {
                        "description": "Command to parse",
                        "name": "request",
                        "in": "body",
                        "required": true,
                        "schema": {
                            "$ref": "#/definitions/server.ParseCommandRequest"
                        }
                    }
                ],
                "responses": {
                    "200": {
                        "description": "Parsed options",
                        "schema": {
                            "$ref": "#/definitions/instance.CreateInstanceOptions"
                        }
                    },
                    "400": {
                        "description": "Invalid request or command",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    },
                    "500": {
                        "description": "Internal Server Error",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    }
                }
            }
        },
        "/backends/mlx/parse-command": {
            "post": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Parses MLX-LM server command string into instance options",
                "consumes": [
                    "application/json"
                ],
                "produces": [
                    "application/json"
                ],
                "tags": [
                    "backends"
                ],
                "summary": "Parse mlx_lm.server command",
                "parameters": [
                    {
                        "description": "Command to parse",
                        "name": "request",
                        "in": "body",
                        "required": true,
                        "schema": {
                            "$ref": "#/definitions/server.ParseCommandRequest"
                        }
                    }
                ],
                "responses": {
                    "200": {
                        "description": "Parsed options",
                        "schema": {
                            "$ref": "#/definitions/instance.CreateInstanceOptions"
                        }
                    },
                    "400": {
                        "description": "Invalid request or command",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    }
                }
            }
        },
        "/backends/vllm/parse-command": {
            "post": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Parses a vLLM serve command string into instance options",
                "consumes": [
                    "application/json"
                ],
                "produces": [
                    "application/json"
                ],
                "tags": [
                    "backends"
                ],
                "summary": "Parse vllm serve command",
                "parameters": [
                    {
                        "description": "Command to parse",
                        "name": "request",
                        "in": "body",
                        "required": true,
                        "schema": {
                            "$ref": "#/definitions/server.ParseCommandRequest"
                        }
                    }
                ],
                "responses": {
                    "200": {
                        "description": "Parsed options",
                        "schema": {
                            "$ref": "#/definitions/instance.CreateInstanceOptions"
                        }
                    },
                    "400": {
                        "description": "Invalid request or command",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    }
                }
            }
        },
        "/instances": {
            "get": {
                "security": [
@@ -681,522 +834,46 @@ const docTemplate = `{
        }
    },
    "definitions": {
        "backends.BackendType": {
            "type": "string",
            "enum": [
                "llama_cpp",
                "mlx_lm",
                "vllm"
            ],
            "x-enum-varnames": [
                "BackendTypeLlamaCpp",
                "BackendTypeMlxLm",
                "BackendTypeVllm"
            ]
        },
        "instance.CreateInstanceOptions": {
            "type": "object",
            "properties": {
                "alias": {
                    "type": "string"
                },
                "api_key": {
                    "type": "string"
                },
                "api_key_file": {
                    "type": "string"
                },
                "auto_restart": {
                    "description": "Auto restart",
                    "type": "boolean"
                },
                "batch_size": {
                    "type": "integer"
                "backend_options": {
                    "type": "object",
                    "additionalProperties": {}
                },
                "cache_reuse": {
                    "type": "integer"
                },
                "cache_type_k": {
                    "type": "string"
                },
                "cache_type_k_draft": {
                    "type": "string"
                },
                "cache_type_v": {
                    "type": "string"
                },
                "cache_type_v_draft": {
                    "type": "string"
                },
                "chat_template": {
                    "type": "string"
                },
                "chat_template_file": {
                    "type": "string"
                },
                "chat_template_kwargs": {
                    "type": "string"
                },
                "check_tensors": {
                    "type": "boolean"
                },
                "cont_batching": {
                    "type": "boolean"
                },
                "control_vector": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "control_vector_layer_range": {
                    "type": "string"
                },
                "control_vector_scaled": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "cpu_mask": {
                    "type": "string"
                },
                "cpu_mask_batch": {
                    "type": "string"
                },
                "cpu_range": {
                    "type": "string"
                },
                "cpu_range_batch": {
                    "type": "string"
                },
                "cpu_strict": {
                    "type": "integer"
                },
                "cpu_strict_batch": {
                    "type": "integer"
                },
                "ctx_size": {
                    "type": "integer"
                },
                "ctx_size_draft": {
                    "type": "integer"
                },
                "defrag_thold": {
                    "type": "number"
                },
                "device": {
                    "type": "string"
                },
                "device_draft": {
                    "type": "string"
                },
                "draft_max": {
                    "type": "integer"
                },
                "draft_min": {
                    "type": "integer"
                },
                "draft_p_min": {
                    "type": "number"
                },
                "dry_allowed_length": {
                    "type": "integer"
                },
                "dry_base": {
                    "type": "number"
                },
                "dry_multiplier": {
                    "type": "number"
                },
                "dry_penalty_last_n": {
                    "type": "integer"
                },
                "dry_sequence_breaker": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "dump_kv_cache": {
                    "type": "boolean"
                },
                "dynatemp_exp": {
                    "type": "number"
                },
                "dynatemp_range": {
                    "type": "number"
                },
                "embd_bge_small_en_default": {
                    "description": "Default model params",
                    "type": "boolean"
                },
                "embd_e5_small_en_default": {
                    "type": "boolean"
                },
                "embd_gte_small_default": {
                    "type": "boolean"
                },
                "embedding": {
                    "type": "boolean"
                },
                "escape": {
                    "type": "boolean"
                },
                "fim_qwen_14b_spec": {
                    "type": "boolean"
                },
                "fim_qwen_1_5b_default": {
                    "type": "boolean"
                },
                "fim_qwen_3b_default": {
                    "type": "boolean"
                },
                "fim_qwen_7b_default": {
                    "type": "boolean"
                },
                "fim_qwen_7b_spec": {
                    "type": "boolean"
                },
                "flash_attn": {
                    "type": "boolean"
                },
                "frequency_penalty": {
                    "type": "number"
                },
                "gpu_layers": {
                    "type": "integer"
                },
                "gpu_layers_draft": {
                    "type": "integer"
                },
                "grammar": {
                    "type": "string"
                },
                "grammar_file": {
                    "type": "string"
                },
                "hf_file": {
                    "type": "string"
                },
                "hf_file_v": {
                    "type": "string"
                },
                "hf_repo": {
                    "type": "string"
                },
                "hf_repo_draft": {
                    "type": "string"
                },
                "hf_repo_v": {
                    "type": "string"
                },
                "hf_token": {
                    "type": "string"
                },
                "host": {
                    "type": "string"
                "backend_type": {
                    "$ref": "#/definitions/backends.BackendType"
                },
                "idle_timeout": {
                    "description": "Idle timeout",
                    "type": "integer"
                },
                "ignore_eos": {
                    "type": "boolean"
                },
                "jinja": {
                    "type": "boolean"
                },
                "json_schema": {
                    "type": "string"
                },
                "json_schema_file": {
                    "type": "string"
                },
                "keep": {
                    "type": "integer"
                },
                "log_colors": {
                    "type": "boolean"
                },
                "log_disable": {
                    "type": "boolean"
                },
                "log_file": {
                    "type": "string"
                },
                "log_prefix": {
                    "type": "boolean"
                },
                "log_timestamps": {
                    "type": "boolean"
                },
                "logit_bias": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "lora": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "lora_init_without_apply": {
                    "type": "boolean"
                },
                "lora_scaled": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "main_gpu": {
                    "type": "integer"
                },
                "max_restarts": {
                    "type": "integer"
                },
                "metrics": {
                    "type": "boolean"
                },
                "min_p": {
                    "type": "number"
                },
                "mirostat": {
                    "type": "integer"
                },
                "mirostat_ent": {
                    "type": "number"
                },
                "mirostat_lr": {
                    "type": "number"
                },
                "mlock": {
                    "type": "boolean"
                },
                "mmproj": {
                    "type": "string"
                },
                "mmproj_url": {
                    "type": "string"
                },
                "model": {
                    "type": "string"
                },
                "model_draft": {
                    "type": "string"
                },
                "model_url": {
                    "type": "string"
                },
                "model_vocoder": {
                    "description": "Audio/TTS params",
                    "type": "string"
                },
                "no_cont_batching": {
                    "type": "boolean"
                },
                "no_context_shift": {
                    "description": "Example-specific params",
                    "type": "boolean"
                },
                "no_escape": {
                    "type": "boolean"
                },
                "no_kv_offload": {
                    "type": "boolean"
                },
                "no_mmap": {
                    "type": "boolean"
                },
                "no_mmproj": {
                    "type": "boolean"
                },
                "no_mmproj_offload": {
                    "type": "boolean"
                },
                "no_perf": {
                    "type": "boolean"
                },
                "no_prefill_assistant": {
                    "type": "boolean"
                },
                "no_slots": {
                    "type": "boolean"
                },
                "no_warmup": {
                    "type": "boolean"
                },
                "no_webui": {
                    "type": "boolean"
                },
                "numa": {
                    "type": "string"
                },
                "on_demand_start": {
                    "description": "On demand start",
                    "type": "boolean"
                },
                "override_kv": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "override_tensor": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "parallel": {
                    "type": "integer"
                },
                "path": {
                    "type": "string"
                },
                "poll": {
                    "type": "integer"
                },
                "poll_batch": {
                    "type": "integer"
                },
                "pooling": {
                    "type": "string"
                },
                "port": {
                    "type": "integer"
                },
                "predict": {
                    "type": "integer"
                },
                "presence_penalty": {
                    "type": "number"
                },
                "prio": {
                    "type": "integer"
                },
                "prio_batch": {
                    "type": "integer"
                },
                "props": {
                    "type": "boolean"
                },
                "reasoning_budget": {
                    "type": "integer"
                },
                "reasoning_format": {
                    "type": "string"
                },
                "repeat_last_n": {
                    "type": "integer"
                },
                "repeat_penalty": {
                    "type": "number"
                },
                "reranking": {
                    "type": "boolean"
                },
                "restart_delay": {
                    "type": "integer"
                },
                "rope_freq_base": {
                    "type": "number"
                },
                "rope_freq_scale": {
                    "type": "number"
                },
                "rope_scale": {
                    "type": "number"
                },
                "rope_scaling": {
                    "type": "string"
                },
                "samplers": {
                    "description": "Sampling params",
                    "type": "string"
                },
                "sampling_seq": {
                    "type": "string"
                },
                "seed": {
                    "type": "integer"
                },
                "slot_prompt_similarity": {
                    "type": "number"
                },
                "slot_save_path": {
                    "type": "string"
                },
                "slots": {
                    "type": "boolean"
                },
                "special": {
                    "type": "boolean"
                },
                "split_mode": {
                    "type": "string"
                },
                "spm_infill": {
                    "type": "boolean"
                },
                "ssl_cert_file": {
                    "type": "string"
                },
                "ssl_key_file": {
                    "type": "string"
                },
                "temp": {
                    "type": "number"
                },
                "tensor_split": {
                    "type": "string"
                },
                "threads": {
                    "type": "integer"
                },
                "threads_batch": {
                    "type": "integer"
                },
                "threads_http": {
                    "type": "integer"
                },
                "timeout": {
                    "type": "integer"
                },
                "top_k": {
                    "type": "integer"
                },
                "top_p": {
                    "type": "number"
                },
                "tts_use_guide_tokens": {
                    "type": "boolean"
                },
                "typical": {
                    "type": "number"
                },
                "ubatch_size": {
                    "type": "integer"
                },
                "verbose": {
                    "type": "boolean"
                },
                "verbose_prompt": {
                    "description": "Common params",
                    "type": "boolean"
                },
                "verbosity": {
                    "type": "integer"
                },
                "xtc_probability": {
                    "type": "number"
                },
                "xtc_threshold": {
                    "type": "number"
                },
                "yarn_attn_factor": {
                    "type": "number"
                },
                "yarn_beta_fast": {
                    "type": "number"
                },
                "yarn_beta_slow": {
                    "type": "number"
                },
                "yarn_ext_factor": {
                    "type": "number"
                },
                "yarn_orig_ctx": {
                    "description": "seconds",
                    "type": "integer"
                }
            }
@@ -1264,6 +941,14 @@ const docTemplate = `{
                    "type": "string"
                }
            }
        },
        "server.ParseCommandRequest": {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string"
                }
            }
        }
    }
}`

apidocs/swagger.json

@@ -12,6 +12,159 @@
    },
    "basePath": "/api/v1",
    "paths": {
        "/backends/llama-cpp/parse-command": {
            "post": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Parses a llama-server command string into instance options",
                "consumes": [
                    "application/json"
                ],
                "produces": [
                    "application/json"
                ],
                "tags": [
                    "backends"
                ],
                "summary": "Parse llama-server command",
                "parameters": [
                    {
                        "description": "Command to parse",
                        "name": "request",
                        "in": "body",
                        "required": true,
                        "schema": {
                            "$ref": "#/definitions/server.ParseCommandRequest"
                        }
                    }
                ],
                "responses": {
                    "200": {
                        "description": "Parsed options",
                        "schema": {
                            "$ref": "#/definitions/instance.CreateInstanceOptions"
                        }
                    },
                    "400": {
                        "description": "Invalid request or command",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    },
                    "500": {
                        "description": "Internal Server Error",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    }
                }
            }
        },
        "/backends/mlx/parse-command": {
            "post": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Parses MLX-LM server command string into instance options",
                "consumes": [
                    "application/json"
                ],
                "produces": [
                    "application/json"
                ],
                "tags": [
                    "backends"
                ],
                "summary": "Parse mlx_lm.server command",
                "parameters": [
                    {
                        "description": "Command to parse",
                        "name": "request",
                        "in": "body",
                        "required": true,
                        "schema": {
                            "$ref": "#/definitions/server.ParseCommandRequest"
                        }
                    }
                ],
                "responses": {
                    "200": {
                        "description": "Parsed options",
                        "schema": {
                            "$ref": "#/definitions/instance.CreateInstanceOptions"
                        }
                    },
                    "400": {
                        "description": "Invalid request or command",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    }
                }
            }
        },
        "/backends/vllm/parse-command": {
            "post": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Parses a vLLM serve command string into instance options",
                "consumes": [
                    "application/json"
                ],
                "produces": [
                    "application/json"
                ],
                "tags": [
                    "backends"
                ],
                "summary": "Parse vllm serve command",
                "parameters": [
                    {
                        "description": "Command to parse",
                        "name": "request",
                        "in": "body",
                        "required": true,
                        "schema": {
                            "$ref": "#/definitions/server.ParseCommandRequest"
                        }
                    }
                ],
                "responses": {
                    "200": {
                        "description": "Parsed options",
                        "schema": {
                            "$ref": "#/definitions/instance.CreateInstanceOptions"
                        }
                    },
                    "400": {
                        "description": "Invalid request or command",
                        "schema": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "string"
                            }
                        }
                    }
                }
            }
        },
        "/instances": {
            "get": {
                "security": [
@@ -674,522 +827,46 @@
        }
    },
    "definitions": {
        "backends.BackendType": {
            "type": "string",
            "enum": [
                "llama_cpp",
                "mlx_lm",
                "vllm"
            ],
            "x-enum-varnames": [
                "BackendTypeLlamaCpp",
                "BackendTypeMlxLm",
                "BackendTypeVllm"
            ]
        },
        "instance.CreateInstanceOptions": {
            "type": "object",
            "properties": {
                "alias": {
                    "type": "string"
                },
                "api_key": {
                    "type": "string"
                },
                "api_key_file": {
                    "type": "string"
                },
                "auto_restart": {
                    "description": "Auto restart",
                    "type": "boolean"
                },
                "batch_size": {
                    "type": "integer"
                "backend_options": {
                    "type": "object",
                    "additionalProperties": {}
                },
                "cache_reuse": {
                    "type": "integer"
                },
                "cache_type_k": {
                    "type": "string"
                },
                "cache_type_k_draft": {
                    "type": "string"
                },
                "cache_type_v": {
                    "type": "string"
                },
                "cache_type_v_draft": {
                    "type": "string"
                },
                "chat_template": {
                    "type": "string"
                },
                "chat_template_file": {
                    "type": "string"
                },
                "chat_template_kwargs": {
                    "type": "string"
                },
                "check_tensors": {
                    "type": "boolean"
                },
                "cont_batching": {
                    "type": "boolean"
                },
                "control_vector": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "control_vector_layer_range": {
                    "type": "string"
                },
                "control_vector_scaled": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "cpu_mask": {
                    "type": "string"
                },
                "cpu_mask_batch": {
                    "type": "string"
                },
                "cpu_range": {
                    "type": "string"
                },
                "cpu_range_batch": {
                    "type": "string"
                },
                "cpu_strict": {
                    "type": "integer"
                },
                "cpu_strict_batch": {
                    "type": "integer"
                },
                "ctx_size": {
                    "type": "integer"
                },
                "ctx_size_draft": {
                    "type": "integer"
                },
                "defrag_thold": {
                    "type": "number"
                },
                "device": {
                    "type": "string"
                },
                "device_draft": {
                    "type": "string"
                },
                "draft_max": {
                    "type": "integer"
                },
                "draft_min": {
                    "type": "integer"
                },
                "draft_p_min": {
                    "type": "number"
                },
                "dry_allowed_length": {
                    "type": "integer"
                },
                "dry_base": {
                    "type": "number"
                },
                "dry_multiplier": {
                    "type": "number"
                },
                "dry_penalty_last_n": {
                    "type": "integer"
                },
                "dry_sequence_breaker": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "dump_kv_cache": {
                    "type": "boolean"
                },
                "dynatemp_exp": {
                    "type": "number"
                },
                "dynatemp_range": {
                    "type": "number"
                },
                "embd_bge_small_en_default": {
                    "description": "Default model params",
                    "type": "boolean"
                },
                "embd_e5_small_en_default": {
                    "type": "boolean"
                },
                "embd_gte_small_default": {
                    "type": "boolean"
                },
                "embedding": {
                    "type": "boolean"
                },
                "escape": {
                    "type": "boolean"
                },
                "fim_qwen_14b_spec": {
                    "type": "boolean"
                },
                "fim_qwen_1_5b_default": {
                    "type": "boolean"
                },
                "fim_qwen_3b_default": {
                    "type": "boolean"
                },
                "fim_qwen_7b_default": {
                    "type": "boolean"
                },
                "fim_qwen_7b_spec": {
                    "type": "boolean"
                },
                "flash_attn": {
                    "type": "boolean"
                },
                "frequency_penalty": {
                    "type": "number"
                },
                "gpu_layers": {
                    "type": "integer"
                },
                "gpu_layers_draft": {
                    "type": "integer"
                },
                "grammar": {
                    "type": "string"
                },
                "grammar_file": {
                    "type": "string"
                },
                "hf_file": {
                    "type": "string"
                },
                "hf_file_v": {
                    "type": "string"
                },
                "hf_repo": {
                    "type": "string"
                },
                "hf_repo_draft": {
                    "type": "string"
                },
                "hf_repo_v": {
                    "type": "string"
                },
                "hf_token": {
                    "type": "string"
                },
                "host": {
                    "type": "string"
                "backend_type": {
                    "$ref": "#/definitions/backends.BackendType"
                },
                "idle_timeout": {
                    "description": "Idle timeout",
                    "type": "integer"
                },
                "ignore_eos": {
                    "type": "boolean"
                },
                "jinja": {
                    "type": "boolean"
                },
                "json_schema": {
                    "type": "string"
                },
                "json_schema_file": {
                    "type": "string"
                },
                "keep": {
                    "type": "integer"
                },
                "log_colors": {
                    "type": "boolean"
                },
                "log_disable": {
                    "type": "boolean"
                },
                "log_file": {
                    "type": "string"
                },
                "log_prefix": {
                    "type": "boolean"
                },
                "log_timestamps": {
                    "type": "boolean"
                },
                "logit_bias": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "lora": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "lora_init_without_apply": {
                    "type": "boolean"
                },
                "lora_scaled": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "main_gpu": {
                    "type": "integer"
                },
                "max_restarts": {
                    "type": "integer"
                },
                "metrics": {
                    "type": "boolean"
                },
                "min_p": {
                    "type": "number"
                },
                "mirostat": {
                    "type": "integer"
                },
                "mirostat_ent": {
                    "type": "number"
                },
                "mirostat_lr": {
                    "type": "number"
                },
                "mlock": {
                    "type": "boolean"
                },
                "mmproj": {
                    "type": "string"
                },
                "mmproj_url": {
                    "type": "string"
                },
                "model": {
                    "type": "string"
                },
                "model_draft": {
                    "type": "string"
                },
                "model_url": {
                    "type": "string"
                },
                "model_vocoder": {
                    "description": "Audio/TTS params",
                    "type": "string"
                },
                "no_cont_batching": {
                    "type": "boolean"
                },
                "no_context_shift": {
                    "description": "Example-specific params",
                    "type": "boolean"
                },
                "no_escape": {
                    "type": "boolean"
                },
                "no_kv_offload": {
                    "type": "boolean"
                },
                "no_mmap": {
                    "type": "boolean"
                },
                "no_mmproj": {
                    "type": "boolean"
                },
                "no_mmproj_offload": {
                    "type": "boolean"
                },
                "no_perf": {
                    "type": "boolean"
                },
                "no_prefill_assistant": {
                    "type": "boolean"
                },
                "no_slots": {
                    "type": "boolean"
                },
                "no_warmup": {
                    "type": "boolean"
                },
                "no_webui": {
                    "type": "boolean"
                },
                "numa": {
                    "type": "string"
                },
                "on_demand_start": {
                    "description": "On demand start",
                    "type": "boolean"
                },
                "override_kv": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "override_tensor": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "parallel": {
                    "type": "integer"
                },
                "path": {
                    "type": "string"
                },
                "poll": {
                    "type": "integer"
                },
                "poll_batch": {
                    "type": "integer"
                },
                "pooling": {
                    "type": "string"
                },
                "port": {
                    "type": "integer"
                },
                "predict": {
                    "type": "integer"
                },
                "presence_penalty": {
                    "type": "number"
                },
                "prio": {
                    "type": "integer"
                },
                "prio_batch": {
                    "type": "integer"
                },
                "props": {
                    "type": "boolean"
                },
                "reasoning_budget": {
                    "type": "integer"
                },
                "reasoning_format": {
                    "type": "string"
                },
                "repeat_last_n": {
                    "type": "integer"
                },
                "repeat_penalty": {
                    "type": "number"
                },
                "reranking": {
                    "type": "boolean"
                },
                "restart_delay": {
                    "type": "integer"
                },
                "rope_freq_base": {
                    "type": "number"
                },
                "rope_freq_scale": {
                    "type": "number"
                },
                "rope_scale": {
                    "type": "number"
                },
                "rope_scaling": {
                    "type": "string"
                },
                "samplers": {
                    "description": "Sampling params",
                    "type": "string"
                },
                "sampling_seq": {
                    "type": "string"
                },
                "seed": {
                    "type": "integer"
                },
                "slot_prompt_similarity": {
                    "type": "number"
                },
                "slot_save_path": {
                    "type": "string"
                },
                "slots": {
                    "type": "boolean"
                },
                "special": {
                    "type": "boolean"
                },
                "split_mode": {
                    "type": "string"
                },
                "spm_infill": {
                    "type": "boolean"
                },
                "ssl_cert_file": {
                    "type": "string"
                },
                "ssl_key_file": {
                    "type": "string"
                },
                "temp": {
                    "type": "number"
                },
                "tensor_split": {
                    "type": "string"
                },
                "threads": {
                    "type": "integer"
                },
                "threads_batch": {
                    "type": "integer"
                },
                "threads_http": {
                    "type": "integer"
                },
                "timeout": {
                    "type": "integer"
                },
                "top_k": {
                    "type": "integer"
                },
                "top_p": {
                    "type": "number"
                },
                "tts_use_guide_tokens": {
                    "type": "boolean"
                },
                "typical": {
                    "type": "number"
                },
                "ubatch_size": {
                    "type": "integer"
                },
                "verbose": {
                    "type": "boolean"
                },
                "verbose_prompt": {
                    "description": "Common params",
                    "type": "boolean"
                },
                "verbosity": {
                    "type": "integer"
                },
                "xtc_probability": {
                    "type": "number"
                },
                "xtc_threshold": {
                    "type": "number"
                },
                "yarn_attn_factor": {
                    "type": "number"
                },
                "yarn_beta_fast": {
                    "type": "number"
                },
                "yarn_beta_slow": {
                    "type": "number"
                },
                "yarn_ext_factor": {
                    "type": "number"
                },
                "yarn_orig_ctx": {
                    "description": "seconds",
                    "type": "integer"
                }
            }
@@ -1257,6 +934,14 @@
                    "type": "string"
                }
            }
        },
        "server.ParseCommandRequest": {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string"
                }
            }
        }
    }
}

apidocs/swagger.yaml

@@ -1,352 +1,35 @@
basePath: /api/v1
definitions:
  backends.BackendType:
    enum:
    - llama_cpp
    - mlx_lm
    - vllm
    type: string
    x-enum-varnames:
    - BackendTypeLlamaCpp
    - BackendTypeMlxLm
    - BackendTypeVllm
  instance.CreateInstanceOptions:
    properties:
      alias:
        type: string
      api_key:
        type: string
      api_key_file:
        type: string
      auto_restart:
        description: Auto restart
        type: boolean
      batch_size:
        type: integer
      cache_reuse:
        type: integer
      cache_type_k:
        type: string
      cache_type_k_draft:
        type: string
      cache_type_v:
        type: string
      cache_type_v_draft:
        type: string
      chat_template:
        type: string
      chat_template_file:
        type: string
      chat_template_kwargs:
        type: string
      check_tensors:
        type: boolean
      cont_batching:
        type: boolean
      control_vector:
        items:
          type: string
        type: array
      control_vector_layer_range:
        type: string
      control_vector_scaled:
        items:
          type: string
        type: array
      cpu_mask:
        type: string
      cpu_mask_batch:
        type: string
      cpu_range:
        type: string
      cpu_range_batch:
        type: string
      cpu_strict:
        type: integer
      cpu_strict_batch:
        type: integer
      ctx_size:
        type: integer
      ctx_size_draft:
        type: integer
      defrag_thold:
        type: number
      device:
        type: string
      device_draft:
        type: string
      draft_max:
        type: integer
      draft_min:
        type: integer
      draft_p_min:
        type: number
      dry_allowed_length:
        type: integer
      dry_base:
        type: number
      dry_multiplier:
        type: number
      dry_penalty_last_n:
        type: integer
      dry_sequence_breaker:
        items:
          type: string
        type: array
      dump_kv_cache:
        type: boolean
      dynatemp_exp:
        type: number
      dynatemp_range:
        type: number
      embd_bge_small_en_default:
        description: Default model params
        type: boolean
      embd_e5_small_en_default:
        type: boolean
      embd_gte_small_default:
        type: boolean
      embedding:
        type: boolean
      escape:
        type: boolean
      fim_qwen_1_5b_default:
        type: boolean
      fim_qwen_3b_default:
        type: boolean
      fim_qwen_7b_default:
        type: boolean
      fim_qwen_7b_spec:
        type: boolean
      fim_qwen_14b_spec:
        type: boolean
      flash_attn:
        type: boolean
      frequency_penalty:
        type: number
      gpu_layers:
        type: integer
      gpu_layers_draft:
        type: integer
      grammar:
        type: string
      grammar_file:
        type: string
      hf_file:
        type: string
      hf_file_v:
        type: string
      hf_repo:
        type: string
      hf_repo_draft:
        type: string
      hf_repo_v:
        type: string
      hf_token:
        type: string
      host:
        type: string
      backend_options:
        additionalProperties: {}
        type: object
      backend_type:
        $ref: '#/definitions/backends.BackendType'
      idle_timeout:
        description: Idle timeout
        type: integer
      ignore_eos:
        type: boolean
      jinja:
        type: boolean
      json_schema:
        type: string
      json_schema_file:
        type: string
      keep:
        type: integer
      log_colors:
        type: boolean
      log_disable:
        type: boolean
      log_file:
        type: string
      log_prefix:
        type: boolean
      log_timestamps:
        type: boolean
      logit_bias:
        items:
          type: string
        type: array
      lora:
        items:
          type: string
        type: array
      lora_init_without_apply:
        type: boolean
      lora_scaled:
        items:
          type: string
        type: array
      main_gpu:
        type: integer
      max_restarts:
        type: integer
      metrics:
        type: boolean
      min_p:
        type: number
      mirostat:
        type: integer
      mirostat_ent:
        type: number
      mirostat_lr:
        type: number
      mlock:
        type: boolean
      mmproj:
        type: string
      mmproj_url:
        type: string
      model:
        type: string
      model_draft:
        type: string
      model_url:
        type: string
      model_vocoder:
        description: Audio/TTS params
        type: string
      no_cont_batching:
        type: boolean
      no_context_shift:
        description: Example-specific params
        type: boolean
      no_escape:
        type: boolean
      no_kv_offload:
        type: boolean
      no_mmap:
        type: boolean
      no_mmproj:
        type: boolean
      no_mmproj_offload:
        type: boolean
      no_perf:
        type: boolean
      no_prefill_assistant:
        type: boolean
      no_slots:
        type: boolean
      no_warmup:
        type: boolean
      no_webui:
        type: boolean
      numa:
        type: string
      on_demand_start:
        description: On demand start
        type: boolean
      override_kv:
        items:
          type: string
        type: array
      override_tensor:
        items:
          type: string
        type: array
      parallel:
        type: integer
      path:
        type: string
      poll:
        type: integer
      poll_batch:
        type: integer
      pooling:
        type: string
      port:
        type: integer
      predict:
        type: integer
      presence_penalty:
        type: number
      prio:
        type: integer
      prio_batch:
        type: integer
      props:
        type: boolean
      reasoning_budget:
        type: integer
      reasoning_format:
        type: string
      repeat_last_n:
        type: integer
      repeat_penalty:
        type: number
      reranking:
        type: boolean
      restart_delay:
        type: integer
      rope_freq_base:
        type: number
      rope_freq_scale:
        type: number
      rope_scale:
        type: number
      rope_scaling:
        type: string
      samplers:
        description: Sampling params
        type: string
      sampling_seq:
        type: string
      seed:
        type: integer
      slot_prompt_similarity:
        type: number
      slot_save_path:
        type: string
      slots:
        type: boolean
      special:
        type: boolean
      split_mode:
        type: string
      spm_infill:
        type: boolean
      ssl_cert_file:
        type: string
      ssl_key_file:
        type: string
      temp:
        type: number
      tensor_split:
        type: string
      threads:
        type: integer
      threads_batch:
        type: integer
      threads_http:
        type: integer
      timeout:
        type: integer
      top_k:
        type: integer
      top_p:
        type: number
      tts_use_guide_tokens:
        type: boolean
      typical:
        type: number
      ubatch_size:
        type: integer
      verbose:
        type: boolean
      verbose_prompt:
        description: Common params
        type: boolean
      verbosity:
        type: integer
      xtc_probability:
        type: number
      xtc_threshold:
        type: number
      yarn_attn_factor:
        type: number
      yarn_beta_fast:
        type: number
      yarn_beta_slow:
        type: number
      yarn_ext_factor:
        type: number
      yarn_orig_ctx:
        description: seconds
        type: integer
    type: object
  instance.InstanceStatus:
@@ -391,6 +74,11 @@ definitions:
      object:
        type: string
    type: object
  server.ParseCommandRequest:
    properties:
      command:
        type: string
    type: object
info:
  contact: {}
  description: llamactl is a control server for managing Llama Server instances.
@@ -400,6 +88,102 @@ info:
  title: llamactl API
  version: "1.0"
paths:
  /backends/llama-cpp/parse-command:
    post:
      consumes:
      - application/json
      description: Parses a llama-server command string into instance options
      parameters:
      - description: Command to parse
        in: body
        name: request
        required: true
        schema:
          $ref: '#/definitions/server.ParseCommandRequest'
      produces:
      - application/json
      responses:
        "200":
          description: Parsed options
          schema:
            $ref: '#/definitions/instance.CreateInstanceOptions'
        "400":
          description: Invalid request or command
          schema:
            additionalProperties:
              type: string
            type: object
        "500":
          description: Internal Server Error
          schema:
            additionalProperties:
              type: string
            type: object
      security:
      - ApiKeyAuth: []
      summary: Parse llama-server command
      tags:
      - backends
  /backends/mlx/parse-command:
    post:
      consumes:
      - application/json
      description: Parses MLX-LM server command string into instance options
      parameters:
      - description: Command to parse
        in: body
        name: request
        required: true
        schema:
          $ref: '#/definitions/server.ParseCommandRequest'
      produces:
      - application/json
      responses:
        "200":
          description: Parsed options
          schema:
            $ref: '#/definitions/instance.CreateInstanceOptions'
        "400":
          description: Invalid request or command
          schema:
            additionalProperties:
              type: string
            type: object
      security:
      - ApiKeyAuth: []
      summary: Parse mlx_lm.server command
      tags:
      - backends
  /backends/vllm/parse-command:
    post:
      consumes:
      - application/json
      description: Parses a vLLM serve command string into instance options
      parameters:
      - description: Command to parse
        in: body
        name: request
        required: true
        schema:
          $ref: '#/definitions/server.ParseCommandRequest'
      produces:
      - application/json
      responses:
        "200":
          description: Parsed options
          schema:
            $ref: '#/definitions/instance.CreateInstanceOptions'
        "400":
          description: Invalid request or command
          schema:
            additionalProperties:
              type: string
            type: object
      security:
      - ApiKeyAuth: []
      summary: Parse vllm serve command
      tags:
      - backends
  /instances:
    get:
      description: Returns a list of all instances managed by the server

@@ -22,6 +22,7 @@ server:
backends:
  llama_executable: llama-server    # Path to llama-server executable
  mlx_lm_executable: mlx_lm.server  # Path to mlx_lm.server executable
  vllm_executable: vllm             # Path to vllm executable

instances:
  port_range: [8000, 9000]  # Port range for instances
@@ -94,11 +95,13 @@ server:
backends:
  llama_executable: "llama-server"    # Path to llama-server executable (default: "llama-server")
  mlx_lm_executable: "mlx_lm.server"  # Path to mlx_lm.server executable (default: "mlx_lm.server")
  vllm_executable: "vllm"             # Path to vllm executable (default: "vllm")
```

**Environment Variables:**
- `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable
- `LLAMACTL_MLX_LM_EXECUTABLE` - Path to mlx_lm.server executable
- `LLAMACTL_VLLM_EXECUTABLE` - Path to vllm executable

### Instance Configuration

@@ -37,6 +37,22 @@ pip install mlx-lm

Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

**For vLLM backend:**

vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

```bash
# Install via pip (requires Python 3.8+, GPU required)
pip install vllm

# Or in a virtual environment (recommended)
python -m venv vllm-env
source vllm-env/bin/activate
pip install vllm

# For production deployments, consider container-based installation
```

## Installation Methods

### Option 1: Download Binary (Recommended)

@@ -29,8 +29,9 @@ You should see the Llamactl web interface.
1. Click the "Add Instance" button
2. Fill in the instance configuration:
   - **Name**: Give your instance a descriptive name
   - **Model Path**: Path to your Llama.cpp model file
   - **Additional Options**: Any extra Llama.cpp parameters
   - **Backend Type**: Choose from llama.cpp, MLX, or vLLM
   - **Model**: Model path or identifier for your chosen backend
   - **Additional Options**: Backend-specific parameters

3. Click "Create Instance"

@@ -43,17 +44,46 @@ Once created, you can:
- **View logs** by clicking the logs button
- **Stop** the instance when needed

## Example Configuration
## Example Configurations

Here's a basic example configuration for a Llama 2 model:
Here are basic example configurations for each backend:

**llama.cpp backend:**
```json
{
  "name": "llama2-7b",
  "model_path": "/path/to/llama-2-7b-chat.gguf",
  "options": {
  "backend_type": "llama_cpp",
  "backend_options": {
    "model": "/path/to/llama-2-7b-chat.gguf",
    "threads": 4,
    "context_size": 2048
    "ctx_size": 2048,
    "gpu_layers": 32
  }
}
```

**MLX backend (macOS only):**
```json
{
  "name": "mistral-mlx",
  "backend_type": "mlx_lm",
  "backend_options": {
    "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "temp": 0.7,
    "max_tokens": 2048
  }
}
```

**vLLM backend:**
```json
{
  "name": "dialogpt-vllm",
  "backend_type": "vllm",
  "backend_options": {
    "model": "microsoft/DialoGPT-medium",
    "tensor_parallel_size": 2,
    "gpu_memory_utilization": 0.9
  }
}
```
@@ -66,12 +96,14 @@ You can also manage instances via the REST API:
# List all instances
curl http://localhost:8080/api/instances

# Create a new instance
curl -X POST http://localhost:8080/api/instances \
# Create a new llama.cpp instance
curl -X POST http://localhost:8080/api/instances/my-model \
  -H "Content-Type: application/json" \
  -d '{
    "name": "my-model",
    "model_path": "/path/to/model.gguf",
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/path/to/model.gguf"
    }
  }'

# Start an instance

@@ -170,7 +170,7 @@ POST /api/v1/instances/{name}/start
```json
{
  "name": "llama2-7b",
  "status": "starting",
  "status": "running",
  "created": 1705312200
}
```
@@ -191,7 +191,7 @@ POST /api/v1/instances/{name}/stop
```json
{
  "name": "llama2-7b",
  "status": "stopping",
  "status": "stopped",
  "created": 1705312200
}
```
@@ -208,7 +208,7 @@ POST /api/v1/instances/{name}/restart
```json
{
  "name": "llama2-7b",
  "status": "restarting",
  "status": "running",
  "created": 1705312200
}
```
@@ -401,6 +401,102 @@ curl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \
  }'
```

## Backend-Specific Endpoints

### Parse Commands

Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

#### Parse Llama.cpp Command

Parse a llama-server command string into instance options.

```http
POST /api/v1/backends/llama-cpp/parse-command
```

**Request Body:**
```json
{
  "command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
}
```

**Response:**
```json
{
  "backend_type": "llama_cpp",
  "llama_server_options": {
    "model": "/path/to/model.gguf",
    "ctx_size": 2048,
    "port": 8080
  }
}
```

#### Parse MLX-LM Command

Parse an MLX-LM server command string into instance options.

```http
POST /api/v1/backends/mlx/parse-command
```

**Request Body:**
```json
{
  "command": "mlx_lm.server --model /path/to/model --port 8080"
}
```

**Response:**
```json
{
  "backend_type": "mlx_lm",
  "mlx_server_options": {
    "model": "/path/to/model",
    "port": 8080
  }
}
```

#### Parse vLLM Command

Parse a vLLM serve command string into instance options.

```http
POST /api/v1/backends/vllm/parse-command
```

**Request Body:**
```json
{
  "command": "vllm serve /path/to/model --port 8080"
}
```

**Response:**
```json
{
  "backend_type": "vllm",
  "vllm_server_options": {
    "model": "/path/to/model",
    "port": 8080
  }
}
```

**Error Responses for Parse Commands:**
- `400 Bad Request`: Invalid request body, empty command, or parse error
- `500 Internal Server Error`: Encoding error
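
For clients that would rather not shell out to curl, these parse endpoints are plain JSON POSTs that work from any HTTP library. A minimal Go sketch of the vLLM variant; the server address and the `LLAMACTL_API_KEY` environment variable are illustrative assumptions, not fixed by the API:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

func main() {
	// Build the server.ParseCommandRequest body defined in the swagger schema.
	body, _ := json.Marshal(map[string]string{
		"command": "vllm serve /path/to/model --port 8080",
	})

	req, err := http.NewRequest(http.MethodPost,
		"http://localhost:8080/api/v1/backends/vllm/parse-command", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	// Management API key; variable name and value are deployment-specific assumptions.
	req.Header.Set("Authorization", "Bearer "+os.Getenv("LLAMACTL_API_KEY"))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// 200 returns instance.CreateInstanceOptions; 400/500 return a string map.
	var parsed map[string]any
	if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
		panic(err)
	}
	fmt.Printf("HTTP %d: %v\n", resp.StatusCode, parsed)
}
```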

## Auto-Generated Documentation

The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

1. Install the swag tool: `go install github.com/swaggo/swag/cmd/swag@latest`
2. Generate docs: `swag init -g cmd/server/main.go -o apidocs`

## Swagger Documentation

If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

@@ -1,6 +1,6 @@
# Managing Instances

Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API.
Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

## Overview

@@ -42,9 +42,11 @@ Each instance is displayed as a card showing:
3. **Choose Backend Type**:
   - **llama.cpp**: For GGUF models using llama-server
   - **MLX**: For MLX-optimized models (macOS only)
   - **vLLM**: For distributed serving and high-throughput inference
4. Configure model source:
   - **For llama.cpp**: GGUF model path or HuggingFace repo
   - **For MLX**: MLX model path or identifier (e.g., `mlx-community/Mistral-7B-Instruct-v0.3-4bit`)
   - **For vLLM**: HuggingFace model identifier (e.g., `microsoft/DialoGPT-medium`)
5. Configure optional instance management settings:
   - **Auto Restart**: Automatically restart instance on failure
   - **Max Restarts**: Maximum number of restart attempts
@@ -54,6 +56,7 @@ Each instance is displayed as a card showing:
6. Configure backend-specific options:
   - **llama.cpp**: Threads, context size, GPU layers, port, etc.
   - **MLX**: Temperature, top-p, adapter path, Python environment, etc.
   - **vLLM**: Tensor parallel size, GPU memory utilization, quantization, etc.
7. Click **"Create"** to save the instance

### Via API
@@ -87,6 +90,20 @@ curl -X POST http://localhost:8080/api/instances/my-mlx-instance \
    "max_restarts": 3
  }'

# Create vLLM instance
curl -X POST http://localhost:8080/api/instances/my-vllm-instance \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
      "model": "microsoft/DialoGPT-medium",
      "tensor_parallel_size": 2,
      "gpu_memory_utilization": 0.9
    },
    "auto_restart": true,
    "on_demand_start": true
  }'

# Create llama.cpp instance with HuggingFace model
curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
  -H "Content-Type: application/json" \
@@ -179,16 +196,17 @@ curl -X DELETE http://localhost:8080/api/instances/{name}

## Instance Proxy

Llamactl proxies all requests to the underlying backend instances (llama-server or MLX).
Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

```bash
# Get instance details
curl http://localhost:8080/api/instances/{name}/proxy/
```

Both backends provide OpenAI-compatible endpoints. Check the respective documentation:
All backends provide OpenAI-compatible endpoints. Check the respective documentation:
- [llama-server docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
- [MLX-LM docs](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md)
- [vLLM docs](https://docs.vllm.ai/en/latest/)

### Instance Health

@@ -5,5 +5,6 @@ type BackendType string
const (
	BackendTypeLlamaCpp BackendType = "llama_cpp"
	BackendTypeMlxLm    BackendType = "mlx_lm"
	BackendTypeVllm     BackendType = "vllm"
	// BackendTypeMlxVlm BackendType = "mlx_vlm" // Future expansion
)
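
A sketch of how a caller might dispatch on these constants. The helper below is hypothetical (it is not part of pkg/backends); the executable names simply mirror the configuration defaults documented above:

```go
// defaultExecutable is an illustrative helper, not part of the package:
// it maps a BackendType to the default executable from the config docs.
func defaultExecutable(t BackendType) (string, bool) {
	switch t {
	case BackendTypeLlamaCpp:
		return "llama-server", true
	case BackendTypeMlxLm:
		return "mlx_lm.server", true
	case BackendTypeVllm:
		return "vllm", true
	default:
		return "", false
	}
}
```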

pkg/backends/builder.go (new file, 70 lines)
@@ -0,0 +1,70 @@
package backends

import (
	"reflect"
	"strconv"
	"strings"
)

// BuildCommandArgs converts a struct to command line arguments
func BuildCommandArgs(options any, multipleFlags map[string]bool) []string {
	var args []string

	v := reflect.ValueOf(options).Elem()
	t := v.Type()

	for i := 0; i < v.NumField(); i++ {
		field := v.Field(i)
		fieldType := t.Field(i)

		if !field.CanInterface() {
			continue
		}

		jsonTag := fieldType.Tag.Get("json")
		if jsonTag == "" || jsonTag == "-" {
			continue
		}

		// Get flag name from JSON tag
		flagName := strings.Split(jsonTag, ",")[0]
		flagName = strings.ReplaceAll(flagName, "_", "-")

		switch field.Kind() {
		case reflect.Bool:
			if field.Bool() {
				args = append(args, "--"+flagName)
			}
		case reflect.Int:
			if field.Int() != 0 {
				args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10))
			}
		case reflect.Float64:
			if field.Float() != 0 {
				args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64))
			}
		case reflect.String:
			if field.String() != "" {
				args = append(args, "--"+flagName, field.String())
			}
		case reflect.Slice:
			if field.Type().Elem().Kind() == reflect.String && field.Len() > 0 {
				if multipleFlags[flagName] {
					// Multiple flags: --flag value1 --flag value2
					for j := 0; j < field.Len(); j++ {
						args = append(args, "--"+flagName, field.Index(j).String())
					}
				} else {
					// Comma-separated: --flag value1,value2
					var values []string
					for j := 0; j < field.Len(); j++ {
						values = append(values, field.Index(j).String())
					}
					args = append(args, "--"+flagName, strings.Join(values, ","))
				}
			}
		}
	}

	return args
}
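
A usage sketch for the new helper. The options struct here is hypothetical (real callers pass backend option structs such as LlamaServerOptions); it shows the three behaviors the reflection loop implements: zero values are skipped, json tag names become kebab-case flags, and slices are either repeated or comma-joined depending on multipleFlags:

```go
package main

import (
	"fmt"

	"llamactl/pkg/backends"
)

// exampleOptions is illustrative only; any struct with json tags works.
type exampleOptions struct {
	Model     string   `json:"model,omitempty"`
	GPULayers int      `json:"gpu_layers,omitempty"`
	Verbose   bool     `json:"verbose,omitempty"`
	Lora      []string `json:"lora,omitempty"`
}

func main() {
	opts := &exampleOptions{
		Model:     "/models/llama-2-7b.gguf",
		GPULayers: 32,
		Verbose:   true,
		Lora:      []string{"a.bin", "b.bin"},
	}
	// Marking "lora" as multi-valued repeats the flag instead of joining values.
	args := backends.BuildCommandArgs(opts, map[string]bool{"lora": true})
	fmt.Println(args)
	// [--model /models/llama-2-7b.gguf --gpu-layers 32 --verbose --lora a.bin --lora b.bin]
}
```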
@@ -2,9 +2,9 @@ package llamacpp

import (
    "encoding/json"
    "llamactl/pkg/backends"
    "reflect"
    "strconv"
    "strings"
)

type LlamaServerOptions struct {
@@ -315,62 +315,44 @@ func (o *LlamaServerOptions) UnmarshalJSON(data []byte) error {

// BuildCommandArgs converts InstanceOptions to command line arguments
func (o *LlamaServerOptions) BuildCommandArgs() []string {
    var args []string
    // Llama uses multiple flags for arrays by default (not comma-separated)
    multipleFlags := map[string]bool{
        "override-tensor":       true,
        "override-kv":           true,
        "lora":                  true,
        "lora-scaled":           true,
        "control-vector":        true,
        "control-vector-scaled": true,
        "dry-sequence-breaker":  true,
        "logit-bias":            true,
    }
    return backends.BuildCommandArgs(o, multipleFlags)
}

    v := reflect.ValueOf(o).Elem()
    t := v.Type()

    for i := 0; i < v.NumField(); i++ {
        field := v.Field(i)
        fieldType := t.Field(i)

        // Skip unexported fields
        if !field.CanInterface() {
            continue
        }

        // Get the JSON tag to determine the flag name
        jsonTag := fieldType.Tag.Get("json")
        if jsonTag == "" || jsonTag == "-" {
            continue
        }

        // Remove ",omitempty" from the tag
        flagName := jsonTag
        if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 {
            flagName = jsonTag[:commaIndex]
        }

        // Convert snake_case to kebab-case for CLI flags
        flagName = strings.ReplaceAll(flagName, "_", "-")

        // Add the appropriate arguments based on field type and value
        switch field.Kind() {
        case reflect.Bool:
            if field.Bool() {
                args = append(args, "--"+flagName)
            }
        case reflect.Int:
            if field.Int() != 0 {
                args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10))
            }
        case reflect.Float64:
            if field.Float() != 0 {
                args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64))
            }
        case reflect.String:
            if field.String() != "" {
                args = append(args, "--"+flagName, field.String())
            }
        case reflect.Slice:
            if field.Type().Elem().Kind() == reflect.String {
                // Handle []string fields
                for j := 0; j < field.Len(); j++ {
                    args = append(args, "--"+flagName, field.Index(j).String())
                }
            }
        }
// ParseLlamaCommand parses a llama-server command string into LlamaServerOptions
// Supports multiple formats:
// 1. Full command: "llama-server --model file.gguf"
// 2. Full path: "/usr/local/bin/llama-server --model file.gguf"
// 3. Args only: "--model file.gguf --gpu-layers 32"
// 4. Multiline commands with backslashes
func ParseLlamaCommand(command string) (*LlamaServerOptions, error) {
    executableNames := []string{"llama-server"}
    var subcommandNames []string // Llama has no subcommands
    multiValuedFlags := map[string]bool{
        "override_tensor":       true,
        "override_kv":           true,
        "lora":                  true,
        "lora_scaled":           true,
        "control_vector":        true,
        "control_vector_scaled": true,
        "dry_sequence_breaker":  true,
        "logit_bias":            true,
    }

    return args
    var llamaOptions LlamaServerOptions
    if err := backends.ParseCommand(command, executableNames, subcommandNames, multiValuedFlags, &llamaOptions); err != nil {
        return nil, err
    }

    return &llamaOptions, nil
}

@@ -378,6 +378,121 @@ func TestUnmarshalJSON_ArrayFields(t *testing.T) {
    }
}

func TestParseLlamaCommand(t *testing.T) {
    tests := []struct {
        name      string
        command   string
        expectErr bool
    }{
        {
            name:      "basic command",
            command:   "llama-server --model /path/to/model.gguf --gpu-layers 32",
            expectErr: false,
        },
        {
            name:      "args only",
            command:   "--model /path/to/model.gguf --ctx-size 4096",
            expectErr: false,
        },
        {
            name:      "mixed flag formats",
            command:   "llama-server --model=/path/model.gguf --gpu-layers 16 --verbose",
            expectErr: false,
        },
        {
            name:      "quoted strings",
            command:   `llama-server --model test.gguf --api-key "sk-1234567890abcdef"`,
            expectErr: false,
        },
        {
            name:      "empty command",
            command:   "",
            expectErr: true,
        },
        {
            name:      "unterminated quote",
            command:   `llama-server --model test.gguf --api-key "unterminated`,
            expectErr: true,
        },
        {
            name:      "malformed flag",
            command:   "llama-server ---model test.gguf",
            expectErr: true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            result, err := llamacpp.ParseLlamaCommand(tt.command)

            if tt.expectErr {
                if err == nil {
                    t.Errorf("expected error but got none")
                }
                return
            }

            if err != nil {
                t.Errorf("unexpected error: %v", err)
                return
            }

            if result == nil {
                t.Errorf("expected result but got nil")
            }
        })
    }
}

func TestParseLlamaCommandValues(t *testing.T) {
    command := "llama-server --model /test/model.gguf --gpu-layers 32 --temp 0.7 --verbose --no-mmap"
    result, err := llamacpp.ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "/test/model.gguf" {
        t.Errorf("expected model '/test/model.gguf', got '%s'", result.Model)
    }

    if result.GPULayers != 32 {
        t.Errorf("expected gpu_layers 32, got %d", result.GPULayers)
    }

    if result.Temperature != 0.7 {
        t.Errorf("expected temperature 0.7, got %f", result.Temperature)
    }

    if !result.Verbose {
        t.Errorf("expected verbose to be true")
    }

    if !result.NoMmap {
        t.Errorf("expected no_mmap to be true")
    }
}

func TestParseLlamaCommandArrays(t *testing.T) {
    command := "llama-server --model test.gguf --lora adapter1.bin --lora=adapter2.bin"
    result, err := llamacpp.ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if len(result.Lora) != 2 {
        t.Errorf("expected 2 lora adapters, got %d", len(result.Lora))
    }

    expected := []string{"adapter1.bin", "adapter2.bin"}
    for i, v := range expected {
        if result.Lora[i] != v {
            t.Errorf("expected lora[%d]=%s got %s", i, v, result.Lora[i])
        }
    }
}

// Helper functions
func contains(slice []string, item string) bool {
    return slices.Contains(slice, item)

@@ -1,286 +0,0 @@
package llamacpp

import (
    "encoding/json"
    "errors"
    "fmt"
    "path/filepath"
    "regexp"
    "strconv"
    "strings"
)

// ParseLlamaCommand parses a llama-server command string into LlamaServerOptions
// Supports multiple formats:
// 1. Full command: "llama-server --model file.gguf"
// 2. Full path: "/usr/local/bin/llama-server --model file.gguf"
// 3. Args only: "--model file.gguf --gpu-layers 32"
// 4. Multiline commands with backslashes
func ParseLlamaCommand(command string) (*LlamaServerOptions, error) {
    // 1. Normalize the command - handle multiline with backslashes
    trimmed := normalizeMultilineCommand(command)
    if trimmed == "" {
        return nil, fmt.Errorf("command cannot be empty")
    }

    // 2. Extract arguments from command
    args, err := extractArgumentsFromCommand(trimmed)
    if err != nil {
        return nil, err
    }

    // 3. Parse arguments into map
    options := make(map[string]any)

    // Known multi-valued flags (snake_case form)
    multiValued := map[string]struct{}{
        "override_tensor":       {},
        "override_kv":           {},
        "lora":                  {},
        "lora_scaled":           {},
        "control_vector":        {},
        "control_vector_scaled": {},
        "dry_sequence_breaker":  {},
        "logit_bias":            {},
    }

    i := 0
    for i < len(args) {
        arg := args[i]

        if !strings.HasPrefix(arg, "-") { // skip positional / stray values
            i++
            continue
        }

        // Reject malformed flags with more than two leading dashes (e.g. ---model) to surface user mistakes
        if strings.HasPrefix(arg, "---") {
            return nil, fmt.Errorf("malformed flag: %s", arg)
        }

        // Unified parsing for --flag=value vs --flag value
        var rawFlag, rawValue string
        hasEquals := false
        if strings.Contains(arg, "=") {
            parts := strings.SplitN(arg, "=", 2)
            rawFlag = parts[0]
            rawValue = parts[1] // may be empty string
            hasEquals = true
        } else {
            rawFlag = arg
        }

        flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-")
        flagName := strings.ReplaceAll(flagCore, "-", "_")

        // Detect value if not in equals form
        valueProvided := hasEquals
        if !hasEquals {
            if i+1 < len(args) && !isFlag(args[i+1]) { // next token is value
                rawValue = args[i+1]
                valueProvided = true
            }
        }

        // Determine if multi-valued flag
        _, isMulti := multiValued[flagName]

        // Normalization helper: ensure slice for multi-valued flags
        appendValue := func(valStr string) {
            if existing, ok := options[flagName]; ok {
                // Existing value; ensure slice semantics for multi-valued flags or repeated occurrences
                if slice, ok := existing.([]string); ok {
                    options[flagName] = append(slice, valStr)
                    return
                }
                // Convert scalar to slice
                options[flagName] = []string{fmt.Sprintf("%v", existing), valStr}
                return
            }
            // First value
            if isMulti {
                options[flagName] = []string{valStr}
            } else {
                // We'll parse type below for single-valued flags
                options[flagName] = valStr
            }
        }

        if valueProvided {
            // Use raw token for multi-valued flags; else allow typed parsing
            appendValue(rawValue)
            if !isMulti { // convert to typed value if scalar
                if strVal, ok := options[flagName].(string); ok { // still scalar
                    options[flagName] = parseValue(strVal)
                }
            }
            // Advance index: if we consumed a following token as value (non equals form), skip it
            if !hasEquals && i+1 < len(args) && rawValue == args[i+1] {
                i += 2
            } else {
                i++
            }
            continue
        }

        // Boolean flag (no value)
        options[flagName] = true
        i++
    }

    // 4. Convert to LlamaServerOptions using existing UnmarshalJSON
    jsonData, err := json.Marshal(options)
    if err != nil {
        return nil, fmt.Errorf("failed to marshal parsed options: %w", err)
    }

    var llamaOptions LlamaServerOptions
    if err := json.Unmarshal(jsonData, &llamaOptions); err != nil {
        return nil, fmt.Errorf("failed to parse command options: %w", err)
    }

    // 5. Return LlamaServerOptions
    return &llamaOptions, nil
}

// parseValue attempts to parse a string value into the most appropriate type
func parseValue(value string) any {
    // Surrounding matching quotes (single or double)
    if l := len(value); l >= 2 {
        if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') {
            value = value[1 : l-1]
        }
    }

    lower := strings.ToLower(value)
    if lower == "true" {
        return true
    }
    if lower == "false" {
        return false
    }

    if intVal, err := strconv.Atoi(value); err == nil {
        return intVal
    }
    if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
        return floatVal
    }
    return value
}

// normalizeMultilineCommand handles multiline commands with backslashes
func normalizeMultilineCommand(command string) string {
    // Handle escaped newlines (backslash followed by newline)
    re := regexp.MustCompile(`\\\s*\n\s*`)
    normalized := re.ReplaceAllString(command, " ")

    // Clean up extra whitespace
    re = regexp.MustCompile(`\s+`)
    normalized = re.ReplaceAllString(normalized, " ")

    return strings.TrimSpace(normalized)
}

// extractArgumentsFromCommand extracts arguments from various command formats
func extractArgumentsFromCommand(command string) ([]string, error) {
    // Split command into tokens respecting quotes
    tokens, err := splitCommandTokens(command)
    if err != nil {
        return nil, err
    }

    if len(tokens) == 0 {
        return nil, fmt.Errorf("no command tokens found")
    }

    // Check if first token looks like an executable
    firstToken := tokens[0]

    // Case 1: Full path to executable (contains path separator or ends with llama-server)
    if strings.Contains(firstToken, string(filepath.Separator)) ||
        strings.HasSuffix(filepath.Base(firstToken), "llama-server") {
        return tokens[1:], nil // Return everything except the executable
    }

    // Case 2: Just "llama-server" command
    if strings.ToLower(firstToken) == "llama-server" {
        return tokens[1:], nil // Return everything except the command
    }

    // Case 3: Arguments only (starts with a flag)
    if strings.HasPrefix(firstToken, "-") {
        return tokens, nil // Return all tokens as arguments
    }

    // Case 4: Unknown format - might be a different executable name
    // Be permissive and assume it's the executable
    return tokens[1:], nil
}

// splitCommandTokens splits a command string into tokens, respecting quotes
func splitCommandTokens(command string) ([]string, error) {
    var tokens []string
    var current strings.Builder
    inQuotes := false
    quoteChar := byte(0)
    escaped := false

    for i := 0; i < len(command); i++ {
        c := command[i]

        if escaped {
            current.WriteByte(c)
            escaped = false
            continue
        }

        if c == '\\' {
            escaped = true
            current.WriteByte(c)
            continue
        }

        if !inQuotes && (c == '"' || c == '\'') {
            inQuotes = true
            quoteChar = c
            current.WriteByte(c)
        } else if inQuotes && c == quoteChar {
            inQuotes = false
            quoteChar = 0
            current.WriteByte(c)
        } else if !inQuotes && (c == ' ' || c == '\t') {
            if current.Len() > 0 {
                tokens = append(tokens, current.String())
                current.Reset()
            }
        } else {
            current.WriteByte(c)
        }
    }

    if inQuotes {
        return nil, errors.New("unterminated quoted string")
    }

    if current.Len() > 0 {
        tokens = append(tokens, current.String())
    }

    return tokens, nil
}

// isFlag determines if a string is a command line flag or a value
// Handles the special case where negative numbers should be treated as values, not flags
func isFlag(arg string) bool {
    if !strings.HasPrefix(arg, "-") {
        return false
    }

    // Special case: if it's a negative number, treat it as a value
    if _, err := strconv.ParseFloat(arg, 64); err == nil {
        return false
    }

    return true
}
@@ -1,413 +0,0 @@
package llamacpp

import (
    "testing"
)

func TestParseLlamaCommand(t *testing.T) {
    tests := []struct {
        name      string
        command   string
        expectErr bool
    }{
        {
            name:      "basic command with model",
            command:   "llama-server --model /path/to/model.gguf",
            expectErr: false,
        },
        {
            name:      "command with multiple flags",
            command:   "llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096",
            expectErr: false,
        },
        {
            name:      "command with short flags",
            command:   "llama-server -m /path/to/model.gguf -ngl 32 -c 4096",
            expectErr: false,
        },
        {
            name:      "command with equals format",
            command:   "llama-server --model=/path/to/model.gguf --gpu-layers=32",
            expectErr: false,
        },
        {
            name:      "command with boolean flags",
            command:   "llama-server --model /path/to/model.gguf --verbose --no-mmap",
            expectErr: false,
        },
        {
            name:      "empty command",
            command:   "",
            expectErr: true,
        },
        {
            name:      "case insensitive command",
            command:   "LLAMA-SERVER --model /path/to/model.gguf",
            expectErr: false,
        },
        // New test cases for improved functionality
        {
            name:      "args only without llama-server",
            command:   "--model /path/to/model.gguf --gpu-layers 32",
            expectErr: false,
        },
        {
            name:      "full path to executable",
            command:   "/usr/local/bin/llama-server --model /path/to/model.gguf",
            expectErr: false,
        },
        {
            name:      "negative number handling",
            command:   "llama-server --gpu-layers -1 --model test.gguf",
            expectErr: false,
        },
        {
            name:      "multiline command with backslashes",
            command:   "llama-server --model /path/to/model.gguf \\\n --ctx-size 4096 \\\n --batch-size 512",
            expectErr: false,
        },
        {
            name:      "quoted string with special characters",
            command:   `llama-server --model test.gguf --chat-template "{% for message in messages %}{{ message.role }}: {{ message.content }}\n{% endfor %}"`,
            expectErr: false,
        },
        {
            name:      "unterminated quoted string",
            command:   `llama-server --model test.gguf --chat-template "unterminated quote`,
            expectErr: true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            result, err := ParseLlamaCommand(tt.command)

            if tt.expectErr {
                if err == nil {
                    t.Errorf("expected error but got none")
                }
                return
            }

            if err != nil {
                t.Errorf("unexpected error: %v", err)
                return
            }

            if result == nil {
                t.Errorf("expected result but got nil")
                return
            }
        })
    }
}

func TestParseLlamaCommandSpecificValues(t *testing.T) {
    // Test specific value parsing
    command := "llama-server --model /test/model.gguf --gpu-layers 32 --ctx-size 4096 --verbose"
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "/test/model.gguf" {
        t.Errorf("expected model '/test/model.gguf', got '%s'", result.Model)
    }

    if result.GPULayers != 32 {
        t.Errorf("expected gpu_layers 32, got %d", result.GPULayers)
    }

    if result.CtxSize != 4096 {
        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
    }

    if !result.Verbose {
        t.Errorf("expected verbose to be true, got %v", result.Verbose)
    }
}

func TestParseLlamaCommandArrayFlags(t *testing.T) {
    // Test array flag handling (critical for lora, override-tensor, etc.)
    command := "llama-server --model test.gguf --lora adapter1.bin --lora adapter2.bin"
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if len(result.Lora) != 2 {
        t.Errorf("expected 2 lora adapters, got %d", len(result.Lora))
    }

    if result.Lora[0] != "adapter1.bin" || result.Lora[1] != "adapter2.bin" {
        t.Errorf("expected lora adapters [adapter1.bin, adapter2.bin], got %v", result.Lora)
    }
}

func TestParseLlamaCommandMixedFormats(t *testing.T) {
    // Test mixing --flag=value and --flag value formats
    command := "llama-server --model=/path/model.gguf --gpu-layers 16 --batch-size=512 --verbose"
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "/path/model.gguf" {
        t.Errorf("expected model '/path/model.gguf', got '%s'", result.Model)
    }

    if result.GPULayers != 16 {
        t.Errorf("expected gpu_layers 16, got %d", result.GPULayers)
    }

    if result.BatchSize != 512 {
        t.Errorf("expected batch_size 512, got %d", result.BatchSize)
    }

    if !result.Verbose {
        t.Errorf("expected verbose to be true, got %v", result.Verbose)
    }
}

func TestParseLlamaCommandTypeConversion(t *testing.T) {
    // Test that values are converted to appropriate types
    command := "llama-server --model test.gguf --temp 0.7 --top-k 40 --no-mmap"
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Temperature != 0.7 {
        t.Errorf("expected temperature 0.7, got %f", result.Temperature)
    }

    if result.TopK != 40 {
        t.Errorf("expected top_k 40, got %d", result.TopK)
    }

    if !result.NoMmap {
        t.Errorf("expected no_mmap to be true, got %v", result.NoMmap)
    }
}

func TestParseLlamaCommandArgsOnly(t *testing.T) {
    // Test parsing arguments without llama-server command
    command := "--model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096"
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "/path/to/model.gguf" {
        t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model)
    }

    if result.GPULayers != 32 {
        t.Errorf("expected gpu_layers 32, got %d", result.GPULayers)
    }

    if result.CtxSize != 4096 {
        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
    }
}

func TestParseLlamaCommandFullPath(t *testing.T) {
    // Test full path to executable
    command := "/usr/local/bin/llama-server --model test.gguf --gpu-layers 16"
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "test.gguf" {
        t.Errorf("expected model 'test.gguf', got '%s'", result.Model)
    }

    if result.GPULayers != 16 {
        t.Errorf("expected gpu_layers 16, got %d", result.GPULayers)
    }
}

func TestParseLlamaCommandNegativeNumbers(t *testing.T) {
    // Test negative number parsing
    command := "llama-server --model test.gguf --gpu-layers -1 --seed -12345"
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.GPULayers != -1 {
        t.Errorf("expected gpu_layers -1, got %d", result.GPULayers)
    }

    if result.Seed != -12345 {
        t.Errorf("expected seed -12345, got %d", result.Seed)
    }
}

func TestParseLlamaCommandMultiline(t *testing.T) {
    // Test multiline command with backslashes
    command := `llama-server --model /path/to/model.gguf \
        --ctx-size 4096 \
        --batch-size 512 \
        --gpu-layers 32`

    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "/path/to/model.gguf" {
        t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model)
    }

    if result.CtxSize != 4096 {
        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
    }

    if result.BatchSize != 512 {
        t.Errorf("expected batch_size 512, got %d", result.BatchSize)
    }

    if result.GPULayers != 32 {
        t.Errorf("expected gpu_layers 32, got %d", result.GPULayers)
    }
}

func TestParseLlamaCommandQuotedStrings(t *testing.T) {
    // Test quoted strings with special characters
    command := `llama-server --model test.gguf --api-key "sk-1234567890abcdef" --chat-template "User: {user}\nAssistant: "`
    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "test.gguf" {
        t.Errorf("expected model 'test.gguf', got '%s'", result.Model)
    }

    if result.APIKey != "sk-1234567890abcdef" {
        t.Errorf("expected api_key 'sk-1234567890abcdef', got '%s'", result.APIKey)
    }

    expectedTemplate := "User: {user}\\nAssistant: "
    if result.ChatTemplate != expectedTemplate {
        t.Errorf("expected chat_template '%s', got '%s'", expectedTemplate, result.ChatTemplate)
    }
}

func TestParseLlamaCommandUnslothExample(t *testing.T) {
    // Test with realistic unsloth-style command
    command := `llama-server --model /path/to/model.gguf \
        --ctx-size 4096 \
        --batch-size 512 \
        --gpu-layers -1 \
        --temp 0.7 \
        --repeat-penalty 1.1 \
        --top-k 40 \
        --top-p 0.95 \
        --host 0.0.0.0 \
        --port 8000 \
        --api-key "sk-1234567890abcdef"`

    result, err := ParseLlamaCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    // Verify key fields
    if result.Model != "/path/to/model.gguf" {
        t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model)
    }

    if result.CtxSize != 4096 {
        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
    }

    if result.BatchSize != 512 {
        t.Errorf("expected batch_size 512, got %d", result.BatchSize)
    }

    if result.GPULayers != -1 {
        t.Errorf("expected gpu_layers -1, got %d", result.GPULayers)
    }

    if result.Temperature != 0.7 {
        t.Errorf("expected temperature 0.7, got %f", result.Temperature)
    }

    if result.RepeatPenalty != 1.1 {
        t.Errorf("expected repeat_penalty 1.1, got %f", result.RepeatPenalty)
    }

    if result.TopK != 40 {
        t.Errorf("expected top_k 40, got %d", result.TopK)
    }

    if result.TopP != 0.95 {
        t.Errorf("expected top_p 0.95, got %f", result.TopP)
    }

    if result.Host != "0.0.0.0" {
        t.Errorf("expected host '0.0.0.0', got '%s'", result.Host)
    }

    if result.Port != 8000 {
        t.Errorf("expected port 8000, got %d", result.Port)
    }

    if result.APIKey != "sk-1234567890abcdef" {
        t.Errorf("expected api_key 'sk-1234567890abcdef', got '%s'", result.APIKey)
    }
}

// Focused additional edge case tests (kept minimal per guidance)
func TestParseLlamaCommandSingleQuotedValue(t *testing.T) {
    cmd := "llama-server --model 'my model.gguf' --alias 'Test Alias'"
    result, err := ParseLlamaCommand(cmd)
    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }
    if result.Model != "my model.gguf" {
        t.Errorf("expected model 'my model.gguf', got '%s'", result.Model)
    }
    if result.Alias != "Test Alias" {
        t.Errorf("expected alias 'Test Alias', got '%s'", result.Alias)
    }
}

func TestParseLlamaCommandMixedArrayForms(t *testing.T) {
    // Same multi-value flag using --flag value and --flag=value forms
    cmd := "llama-server --lora adapter1.bin --lora=adapter2.bin --lora adapter3.bin"
    result, err := ParseLlamaCommand(cmd)
    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }
    if len(result.Lora) != 3 {
        t.Fatalf("expected 3 lora values, got %d (%v)", len(result.Lora), result.Lora)
    }
    expected := []string{"adapter1.bin", "adapter2.bin", "adapter3.bin"}
    for i, v := range expected {
        if result.Lora[i] != v {
            t.Errorf("expected lora[%d]=%s got %s", i, v, result.Lora[i])
        }
    }
}

func TestParseLlamaCommandMalformedFlag(t *testing.T) {
    cmd := "llama-server ---model test.gguf"
    _, err := ParseLlamaCommand(cmd)
    if err == nil {
        t.Fatalf("expected error for malformed flag but got none")
    }
}
@@ -1,16 +1,14 @@
package mlx

import (
    "encoding/json"
    "reflect"
    "strconv"
    "llamactl/pkg/backends"
)

type MlxServerOptions struct {
    // Basic connection options
    Model string `json:"model,omitempty"`
    Host  string `json:"host,omitempty"`
    Port  int    `json:"port,omitempty"`
    Model string `json:"model,omitempty"`
    Host  string `json:"host,omitempty"`
    Port  int    `json:"port,omitempty"`

    // Model and adapter options
    AdapterPath string `json:"adapter_path,omitempty"`
@@ -19,187 +17,40 @@ type MlxServerOptions struct {
    TrustRemoteCode bool `json:"trust_remote_code,omitempty"`

    // Logging and templates
    LogLevel               string `json:"log_level,omitempty"`
    ChatTemplate           string `json:"chat_template,omitempty"`
    UseDefaultChatTemplate bool   `json:"use_default_chat_template,omitempty"`
    ChatTemplateArgs       string `json:"chat_template_args,omitempty"` // JSON string
    LogLevel               string `json:"log_level,omitempty"`
    ChatTemplate           string `json:"chat_template,omitempty"`
    UseDefaultChatTemplate bool   `json:"use_default_chat_template,omitempty"`
    ChatTemplateArgs       string `json:"chat_template_args,omitempty"` // JSON string

    // Sampling defaults
    Temp      float64 `json:"temp,omitempty"` // Note: MLX uses "temp" not "temperature"
    TopP      float64 `json:"top_p,omitempty"`
    TopK      int     `json:"top_k,omitempty"`
    MinP      float64 `json:"min_p,omitempty"`
    MaxTokens int     `json:"max_tokens,omitempty"`
}

// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names
func (o *MlxServerOptions) UnmarshalJSON(data []byte) error {
    // First unmarshal into a map to handle multiple field names
    var raw map[string]any
    if err := json.Unmarshal(data, &raw); err != nil {
        return err
    }

    // Create a temporary struct for standard unmarshaling
    type tempOptions MlxServerOptions
    temp := tempOptions{}

    // Standard unmarshal first
    if err := json.Unmarshal(data, &temp); err != nil {
        return err
    }

    // Copy to our struct
    *o = MlxServerOptions(temp)

    // Handle alternative field names
    fieldMappings := map[string]string{
        // Basic connection options
        "m":    "model",
        "host": "host",
        "port": "port",
        // "python_path": "python_path", // removed

        // Model and adapter options
        "adapter-path":      "adapter_path",
        "draft-model":       "draft_model",
        "num-draft-tokens":  "num_draft_tokens",
        "trust-remote-code": "trust_remote_code",

        // Logging and templates
        "log-level":                 "log_level",
        "chat-template":             "chat_template",
        "use-default-chat-template": "use_default_chat_template",
        "chat-template-args":        "chat_template_args",

        // Sampling defaults
        "temperature": "temp", // Support both temp and temperature
        "top-p":       "top_p",
        "top-k":       "top_k",
        "min-p":       "min_p",
        "max-tokens":  "max_tokens",
    }

    // Process alternative field names
    for altName, canonicalName := range fieldMappings {
        if value, exists := raw[altName]; exists {
            // Use reflection to set the field value
            v := reflect.ValueOf(o).Elem()
            field := v.FieldByNameFunc(func(fieldName string) bool {
                field, _ := v.Type().FieldByName(fieldName)
                jsonTag := field.Tag.Get("json")
                return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName
            })

            if field.IsValid() && field.CanSet() {
                switch field.Kind() {
                case reflect.Int:
                    if intVal, ok := value.(float64); ok {
                        field.SetInt(int64(intVal))
                    } else if strVal, ok := value.(string); ok {
                        if intVal, err := strconv.Atoi(strVal); err == nil {
                            field.SetInt(int64(intVal))
                        }
                    }
                case reflect.Float64:
                    if floatVal, ok := value.(float64); ok {
                        field.SetFloat(floatVal)
                    } else if strVal, ok := value.(string); ok {
                        if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil {
                            field.SetFloat(floatVal)
                        }
                    }
                case reflect.String:
                    if strVal, ok := value.(string); ok {
                        field.SetString(strVal)
                    }
                case reflect.Bool:
                    if boolVal, ok := value.(bool); ok {
                        field.SetBool(boolVal)
                    }
                }
            }
        }
    }

    return nil
}

// NewMlxServerOptions creates MlxServerOptions with MLX defaults
func NewMlxServerOptions() *MlxServerOptions {
    return &MlxServerOptions{
        Host:             "127.0.0.1", // MLX default (different from llama-server)
        Port:             8080,        // MLX default
        NumDraftTokens:   3,           // MLX default for speculative decoding
        LogLevel:         "INFO",      // MLX default
        Temp:             0.0,         // MLX default
        TopP:             1.0,         // MLX default
        TopK:             0,           // MLX default (disabled)
        MinP:             0.0,         // MLX default (disabled)
        MaxTokens:        512,         // MLX default
        ChatTemplateArgs: "{}",        // MLX default (empty JSON object)
    }
    Temp      float64 `json:"temp,omitempty"`
    TopP      float64 `json:"top_p,omitempty"`
    TopK      int     `json:"top_k,omitempty"`
    MinP      float64 `json:"min_p,omitempty"`
    MaxTokens int     `json:"max_tokens,omitempty"`
}

// BuildCommandArgs converts to command line arguments
func (o *MlxServerOptions) BuildCommandArgs() []string {
    var args []string

    // Required and basic options
    if o.Model != "" {
        args = append(args, "--model", o.Model)
    }
    if o.Host != "" {
        args = append(args, "--host", o.Host)
    }
    if o.Port != 0 {
        args = append(args, "--port", strconv.Itoa(o.Port))
    }

    // Model and adapter options
    if o.AdapterPath != "" {
        args = append(args, "--adapter-path", o.AdapterPath)
    }
    if o.DraftModel != "" {
        args = append(args, "--draft-model", o.DraftModel)
    }
    if o.NumDraftTokens != 0 {
        args = append(args, "--num-draft-tokens", strconv.Itoa(o.NumDraftTokens))
    }
    if o.TrustRemoteCode {
        args = append(args, "--trust-remote-code")
    }

    // Logging and templates
    if o.LogLevel != "" {
        args = append(args, "--log-level", o.LogLevel)
    }
    if o.ChatTemplate != "" {
        args = append(args, "--chat-template", o.ChatTemplate)
    }
    if o.UseDefaultChatTemplate {
        args = append(args, "--use-default-chat-template")
    }
    if o.ChatTemplateArgs != "" {
        args = append(args, "--chat-template-args", o.ChatTemplateArgs)
    }

    // Sampling defaults
    if o.Temp != 0 {
        args = append(args, "--temp", strconv.FormatFloat(o.Temp, 'f', -1, 64))
    }
    if o.TopP != 0 {
        args = append(args, "--top-p", strconv.FormatFloat(o.TopP, 'f', -1, 64))
    }
    if o.TopK != 0 {
        args = append(args, "--top-k", strconv.Itoa(o.TopK))
    }
    if o.MinP != 0 {
        args = append(args, "--min-p", strconv.FormatFloat(o.MinP, 'f', -1, 64))
    }
    if o.MaxTokens != 0 {
        args = append(args, "--max-tokens", strconv.Itoa(o.MaxTokens))
    }

    return args
    multipleFlags := map[string]bool{} // MLX doesn't currently have []string fields
    return backends.BuildCommandArgs(o, multipleFlags)
}

// ParseMlxCommand parses a mlx_lm.server command string into MlxServerOptions
// Supports multiple formats:
// 1. Full command: "mlx_lm.server --model model/path"
// 2. Full path: "/usr/local/bin/mlx_lm.server --model model/path"
// 3. Args only: "--model model/path --host 0.0.0.0"
// 4. Multiline commands with backslashes
func ParseMlxCommand(command string) (*MlxServerOptions, error) {
    executableNames := []string{"mlx_lm.server"}
    var subcommandNames []string              // MLX has no subcommands
    multiValuedFlags := map[string]bool{}     // MLX has no multi-valued flags

    var mlxOptions MlxServerOptions
    if err := backends.ParseCommand(command, executableNames, subcommandNames, multiValuedFlags, &mlxOptions); err != nil {
        return nil, err
    }

    return &mlxOptions, nil
}
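
A small usage sketch of the parse-then-rebuild round trip these helpers enable; the model path below is hypothetical, and the printed output is what the field-order-driven builder would be expected to produce:

```go
package main

import (
    "fmt"
    "log"

    "llamactl/pkg/backends/mlx"
)

func main() {
    // Hypothetical command string, for illustration only
    opts, err := mlx.ParseMlxCommand("mlx_lm.server --model /tmp/model --temp 0.7")
    if err != nil {
        log.Fatal(err)
    }
    // Rebuild CLI args from the parsed options
    fmt.Println(opts.BuildCommandArgs()) // [--model /tmp/model --temp 0.7]
}
```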
157  pkg/backends/mlx/mlx_test.go  Normal file
@@ -0,0 +1,157 @@
package mlx_test

import (
    "llamactl/pkg/backends/mlx"
    "testing"
)

func TestParseMlxCommand(t *testing.T) {
    tests := []struct {
        name      string
        command   string
        expectErr bool
    }{
        {
            name:      "basic command",
            command:   "mlx_lm.server --model /path/to/model --host 0.0.0.0",
            expectErr: false,
        },
        {
            name:      "args only",
            command:   "--model /path/to/model --port 8080",
            expectErr: false,
        },
        {
            name:      "mixed flag formats",
            command:   "mlx_lm.server --model=/path/model --temp=0.7 --trust-remote-code",
            expectErr: false,
        },
        {
            name:      "quoted strings",
            command:   `mlx_lm.server --model test.mlx --chat-template "User: {user}\nAssistant: "`,
            expectErr: false,
        },
        {
            name:      "empty command",
            command:   "",
            expectErr: true,
        },
        {
            name:      "unterminated quote",
            command:   `mlx_lm.server --model test.mlx --chat-template "unterminated`,
            expectErr: true,
        },
        {
            name:      "malformed flag",
            command:   "mlx_lm.server ---model test.mlx",
            expectErr: true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            result, err := mlx.ParseMlxCommand(tt.command)

            if tt.expectErr {
                if err == nil {
                    t.Errorf("expected error but got none")
                }
                return
            }

            if err != nil {
                t.Errorf("unexpected error: %v", err)
                return
            }

            if result == nil {
                t.Errorf("expected result but got nil")
            }
        })
    }
}

func TestParseMlxCommandValues(t *testing.T) {
    command := "mlx_lm.server --model /test/model.mlx --port 8080 --temp 0.7 --trust-remote-code --log-level DEBUG"
    result, err := mlx.ParseMlxCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "/test/model.mlx" {
        t.Errorf("expected model '/test/model.mlx', got '%s'", result.Model)
    }

    if result.Port != 8080 {
        t.Errorf("expected port 8080, got %d", result.Port)
    }

    if result.Temp != 0.7 {
        t.Errorf("expected temp 0.7, got %f", result.Temp)
    }

    if !result.TrustRemoteCode {
        t.Errorf("expected trust_remote_code to be true")
    }

    if result.LogLevel != "DEBUG" {
        t.Errorf("expected log_level 'DEBUG', got '%s'", result.LogLevel)
    }
}

func TestBuildCommandArgs(t *testing.T) {
    options := &mlx.MlxServerOptions{
        Model:           "/test/model.mlx",
        Host:            "127.0.0.1",
        Port:            8080,
        Temp:            0.7,
        TopP:            0.9,
        TopK:            40,
        MaxTokens:       2048,
        TrustRemoteCode: true,
        LogLevel:        "DEBUG",
        ChatTemplate:    "custom template",
    }

    args := options.BuildCommandArgs()

    // Check that all expected flags are present
    expectedFlags := map[string]string{
        "--model":         "/test/model.mlx",
        "--host":          "127.0.0.1",
        "--port":          "8080",
        "--log-level":     "DEBUG",
        "--chat-template": "custom template",
        "--temp":          "0.7",
        "--top-p":         "0.9",
        "--top-k":         "40",
        "--max-tokens":    "2048",
    }

    for i := 0; i < len(args); i++ {
        if args[i] == "--trust-remote-code" {
            continue // Boolean flag with no value
        }
        if args[i] == "--use-default-chat-template" {
            continue // Boolean flag with no value
        }

        if expectedValue, exists := expectedFlags[args[i]]; exists && i+1 < len(args) {
            if args[i+1] != expectedValue {
                t.Errorf("expected %s to have value %s, got %s", args[i], expectedValue, args[i+1])
            }
        }
    }

    // Check boolean flags
    foundTrustRemoteCode := false
    for _, arg := range args {
        if arg == "--trust-remote-code" {
            foundTrustRemoteCode = true
        }
    }
    if !foundTrustRemoteCode {
        t.Errorf("expected --trust-remote-code flag to be present")
    }
}
@@ -1,254 +0,0 @@
package mlx

import (
    "encoding/json"
    "fmt"
    "path/filepath"
    "regexp"
    "strconv"
    "strings"
)

// ParseMlxCommand parses a mlx_lm.server command string into MlxServerOptions
// Supports multiple formats:
// 1. Full command: "mlx_lm.server --model model/path"
// 2. Full path: "/usr/local/bin/mlx_lm.server --model model/path"
// 3. Args only: "--model model/path --host 0.0.0.0"
// 4. Multiline commands with backslashes
func ParseMlxCommand(command string) (*MlxServerOptions, error) {
    // 1. Normalize the command - handle multiline with backslashes
    trimmed := normalizeMultilineCommand(command)
    if trimmed == "" {
        return nil, fmt.Errorf("command cannot be empty")
    }

    // 2. Extract arguments from command
    args, err := extractArgumentsFromCommand(trimmed)
    if err != nil {
        return nil, err
    }

    // 3. Parse arguments into map
    options := make(map[string]any)

    i := 0
    for i < len(args) {
        arg := args[i]

        if !strings.HasPrefix(arg, "-") { // skip positional / stray values
            i++
            continue
        }

        // Reject malformed flags with more than two leading dashes (e.g. ---model) to surface user mistakes
        if strings.HasPrefix(arg, "---") {
            return nil, fmt.Errorf("malformed flag: %s", arg)
        }

        // Unified parsing for --flag=value vs --flag value
        var rawFlag, rawValue string
        hasEquals := false
        if strings.Contains(arg, "=") {
            parts := strings.SplitN(arg, "=", 2)
            rawFlag = parts[0]
            rawValue = parts[1] // may be empty string
            hasEquals = true
        } else {
            rawFlag = arg
        }

        flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-")
        flagName := strings.ReplaceAll(flagCore, "-", "_")

        // Detect value if not in equals form
        valueProvided := hasEquals
        if !hasEquals {
            if i+1 < len(args) && !isFlag(args[i+1]) { // next token is value
                rawValue = args[i+1]
                valueProvided = true
            }
        }

        if valueProvided {
            // MLX-specific validation for certain flags
            if flagName == "log_level" && !isValidLogLevel(rawValue) {
                return nil, fmt.Errorf("invalid log level: %s", rawValue)
            }

            options[flagName] = parseValue(rawValue)

            // Advance index: if we consumed a following token as value (non equals form), skip it
            if !hasEquals && i+1 < len(args) && rawValue == args[i+1] {
                i += 2
            } else {
                i++
            }
            continue
        }

        // Boolean flag (no value) - MLX specific boolean flags
        if flagName == "trust_remote_code" || flagName == "use_default_chat_template" {
            options[flagName] = true
        } else {
            options[flagName] = true
        }
        i++
    }

    // 4. Convert to MlxServerOptions using existing UnmarshalJSON
    jsonData, err := json.Marshal(options)
    if err != nil {
        return nil, fmt.Errorf("failed to marshal parsed options: %w", err)
    }

    var mlxOptions MlxServerOptions
    if err := json.Unmarshal(jsonData, &mlxOptions); err != nil {
        return nil, fmt.Errorf("failed to parse command options: %w", err)
    }

    // 5. Return MlxServerOptions
    return &mlxOptions, nil
}

// isValidLogLevel validates MLX log levels
func isValidLogLevel(level string) bool {
    validLevels := []string{"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
    for _, valid := range validLevels {
        if level == valid {
            return true
        }
    }
    return false
}

// parseValue attempts to parse a string value into the most appropriate type
func parseValue(value string) any {
    // Surrounding matching quotes (single or double)
    if l := len(value); l >= 2 {
        if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') {
            value = value[1 : l-1]
        }
    }

    lower := strings.ToLower(value)
    if lower == "true" {
        return true
    }
    if lower == "false" {
        return false
    }

    if intVal, err := strconv.Atoi(value); err == nil {
        return intVal
    }
    if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
        return floatVal
    }
    return value
}

// normalizeMultilineCommand handles multiline commands with backslashes
func normalizeMultilineCommand(command string) string {
    // Handle escaped newlines (backslash followed by newline)
    re := regexp.MustCompile(`\\\s*\n\s*`)
    normalized := re.ReplaceAllString(command, " ")

    // Clean up extra whitespace
    re = regexp.MustCompile(`\s+`)
    normalized = re.ReplaceAllString(normalized, " ")

    return strings.TrimSpace(normalized)
}

// extractArgumentsFromCommand extracts arguments from various command formats
func extractArgumentsFromCommand(command string) ([]string, error) {
    // Split command into tokens respecting quotes
    tokens, err := splitCommandTokens(command)
    if err != nil {
        return nil, err
    }

    if len(tokens) == 0 {
        return nil, fmt.Errorf("no command tokens found")
    }

    // Check if first token looks like an executable
    firstToken := tokens[0]

    // Case 1: Full path to executable (contains path separator or ends with mlx_lm.server)
    if strings.Contains(firstToken, string(filepath.Separator)) ||
        strings.HasSuffix(filepath.Base(firstToken), "mlx_lm.server") {
        return tokens[1:], nil // Return everything except the executable
    }

    // Case 2: Just "mlx_lm.server" command
    if strings.ToLower(firstToken) == "mlx_lm.server" {
        return tokens[1:], nil // Return everything except the command
    }

    // Case 3: Arguments only (starts with a flag)
    if strings.HasPrefix(firstToken, "-") {
        return tokens, nil // Return all tokens as arguments
    }

    // Case 4: Unknown format - might be a different executable name
    // Be permissive and assume it's the executable
    return tokens[1:], nil
}

// splitCommandTokens splits a command string into tokens, respecting quotes
func splitCommandTokens(command string) ([]string, error) {
    var tokens []string
    var current strings.Builder
    inQuotes := false
    quoteChar := byte(0)
    escaped := false

    for i := 0; i < len(command); i++ {
        c := command[i]

        if escaped {
            current.WriteByte(c)
            escaped = false
            continue
        }

        if c == '\\' {
            escaped = true
            current.WriteByte(c)
            continue
        }

        if !inQuotes && (c == '"' || c == '\'') {
            inQuotes = true
            quoteChar = c
            current.WriteByte(c)
        } else if inQuotes && c == quoteChar {
            inQuotes = false
            quoteChar = 0
            current.WriteByte(c)
        } else if !inQuotes && (c == ' ' || c == '\t' || c == '\n') {
            if current.Len() > 0 {
                tokens = append(tokens, current.String())
                current.Reset()
            }
        } else {
            current.WriteByte(c)
        }
    }

    if inQuotes {
        return nil, fmt.Errorf("unclosed quote in command")
    }

    if current.Len() > 0 {
        tokens = append(tokens, current.String())
    }

    return tokens, nil
}

// isFlag checks if a string looks like a command line flag
func isFlag(s string) bool {
    return strings.HasPrefix(s, "-")
}
213  pkg/backends/parser.go  Normal file
@@ -0,0 +1,213 @@
package backends

import (
    "encoding/json"
    "fmt"
    "path/filepath"
    "regexp"
    "strconv"
    "strings"
)

// ParseCommand parses a command string into a target struct
func ParseCommand(command string, executableNames []string, subcommandNames []string, multiValuedFlags map[string]bool, target any) error {
    // Normalize multiline commands
    command = normalizeCommand(command)
    if command == "" {
        return fmt.Errorf("command cannot be empty")
    }

    // Extract arguments and positional model
    args, modelFromPositional, err := extractArgs(command, executableNames, subcommandNames)
    if err != nil {
        return err
    }

    // Parse flags into map
    options, err := parseFlags(args, multiValuedFlags)
    if err != nil {
        return err
    }

    // If we found a positional model and no --model flag was provided, set the model
    if modelFromPositional != "" {
        if _, hasModelFlag := options["model"]; !hasModelFlag {
            options["model"] = modelFromPositional
        }
    }

    // Convert to target struct via JSON
    jsonData, err := json.Marshal(options)
    if err != nil {
        return fmt.Errorf("failed to marshal options: %w", err)
    }

    if err := json.Unmarshal(jsonData, target); err != nil {
        return fmt.Errorf("failed to unmarshal to target: %w", err)
    }

    return nil
}

// normalizeCommand handles multiline commands with backslashes
func normalizeCommand(command string) string {
    re := regexp.MustCompile(`\\\s*\n\s*`)
    normalized := re.ReplaceAllString(command, " ")
    re = regexp.MustCompile(`\s+`)
    return strings.TrimSpace(re.ReplaceAllString(normalized, " "))
}

// extractArgs extracts arguments from command, removing executable and subcommands
// Returns: args, modelFromPositional, error
func extractArgs(command string, executableNames []string, subcommandNames []string) ([]string, string, error) {
    // Check for unterminated quotes
    if strings.Count(command, `"`)%2 != 0 || strings.Count(command, `'`)%2 != 0 {
        return nil, "", fmt.Errorf("unterminated quoted string")
    }

    tokens := strings.Fields(command)
    if len(tokens) == 0 {
        return nil, "", fmt.Errorf("no tokens found")
    }

    // Skip executable
    start := 0
    firstToken := tokens[0]

    // Check for executable name (with or without path)
    if strings.Contains(firstToken, string(filepath.Separator)) {
        baseName := filepath.Base(firstToken)
        for _, execName := range executableNames {
            if strings.HasSuffix(strings.ToLower(baseName), strings.ToLower(execName)) {
                start = 1
                break
            }
        }
    } else {
        for _, execName := range executableNames {
            if strings.EqualFold(firstToken, execName) {
                start = 1
                break
            }
        }
    }

    // Skip subcommand if present
    if start < len(tokens) {
        for _, subCmd := range subcommandNames {
            if strings.EqualFold(tokens[start], subCmd) {
                start++
                break
            }
        }
    }

    // Handle case where command starts with subcommand (no executable)
    if start == 0 {
        for _, subCmd := range subcommandNames {
            if strings.EqualFold(firstToken, subCmd) {
                start = 1
                break
            }
        }
    }

    args := tokens[start:]

    // Extract first positional argument (model) if present and not a flag
    var modelFromPositional string
    if len(args) > 0 && !strings.HasPrefix(args[0], "-") {
        modelFromPositional = args[0]
        args = args[1:] // Remove the model from args to process remaining flags
    }

    return args, modelFromPositional, nil
}

// parseFlags parses command line flags into a map
func parseFlags(args []string, multiValuedFlags map[string]bool) (map[string]any, error) {
    options := make(map[string]any)

    for i := 0; i < len(args); i++ {
        arg := args[i]

        if !strings.HasPrefix(arg, "-") {
            continue
        }

        // Check for malformed flags (more than two leading dashes)
        if strings.HasPrefix(arg, "---") {
            return nil, fmt.Errorf("malformed flag: %s", arg)
        }

        // Get flag name and value
        var flagName, value string
        var hasValue bool

        if strings.Contains(arg, "=") {
            parts := strings.SplitN(arg, "=", 2)
            flagName = strings.TrimLeft(parts[0], "-")
            value = parts[1]
            hasValue = true
        } else {
            flagName = strings.TrimLeft(arg, "-")
            if i+1 < len(args) && !strings.HasPrefix(args[i+1], "-") {
                value = args[i+1]
                hasValue = true
                i++ // Skip next arg since we consumed it
            }
        }

        // Convert kebab-case to snake_case for JSON
        flagName = strings.ReplaceAll(flagName, "-", "_")

        if hasValue {
            // Handle multi-valued flags
            if multiValuedFlags[flagName] {
                if existing, ok := options[flagName].([]string); ok {
                    options[flagName] = append(existing, value)
                } else {
                    options[flagName] = []string{value}
                }
            } else {
                options[flagName] = parseValue(value)
            }
        } else {
            // Boolean flag
            options[flagName] = true
        }
    }

    return options, nil
}

// parseValue converts string to appropriate type
func parseValue(value string) any {
    // Remove quotes
    if len(value) >= 2 {
        if (value[0] == '"' && value[len(value)-1] == '"') || (value[0] == '\'' && value[len(value)-1] == '\'') {
            value = value[1 : len(value)-1]
        }
    }

    // Try boolean
    switch strings.ToLower(value) {
    case "true":
        return true
    case "false":
        return false
    }

    // Try integer
    if intVal, err := strconv.Atoi(value); err == nil {
        return intVal
    }

    // Try float
    if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
        return floatVal
    }

    // Return as string
    return value
}
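
To see the generic parser end to end, here is a minimal sketch; the `toy-server` executable and the `toyOptions` struct are invented purely for illustration, while `backends.ParseCommand` is the function added above:

```go
package main

import (
    "fmt"
    "log"

    "llamactl/pkg/backends"
)

// toyOptions is a hypothetical target struct for illustration only.
type toyOptions struct {
    Model string `json:"model,omitempty"`
    Port  int    `json:"port,omitempty"`
    Debug bool   `json:"debug,omitempty"`
}

func main() {
    var opts toyOptions
    err := backends.ParseCommand(
        "toy-server --model=/tmp/m --port 9000 --debug",
        []string{"toy-server"}, // executable names to strip
        nil,                    // no subcommands
        map[string]bool{},      // no multi-valued flags
        &opts,
    )
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%+v\n", opts) // {Model:/tmp/m Port:9000 Debug:true}
}
```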
189  pkg/backends/vllm/vllm.go  Normal file
@@ -0,0 +1,189 @@
|
||||
package vllm
|
||||
|
||||
import (
|
||||
"llamactl/pkg/backends"
|
||||
)
|
||||
|
||||
type VllmServerOptions struct {
|
||||
// Basic connection options (auto-assigned by llamactl)
|
||||
Host string `json:"host,omitempty"`
|
||||
Port int `json:"port,omitempty"`
|
||||
|
||||
// Model and engine configuration
|
||||
Model string `json:"model,omitempty"`
|
||||
Tokenizer string `json:"tokenizer,omitempty"`
|
||||
SkipTokenizerInit bool `json:"skip_tokenizer_init,omitempty"`
|
||||
Revision string `json:"revision,omitempty"`
|
||||
CodeRevision string `json:"code_revision,omitempty"`
|
||||
TokenizerRevision string `json:"tokenizer_revision,omitempty"`
|
||||
TokenizerMode string `json:"tokenizer_mode,omitempty"`
|
||||
TrustRemoteCode bool `json:"trust_remote_code,omitempty"`
|
||||
DownloadDir string `json:"download_dir,omitempty"`
|
||||
LoadFormat string `json:"load_format,omitempty"`
|
||||
ConfigFormat string `json:"config_format,omitempty"`
|
||||
Dtype string `json:"dtype,omitempty"`
|
||||
KVCacheDtype string `json:"kv_cache_dtype,omitempty"`
|
||||
QuantizationParamPath string `json:"quantization_param_path,omitempty"`
|
||||
Seed int `json:"seed,omitempty"`
|
||||
MaxModelLen int `json:"max_model_len,omitempty"`
|
||||
GuidedDecodingBackend string `json:"guided_decoding_backend,omitempty"`
|
||||
DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"`
|
||||
WorkerUseRay bool `json:"worker_use_ray,omitempty"`
|
||||
RayWorkersUseNSight bool `json:"ray_workers_use_nsight,omitempty"`
|
||||
|
||||
// Performance and serving configuration
|
||||
BlockSize int `json:"block_size,omitempty"`
|
||||
EnablePrefixCaching bool `json:"enable_prefix_caching,omitempty"`
|
||||
DisableSlidingWindow bool `json:"disable_sliding_window,omitempty"`
|
||||
UseV2BlockManager bool `json:"use_v2_block_manager,omitempty"`
|
||||
NumLookaheadSlots int `json:"num_lookahead_slots,omitempty"`
|
||||
SwapSpace int `json:"swap_space,omitempty"`
|
||||
CPUOffloadGB int `json:"cpu_offload_gb,omitempty"`
|
||||
GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"`
|
||||
NumGPUBlocksOverride int `json:"num_gpu_blocks_override,omitempty"`
|
||||
MaxNumBatchedTokens int `json:"max_num_batched_tokens,omitempty"`
|
||||
MaxNumSeqs int `json:"max_num_seqs,omitempty"`
|
||||
MaxLogprobs int `json:"max_logprobs,omitempty"`
|
||||
DisableLogStats bool `json:"disable_log_stats,omitempty"`
|
||||
Quantization string `json:"quantization,omitempty"`
|
||||
RopeScaling string `json:"rope_scaling,omitempty"`
|
||||
RopeTheta float64 `json:"rope_theta,omitempty"`
|
||||
EnforceEager bool `json:"enforce_eager,omitempty"`
|
||||
MaxContextLenToCapture int `json:"max_context_len_to_capture,omitempty"`
|
||||
MaxSeqLenToCapture int `json:"max_seq_len_to_capture,omitempty"`
|
||||
DisableCustomAllReduce bool `json:"disable_custom_all_reduce,omitempty"`
|
||||
TokenizerPoolSize int `json:"tokenizer_pool_size,omitempty"`
|
||||
TokenizerPoolType string `json:"tokenizer_pool_type,omitempty"`
|
||||
TokenizerPoolExtraConfig string `json:"tokenizer_pool_extra_config,omitempty"`
|
||||
EnableLoraBias bool `json:"enable_lora_bias,omitempty"`
|
||||
LoraExtraVocabSize int `json:"lora_extra_vocab_size,omitempty"`
|
||||
LoraRank int `json:"lora_rank,omitempty"`
|
||||
PromptLookbackDistance int `json:"prompt_lookback_distance,omitempty"`
|
||||
PreemptionMode string `json:"preemption_mode,omitempty"`
|
||||
|
||||
// Distributed and parallel processing
|
||||
TensorParallelSize int `json:"tensor_parallel_size,omitempty"`
|
||||
PipelineParallelSize int `json:"pipeline_parallel_size,omitempty"`
|
||||
MaxParallelLoadingWorkers int `json:"max_parallel_loading_workers,omitempty"`
|
||||
DisableAsyncOutputProc bool `json:"disable_async_output_proc,omitempty"`
|
||||
WorkerClass string `json:"worker_class,omitempty"`
|
||||
EnabledLoraModules string `json:"enabled_lora_modules,omitempty"`
|
||||
MaxLoraRank int `json:"max_lora_rank,omitempty"`
|
||||
FullyShardedLoras bool `json:"fully_sharded_loras,omitempty"`
|
||||
LoraModules string `json:"lora_modules,omitempty"`
|
||||
PromptAdapters string `json:"prompt_adapters,omitempty"`
|
||||
MaxPromptAdapterToken int `json:"max_prompt_adapter_token,omitempty"`
|
||||
Device string `json:"device,omitempty"`
|
||||
SchedulerDelay float64 `json:"scheduler_delay,omitempty"`
|
||||
EnableChunkedPrefill bool `json:"enable_chunked_prefill,omitempty"`
|
||||
SpeculativeModel string `json:"speculative_model,omitempty"`
|
||||
SpeculativeModelQuantization string `json:"speculative_model_quantization,omitempty"`
|
||||
SpeculativeRevision string `json:"speculative_revision,omitempty"`
|
||||
SpeculativeMaxModelLen int `json:"speculative_max_model_len,omitempty"`
|
||||
SpeculativeDisableByBatchSize int `json:"speculative_disable_by_batch_size,omitempty"`
|
||||
NgptSpeculativeLength int `json:"ngpt_speculative_length,omitempty"`
|
||||
SpeculativeDisableMqa bool `json:"speculative_disable_mqa,omitempty"`
|
||||
ModelLoaderExtraConfig string `json:"model_loader_extra_config,omitempty"`
|
||||
IgnorePatterns string `json:"ignore_patterns,omitempty"`
|
||||
PreloadedLoraModules string `json:"preloaded_lora_modules,omitempty"`
|
||||
|
||||
// OpenAI server specific options
|
||||
UDS string `json:"uds,omitempty"`
|
||||
UvicornLogLevel string `json:"uvicorn_log_level,omitempty"`
|
||||
ResponseRole string `json:"response_role,omitempty"`
|
||||
SSLKeyfile string `json:"ssl_keyfile,omitempty"`
|
||||
SSLCertfile string `json:"ssl_certfile,omitempty"`
|
||||
SSLCACerts string `json:"ssl_ca_certs,omitempty"`
|
||||
SSLCertReqs int `json:"ssl_cert_reqs,omitempty"`
|
||||
RootPath string `json:"root_path,omitempty"`
|
||||
Middleware []string `json:"middleware,omitempty"`
|
||||
ReturnTokensAsTokenIDS bool `json:"return_tokens_as_token_ids,omitempty"`
|
||||
DisableFrontendMultiprocessing bool `json:"disable_frontend_multiprocessing,omitempty"`
|
||||
EnableAutoToolChoice bool `json:"enable_auto_tool_choice,omitempty"`
|
||||
ToolCallParser string `json:"tool_call_parser,omitempty"`
|
||||
ToolServer string `json:"tool_server,omitempty"`
|
||||
ChatTemplate string `json:"chat_template,omitempty"`
|
||||
ChatTemplateContentFormat string `json:"chat_template_content_format,omitempty"`
|
||||
AllowCredentials bool `json:"allow_credentials,omitempty"`
|
||||
AllowedOrigins []string `json:"allowed_origins,omitempty"`
|
||||
AllowedMethods []string `json:"allowed_methods,omitempty"`
|
||||
AllowedHeaders []string `json:"allowed_headers,omitempty"`
|
||||
APIKey []string `json:"api_key,omitempty"`
|
||||
EnableLogOutputs bool `json:"enable_log_outputs,omitempty"`
|
||||
EnableTokenUsage bool `json:"enable_token_usage,omitempty"`
|
||||
EnableAsyncEngineDebug bool `json:"enable_async_engine_debug,omitempty"`
|
||||
EngineUseRay bool `json:"engine_use_ray,omitempty"`
|
||||
DisableLogRequests bool `json:"disable_log_requests,omitempty"`
|
||||
MaxLogLen int `json:"max_log_len,omitempty"`
|
||||
|
||||
// Additional engine configuration
|
||||
Task string `json:"task,omitempty"`
|
||||
MultiModalConfig string `json:"multi_modal_config,omitempty"`
|
||||
LimitMmPerPrompt string `json:"limit_mm_per_prompt,omitempty"`
|
||||
EnableSleepMode bool `json:"enable_sleep_mode,omitempty"`
|
||||
EnableChunkingRequest bool `json:"enable_chunking_request,omitempty"`
|
||||
CompilationConfig string `json:"compilation_config,omitempty"`
|
||||
DisableSlidingWindowMask bool `json:"disable_sliding_window_mask,omitempty"`
|
||||
EnableTRTLLMEngineLatency bool `json:"enable_trtllm_engine_latency,omitempty"`
|
||||
OverridePoolingConfig string `json:"override_pooling_config,omitempty"`
|
||||
OverrideNeuronConfig string `json:"override_neuron_config,omitempty"`
|
||||
OverrideKVCacheALIGNSize int `json:"override_kv_cache_align_size,omitempty"`
|
||||
}
|
||||
|
||||
// BuildCommandArgs converts VllmServerOptions to command line arguments
// Note: This does NOT include the "serve" subcommand, that's handled at the instance level
// For vLLM, the model parameter is passed as a positional argument, not a --model flag
func (o *VllmServerOptions) BuildCommandArgs() []string {
	var args []string

	// Add model as positional argument if specified
	if o.Model != "" {
		args = append(args, o.Model)
	}

	// Create a copy of the options without the Model field to avoid including it as --model flag
	optionsCopy := *o
	optionsCopy.Model = "" // Clear model field so it won't be included as a flag

	multipleFlags := map[string]bool{
		"api-key":         true,
		"allowed-origins": true,
		"allowed-methods": true,
		"allowed-headers": true,
		"middleware":      true,
	}

	// Build the rest of the arguments as flags
	flagArgs := backends.BuildCommandArgs(&optionsCopy, multipleFlags)
	args = append(args, flagArgs...)

	return args
}

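A quick sketch of what `BuildCommandArgs` produces; the model lands first as a positional argument, and the exact ordering of the remaining flags depends on `backends.BuildCommandArgs`, so treat the expected output comment as illustrative:

```go
package main

import (
	"fmt"

	"llamactl/pkg/backends/vllm"
)

func main() {
	opts := vllm.VllmServerOptions{
		Model:                "microsoft/DialoGPT-medium",
		TensorParallelSize:   2,
		GPUMemoryUtilization: 0.9,
	}
	// Expected shape (flag order illustrative):
	// [microsoft/DialoGPT-medium --tensor-parallel-size 2 --gpu-memory-utilization 0.9]
	fmt.Println(opts.BuildCommandArgs())
}
```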
// ParseVllmCommand parses a vLLM serve command string into VllmServerOptions
// Supports multiple formats:
// 1. Full command: "vllm serve --model MODEL_NAME --other-args"
// 2. Full path: "/usr/local/bin/vllm serve --model MODEL_NAME"
// 3. Serve only: "serve --model MODEL_NAME --other-args"
// 4. Args only: "--model MODEL_NAME --other-args"
// 5. Multiline commands with backslashes
func ParseVllmCommand(command string) (*VllmServerOptions, error) {
	executableNames := []string{"vllm"}
	subcommandNames := []string{"serve"}
	multiValuedFlags := map[string]bool{
		"middleware":      true,
		"api_key":         true,
		"allowed_origins": true,
		"allowed_methods": true,
		"allowed_headers": true,
		"lora_modules":    true,
		"prompt_adapters": true,
	}

	var vllmOptions VllmServerOptions
	if err := backends.ParseCommand(command, executableNames, subcommandNames, multiValuedFlags, &vllmOptions); err != nil {
		return nil, err
	}

	return &vllmOptions, nil
}
pkg/backends/vllm/vllm_test.go (new file, 153 lines)
@@ -0,0 +1,153 @@

package vllm_test

import (
	"llamactl/pkg/backends/vllm"
	"slices"
	"testing"
)

func TestParseVllmCommand(t *testing.T) {
	tests := []struct {
		name      string
		command   string
		expectErr bool
	}{
		{
			name:      "basic vllm serve command",
			command:   "vllm serve microsoft/DialoGPT-medium",
			expectErr: false,
		},
		{
			name:      "serve only command",
			command:   "serve microsoft/DialoGPT-medium",
			expectErr: false,
		},
		{
			name:      "positional model with flags",
			command:   "vllm serve microsoft/DialoGPT-medium --tensor-parallel-size 2",
			expectErr: false,
		},
		{
			name:      "model with path",
			command:   "vllm serve /path/to/model --gpu-memory-utilization 0.8",
			expectErr: false,
		},
		{
			name:      "empty command",
			command:   "",
			expectErr: true,
		},
		{
			name:      "unterminated quote",
			command:   `vllm serve "unterminated`,
			expectErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := vllm.ParseVllmCommand(tt.command)

			if tt.expectErr {
				if err == nil {
					t.Errorf("expected error but got none")
				}
				return
			}

			if err != nil {
				t.Errorf("unexpected error: %v", err)
				return
			}

			if result == nil {
				t.Errorf("expected result but got nil")
			}
		})
	}
}

func TestParseVllmCommandValues(t *testing.T) {
	command := "vllm serve test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs"
	result, err := vllm.ParseVllmCommand(command)

	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if result.Model != "test-model" {
		t.Errorf("expected model 'test-model', got '%s'", result.Model)
	}
	if result.TensorParallelSize != 4 {
		t.Errorf("expected tensor_parallel_size 4, got %d", result.TensorParallelSize)
	}
	if result.GPUMemoryUtilization != 0.8 {
		t.Errorf("expected gpu_memory_utilization 0.8, got %f", result.GPUMemoryUtilization)
	}
	if !result.EnableLogOutputs {
		t.Errorf("expected enable_log_outputs true, got %v", result.EnableLogOutputs)
	}
}

func TestBuildCommandArgs(t *testing.T) {
	options := vllm.VllmServerOptions{
		Model:                "microsoft/DialoGPT-medium",
		Port:                 8080,
		Host:                 "localhost",
		TensorParallelSize:   2,
		GPUMemoryUtilization: 0.8,
		EnableLogOutputs:     true,
		AllowedOrigins:       []string{"http://localhost:3000", "https://example.com"},
	}

	args := options.BuildCommandArgs()

	// Check that model is the first positional argument (not a --model flag)
	if len(args) == 0 || args[0] != "microsoft/DialoGPT-medium" {
		t.Errorf("Expected model 'microsoft/DialoGPT-medium' as first positional argument, got args: %v", args)
	}

	// Check that --model flag is NOT present (since model should be positional)
	if contains(args, "--model") {
		t.Errorf("Found --model flag, but model should be positional argument in args: %v", args)
	}

	// Check other flags
	if !containsFlagWithValue(args, "--tensor-parallel-size", "2") {
		t.Errorf("Expected --tensor-parallel-size 2 not found in %v", args)
	}
	if !contains(args, "--enable-log-outputs") {
		t.Errorf("Expected --enable-log-outputs not found in %v", args)
	}
	if !contains(args, "--host") {
		t.Errorf("Expected --host not found in %v", args)
	}
	if !contains(args, "--port") {
		t.Errorf("Expected --port not found in %v", args)
	}

	// Check array handling (multiple flags)
	allowedOriginsCount := 0
	for i := range args {
		if args[i] == "--allowed-origins" {
			allowedOriginsCount++
		}
	}
	if allowedOriginsCount != 2 {
		t.Errorf("Expected 2 --allowed-origins flags, got %d", allowedOriginsCount)
	}
}

// Helper functions
func contains(slice []string, item string) bool {
	return slices.Contains(slice, item)
}

func containsFlagWithValue(args []string, flag, value string) bool {
	for i, arg := range args {
		if arg == flag && i+1 < len(args) && args[i+1] == value {
			return true
		}
	}
	return false
}
@@ -17,6 +17,9 @@ type BackendConfig struct {

	// Path to mlx_lm executable (MLX-LM backend)
	MLXLMExecutable string `yaml:"mlx_lm_executable"`

	// Path to vllm executable (vLLM backend)
	VllmExecutable string `yaml:"vllm_executable"`
}

// AppConfig represents the configuration for llamactl
@@ -122,6 +125,7 @@ func LoadConfig(configPath string) (AppConfig, error) {
		Backends: BackendConfig{
			LlamaExecutable: "llama-server",
			MLXLMExecutable: "mlx_lm.server",
			VllmExecutable:  "vllm",
		},
		Instances: InstancesConfig{
			PortRange: [2]int{8000, 9000},
@@ -246,6 +250,9 @@ func loadEnvVars(cfg *AppConfig) {
	if mlxLMExec := os.Getenv("LLAMACTL_MLX_LM_EXECUTABLE"); mlxLMExec != "" {
		cfg.Backends.MLXLMExecutable = mlxLMExec
	}
	if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" {
		cfg.Backends.VllmExecutable = vllmExec
	}
	if autoRestart := os.Getenv("LLAMACTL_DEFAULT_AUTO_RESTART"); autoRestart != "" {
		if b, err := strconv.ParseBool(autoRestart); err == nil {
			cfg.Instances.DefaultAutoRestart = b
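
A minimal sketch of exercising the new environment override; it assumes an empty config path makes LoadConfig fall back to the defaults shown above before applying env vars, which is an assumption, not confirmed by this diff:

```go
package main

import (
	"fmt"
	"os"

	"llamactl/pkg/config"
)

func main() {
	// Point llamactl at a vllm binary inside a virtual environment.
	os.Setenv("LLAMACTL_VLLM_EXECUTABLE", "/opt/vllm-env/bin/vllm")

	// Empty path: defaults plus env overrides (assumption for this sketch).
	cfg, err := config.LoadConfig("")
	if err != nil {
		panic(err)
	}
	fmt.Println(cfg.Backends.VllmExecutable) // /opt/vllm-env/bin/vllm
}
```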
@@ -105,6 +105,10 @@ func (i *Process) GetPort() int {
		if i.options.MlxServerOptions != nil {
			return i.options.MlxServerOptions.Port
		}
	case backends.BackendTypeVllm:
		if i.options.VllmServerOptions != nil {
			return i.options.VllmServerOptions.Port
		}
	}
	}
	return 0
@@ -123,6 +127,10 @@ func (i *Process) GetHost() string {
		if i.options.MlxServerOptions != nil {
			return i.options.MlxServerOptions.Host
		}
	case backends.BackendTypeVllm:
		if i.options.VllmServerOptions != nil {
			return i.options.VllmServerOptions.Host
		}
	}
	}
	return ""
@@ -176,6 +184,11 @@ func (i *Process) GetProxy() (*httputil.ReverseProxy, error) {
			host = i.options.MlxServerOptions.Host
			port = i.options.MlxServerOptions.Port
		}
	case backends.BackendTypeVllm:
		if i.options.VllmServerOptions != nil {
			host = i.options.VllmServerOptions.Host
			port = i.options.VllmServerOptions.Port
		}
	}

	targetURL, err := url.Parse(fmt.Sprintf("http://%s:%d", host, port))

@@ -52,6 +52,8 @@ func (i *Process) Start() error {
		executable = i.globalBackendSettings.LlamaExecutable
	case backends.BackendTypeMlxLm:
		executable = i.globalBackendSettings.MLXLMExecutable
	case backends.BackendTypeVllm:
		executable = i.globalBackendSettings.VllmExecutable
	default:
		return fmt.Errorf("unsupported backend type: %s", i.options.BackendType)
	}
@@ -200,6 +202,11 @@ func (i *Process) WaitForHealthy(timeout int) error {
		host = opts.MlxServerOptions.Host
		port = opts.MlxServerOptions.Port
	}
	case backends.BackendTypeVllm:
		if opts.VllmServerOptions != nil {
			host = opts.VllmServerOptions.Host
			port = opts.VllmServerOptions.Port
		}
	}
	if host == "" {
		host = "localhost"
@@ -6,6 +6,7 @@ import (
	"llamactl/pkg/backends"
	"llamactl/pkg/backends/llamacpp"
	"llamactl/pkg/backends/mlx"
	"llamactl/pkg/backends/vllm"
	"llamactl/pkg/config"
	"log"
)
@@ -26,6 +27,7 @@ type CreateInstanceOptions struct {
	// Backend-specific options
	LlamaServerOptions *llamacpp.LlamaServerOptions `json:"-"`
	MlxServerOptions   *mlx.MlxServerOptions        `json:"-"`
	VllmServerOptions  *vllm.VllmServerOptions      `json:"-"`
}

// UnmarshalJSON implements custom JSON unmarshaling for CreateInstanceOptions
@@ -69,6 +71,18 @@ func (c *CreateInstanceOptions) UnmarshalJSON(data []byte) error {
			return fmt.Errorf("failed to unmarshal MLX options: %w", err)
		}
	}
	case backends.BackendTypeVllm:
		if c.BackendOptions != nil {
			optionsData, err := json.Marshal(c.BackendOptions)
			if err != nil {
				return fmt.Errorf("failed to marshal backend options: %w", err)
			}

			c.VllmServerOptions = &vllm.VllmServerOptions{}
			if err := json.Unmarshal(optionsData, c.VllmServerOptions); err != nil {
				return fmt.Errorf("failed to unmarshal vLLM options: %w", err)
			}
		}
	default:
		return fmt.Errorf("unknown backend type: %s", c.BackendType)
	}
@@ -114,6 +128,20 @@ func (c *CreateInstanceOptions) MarshalJSON() ([]byte, error) {
			return nil, fmt.Errorf("failed to unmarshal to map: %w", err)
		}

		aux.BackendOptions = backendOpts
	}
	case backends.BackendTypeVllm:
		if c.VllmServerOptions != nil {
			data, err := json.Marshal(c.VllmServerOptions)
			if err != nil {
				return nil, fmt.Errorf("failed to marshal vLLM server options: %w", err)
			}

			var backendOpts map[string]any
			if err := json.Unmarshal(data, &backendOpts); err != nil {
				return nil, fmt.Errorf("failed to unmarshal to map: %w", err)
			}

			aux.BackendOptions = backendOpts
		}
	}
@@ -171,6 +199,13 @@ func (c *CreateInstanceOptions) BuildCommandArgs() []string {
	if c.MlxServerOptions != nil {
		return c.MlxServerOptions.BuildCommandArgs()
	}
	case backends.BackendTypeVllm:
		if c.VllmServerOptions != nil {
			// Prepend "serve" as first argument
			args := []string{"serve"}
			args = append(args, c.VllmServerOptions.BuildCommandArgs()...)
			return args
		}
	}
	return []string{}
}

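For illustration, the custom unmarshaling above means a create-instance payload like the README's vLLM example round-trips into typed options, and the instance-level BuildCommandArgs then prepends the serve subcommand (a sketch; field names follow the JSON tags in vllm.go):

```go
package main

import (
	"encoding/json"
	"fmt"

	"llamactl/pkg/instance"
)

func main() {
	payload := []byte(`{
		"backend_type": "vllm",
		"backend_options": {
			"model": "microsoft/DialoGPT-medium",
			"tensor_parallel_size": 2
		}
	}`)

	var opts instance.CreateInstanceOptions
	if err := json.Unmarshal(payload, &opts); err != nil {
		panic(err)
	}
	// Expected shape: [serve microsoft/DialoGPT-medium --tensor-parallel-size 2]
	fmt.Println(opts.BuildCommandArgs())
}
```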
@@ -264,6 +264,10 @@ func (im *instanceManager) getPortFromOptions(options *instance.CreateInstanceOp
	if options.MlxServerOptions != nil {
		return options.MlxServerOptions.Port
	}
	case backends.BackendTypeVllm:
		if options.VllmServerOptions != nil {
			return options.VllmServerOptions.Port
		}
	}
	return 0
}
@@ -279,6 +283,10 @@ func (im *instanceManager) setPortInOptions(options *instance.CreateInstanceOpti
	if options.MlxServerOptions != nil {
		options.MlxServerOptions.Port = port
	}
	case backends.BackendTypeVllm:
		if options.VllmServerOptions != nil {
			options.VllmServerOptions.Port = port
		}
	}
}

@@ -8,6 +8,7 @@ import (
	"llamactl/pkg/backends"
	"llamactl/pkg/backends/llamacpp"
	"llamactl/pkg/backends/mlx"
	"llamactl/pkg/backends/vllm"
	"llamactl/pkg/config"
	"llamactl/pkg/instance"
	"llamactl/pkg/manager"
@@ -739,3 +740,56 @@ func (h *Handler) ParseMlxCommand() http.HandlerFunc {
	}
}

// ParseVllmCommand godoc
// @Summary Parse vllm serve command
// @Description Parses a vLLM serve command string into instance options
// @Tags backends
// @Security ApiKeyAuth
// @Accept json
// @Produce json
// @Param request body ParseCommandRequest true "Command to parse"
// @Success 200 {object} instance.CreateInstanceOptions "Parsed options"
// @Failure 400 {object} map[string]string "Invalid request or command"
// @Router /backends/vllm/parse-command [post]
func (h *Handler) ParseVllmCommand() http.HandlerFunc {
	type errorResponse struct {
		Error   string `json:"error"`
		Details string `json:"details,omitempty"`
	}
	writeError := func(w http.ResponseWriter, status int, code, details string) {
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(status)
		_ = json.NewEncoder(w).Encode(errorResponse{Error: code, Details: details})
	}
	return func(w http.ResponseWriter, r *http.Request) {
		var req ParseCommandRequest
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			writeError(w, http.StatusBadRequest, "invalid_request", "Invalid JSON body")
			return
		}

		if strings.TrimSpace(req.Command) == "" {
			writeError(w, http.StatusBadRequest, "invalid_command", "Command cannot be empty")
			return
		}

		vllmOptions, err := vllm.ParseVllmCommand(req.Command)
		if err != nil {
			writeError(w, http.StatusBadRequest, "parse_error", err.Error())
			return
		}

		backendType := backends.BackendTypeVllm

		options := &instance.CreateInstanceOptions{
			BackendType:       backendType,
			VllmServerOptions: vllmOptions,
		}

		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(options); err != nil {
			writeError(w, http.StatusInternalServerError, "encode_error", err.Error())
		}
	}
}

@@ -58,6 +58,9 @@ func SetupRouter(handler *Handler) *chi.Mux {
			r.Route("/mlx", func(r chi.Router) {
				r.Post("/parse-command", handler.ParseMlxCommand())
			})
			r.Route("/vllm", func(r chi.Router) {
				r.Post("/parse-command", handler.ParseVllmCommand())
			})
		})

		// Instance management endpoints

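With the route registered, the endpoint can be exercised as below; a sketch, where the /api/v1 prefix and bearer key follow the README's other endpoint examples:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := bytes.NewBufferString(`{"command": "vllm serve microsoft/DialoGPT-medium --tensor-parallel-size 2"}`)
	req, err := http.NewRequest("POST", "http://localhost:8080/api/v1/backends/vllm/parse-command", body)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer your-key")
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // parsed CreateInstanceOptions as JSON
}
```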
@@ -46,6 +46,8 @@ func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error {
		return validateLlamaCppOptions(options)
	case backends.BackendTypeMlxLm:
		return validateMlxOptions(options)
	case backends.BackendTypeVllm:
		return validateVllmOptions(options)
	default:
		return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType))
	}
@@ -88,6 +90,25 @@ func validateMlxOptions(options *instance.CreateInstanceOptions) error {
	return nil
}

// validateVllmOptions validates vLLM backend specific options
func validateVllmOptions(options *instance.CreateInstanceOptions) error {
	if options.VllmServerOptions == nil {
		return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend"))
	}

	// Use reflection to check all string fields for injection patterns
	if err := validateStructStrings(options.VllmServerOptions, ""); err != nil {
		return err
	}

	// Basic network validation for port
	if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 {
		return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port))
	}

	return nil
}

// validateStructStrings recursively validates all string fields in a struct
func validateStructStrings(v any, fieldPath string) error {
	val := reflect.ValueOf(v)

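A minimal sketch of the port check above; the import path for the validation package is an assumption (shown here as llamactl/pkg/validation), as this diff does not name it:

```go
package main

import (
	"fmt"

	"llamactl/pkg/backends"
	"llamactl/pkg/backends/vllm"
	"llamactl/pkg/instance"
	"llamactl/pkg/validation" // assumed package path for ValidateInstanceOptions
)

func main() {
	opts := &instance.CreateInstanceOptions{
		BackendType:       backends.BackendTypeVllm,
		VllmServerOptions: &vllm.VllmServerOptions{Port: 99999}, // out of range
	}
	// Expected: a ValidationError mentioning the invalid port range.
	fmt.Println(validation.ValidateInstanceOptions(opts))
}
```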
webui/src/components/BackendBadge.tsx (new file, 65 lines)
@@ -0,0 +1,65 @@

import React from "react";
import { Badge } from "@/components/ui/badge";
import { BackendType, type BackendTypeValue } from "@/types/instance";
import { Cpu, Zap, Server } from "lucide-react";

interface BackendBadgeProps {
  backend?: BackendTypeValue;
}

const BackendBadge: React.FC<BackendBadgeProps> = ({ backend }) => {
  if (!backend) {
    return null;
  }

  const getIcon = () => {
    switch (backend) {
      case BackendType.LLAMA_CPP:
        return <Cpu className="h-3 w-3" />;
      case BackendType.MLX_LM:
        return <Zap className="h-3 w-3" />;
      case BackendType.VLLM:
        return <Server className="h-3 w-3" />;
      default:
        return <Server className="h-3 w-3" />;
    }
  };

  const getText = () => {
    switch (backend) {
      case BackendType.LLAMA_CPP:
        return "llama.cpp";
      case BackendType.MLX_LM:
        return "MLX";
      case BackendType.VLLM:
        return "vLLM";
      default:
        return backend;
    }
  };

  const getVariant = () => {
    switch (backend) {
      case BackendType.LLAMA_CPP:
        return "secondary";
      case BackendType.MLX_LM:
        return "outline";
      case BackendType.VLLM:
        return "default";
      default:
        return "secondary";
    }
  };

  return (
    <Badge
      variant={getVariant()}
      className="flex items-center gap-1.5"
    >
      {getIcon()}
      <span className="text-xs">{getText()}</span>
    </Badge>
  );
};

export default BackendBadge;
@@ -45,7 +45,6 @@ const BackendFormField: React.FC<BackendFormFieldProps> = ({ fieldKey, value, on
      <div className="grid gap-2">
        <Label htmlFor={fieldKey}>
          {config.label}
          {config.required && <span className="text-red-500 ml-1">*</span>}
        </Label>
        <Input
          id={fieldKey}
@@ -72,7 +71,6 @@ const BackendFormField: React.FC<BackendFormFieldProps> = ({ fieldKey, value, on
      <div className="grid gap-2">
        <Label htmlFor={fieldKey}>
          {config.label}
          {config.required && <span className="text-red-500 ml-1">*</span>}
        </Label>
        <Input
          id={fieldKey}
@@ -99,7 +97,6 @@ const BackendFormField: React.FC<BackendFormFieldProps> = ({ fieldKey, value, on
      <div className="grid gap-2">
        <Label htmlFor={fieldKey}>
          {config.label}
          {config.required && <span className="text-red-500 ml-1">*</span>}
        </Label>
        <Input
          id={fieldKey}

@@ -5,6 +5,7 @@ import type { Instance } from "@/types/instance";
import { Edit, FileText, Play, Square, Trash2 } from "lucide-react";
import LogsDialog from "@/components/LogDialog";
import HealthBadge from "@/components/HealthBadge";
import BackendBadge from "@/components/BackendBadge";
import { useState } from "react";
import { useInstanceHealth } from "@/hooks/useInstanceHealth";

@@ -58,7 +59,10 @@ function InstanceCard({
      <CardHeader className="pb-3">
        <div className="flex items-center justify-between">
          <CardTitle className="text-lg">{instance.name}</CardTitle>
          {running && <HealthBadge health={health} />}
          <div className="flex flex-col items-end gap-2">
            {running && <HealthBadge health={health} />}
            <BackendBadge backend={instance.options?.backend_type} />
          </div>
        </div>
      </CardHeader>

@@ -11,11 +11,13 @@ import {
  DialogTitle,
} from "@/components/ui/dialog";
import { BackendType, type CreateInstanceOptions, type Instance } from "@/types/instance";
import { getBasicFields, getAdvancedFields, getBasicBackendFields, getAdvancedBackendFields } from "@/lib/zodFormUtils";
import { getAdvancedFields, getAdvancedBackendFields } from "@/lib/zodFormUtils";
import { ChevronDown, ChevronRight, Terminal } from "lucide-react";
import ZodFormField from "@/components/ZodFormField";
import BackendFormField from "@/components/BackendFormField";
import ParseCommandDialog from "@/components/ParseCommandDialog";
import AutoRestartConfiguration from "@/components/instance/AutoRestartConfiguration";
import BasicInstanceFields from "@/components/instance/BasicInstanceFields";
import BackendConfiguration from "@/components/instance/BackendConfiguration";
import AdvancedInstanceFields from "@/components/instance/AdvancedInstanceFields";

interface InstanceDialogProps {
  open: boolean;
@@ -39,9 +41,7 @@ const InstanceDialog: React.FC<InstanceDialogProps> = ({
  const [showParseDialog, setShowParseDialog] = useState(false);

  // Get field lists dynamically from the type
  const basicFields = getBasicFields();
  const advancedFields = getAdvancedFields();
  const basicBackendFields = getBasicBackendFields(formData.backend_type);
  const advancedBackendFields = getAdvancedBackendFields(formData.backend_type);

  // Reset form when dialog opens/closes or when instance changes
@@ -163,8 +163,6 @@ const InstanceDialog: React.FC<InstanceDialogProps> = ({
    setShowParseDialog(false);
  };

  // Check if auto_restart is enabled
  const isAutoRestartEnabled = formData.auto_restart === true;

  // Save button label logic
  let saveButtonLabel = "Create Instance";
@@ -212,70 +210,23 @@ const InstanceDialog: React.FC<InstanceDialogProps> = ({
          </div>

          {/* Auto Restart Configuration Section */}
          <div className="space-y-4">
            <h3 className="text-lg font-medium">
              Auto Restart Configuration
            </h3>
          <AutoRestartConfiguration
            formData={formData}
            onChange={handleFieldChange}
          />

            {/* Auto Restart Toggle */}
            <ZodFormField
              fieldKey="auto_restart"
              value={formData.auto_restart}
              onChange={handleFieldChange}
            />

            {/* Show restart options only when auto restart is enabled */}
            {isAutoRestartEnabled && (
              <div className="ml-6 space-y-4 border-l-2 border-muted pl-4">
                <ZodFormField
                  fieldKey="max_restarts"
                  value={formData.max_restarts}
                  onChange={handleFieldChange}
                />
                <ZodFormField
                  fieldKey="restart_delay"
                  value={formData.restart_delay}
                  onChange={handleFieldChange}
                />
              </div>
            )}
          </div>

          {/* Basic Fields - Automatically generated from type (excluding auto restart options) */}
          <div className="space-y-4">
            <h3 className="text-lg font-medium">Basic Configuration</h3>
            {basicFields
              .filter(
                (fieldKey) =>
                  fieldKey !== "auto_restart" &&
                  fieldKey !== "max_restarts" &&
                  fieldKey !== "restart_delay" &&
                  fieldKey !== "backend_options" // backend_options is handled separately
              )
              .map((fieldKey) => (
                <ZodFormField
                  key={fieldKey}
                  fieldKey={fieldKey}
                  value={formData[fieldKey]}
                  onChange={handleFieldChange}
                />
              ))}
          </div>
          {/* Basic Fields */}
          <BasicInstanceFields
            formData={formData}
            onChange={handleFieldChange}
          />

          {/* Backend Configuration Section */}
          <div className="space-y-4">
            <h3 className="text-lg font-medium">Backend Configuration</h3>

            {/* Basic backend fields */}
            {basicBackendFields.map((fieldKey) => (
              <BackendFormField
                key={fieldKey}
                fieldKey={fieldKey}
                value={(formData.backend_options as any)?.[fieldKey]}
                onChange={handleBackendFieldChange}
              />
            ))}
          </div>
          <BackendConfiguration
            formData={formData}
            onBackendFieldChange={handleBackendFieldChange}
            showAdvanced={showAdvanced}
          />

          {/* Advanced Fields Toggle */}
          <div className="border-t pt-4">
@@ -314,54 +265,13 @@ const InstanceDialog: React.FC<InstanceDialogProps> = ({
            </div>
          </div>

          {/* Advanced Fields - Automatically generated from type (excluding restart options) */}
          {/* Advanced Fields */}
          {showAdvanced && (
            <div className="space-y-4 pl-6 border-l-2 border-muted">
              {/* Advanced instance fields */}
              {advancedFields
                .filter(
                  (fieldKey) =>
                    !["max_restarts", "restart_delay", "backend_options"].includes(
                      fieldKey as string
                    )
                ).length > 0 && (
                <div className="space-y-4">
                  <h4 className="text-md font-medium">Advanced Instance Configuration</h4>
                  {advancedFields
                    .filter(
                      (fieldKey) =>
                        !["max_restarts", "restart_delay", "backend_options"].includes(
                          fieldKey as string
                        )
                    )
                    .sort()
                    .map((fieldKey) => (
                      <ZodFormField
                        key={fieldKey}
                        fieldKey={fieldKey}
                        value={fieldKey === 'backend_options' ? undefined : formData[fieldKey]}
                        onChange={handleFieldChange}
                      />
                    ))}
                </div>
              )}

              {/* Advanced backend fields */}
              {advancedBackendFields.length > 0 && (
                <div className="space-y-4">
                  <h4 className="text-md font-medium">Advanced Backend Configuration</h4>
                  {advancedBackendFields
                    .sort()
                    .map((fieldKey) => (
                      <BackendFormField
                        key={fieldKey}
                        fieldKey={fieldKey}
                        value={(formData.backend_options as any)?.[fieldKey]}
                        onChange={handleBackendFieldChange}
                      />
                    ))}
                </div>
              )}
              <AdvancedInstanceFields
                formData={formData}
                onChange={handleFieldChange}
              />
            </div>
          )}
        </div>

@@ -9,7 +9,7 @@ import {
  DialogHeader,
  DialogTitle,
} from "@/components/ui/dialog";
import { type CreateInstanceOptions } from "@/types/instance";
import { BackendType, type BackendTypeValue, type CreateInstanceOptions } from "@/types/instance";
import { backendsApi } from "@/lib/api";
import { toast } from "sonner";

@@ -25,6 +25,7 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
  onParsed,
}) => {
  const [command, setCommand] = useState('');
  const [backendType, setBackendType] = useState<BackendTypeValue>(BackendType.LLAMA_CPP);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

@@ -38,18 +39,31 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
    setError(null);

    try {
      const options = await backendsApi.llamaCpp.parseCommand(command);
      let options: CreateInstanceOptions;

      // Parse based on selected backend type
      switch (backendType) {
        case BackendType.LLAMA_CPP:
          options = await backendsApi.llamaCpp.parseCommand(command);
          break;
        case BackendType.MLX_LM:
          options = await backendsApi.mlx.parseCommand(command);
          break;
        case BackendType.VLLM:
          options = await backendsApi.vllm.parseCommand(command);
          break;
        default:
          throw new Error(`Unsupported backend type: ${backendType}`);
      }

      onParsed(options);
      onOpenChange(false);
      // Reset form
      setCommand('');
      setError(null);
      // Show success toast
      toast.success('Command parsed successfully');
    } catch (err) {
      const errorMessage = err instanceof Error ? err.message : 'Failed to parse command';
      setError(errorMessage);
      // Show error toast
      toast.error('Failed to parse command', {
        description: errorMessage
      });
@@ -60,31 +74,55 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({

  const handleOpenChange = (open: boolean) => {
    if (!open) {
      // Reset form when closing
      setCommand('');
      setBackendType(BackendType.LLAMA_CPP);
      setError(null);
    }
    onOpenChange(open);
  };

  const backendPlaceholders: Record<BackendTypeValue, string> = {
    [BackendType.LLAMA_CPP]: "llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096",
    [BackendType.MLX_LM]: "mlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --host 0.0.0.0 --port 8080",
    [BackendType.VLLM]: "vllm serve microsoft/DialoGPT-medium --tensor-parallel-size 2 --gpu-memory-utilization 0.9",
  };

  const getPlaceholderForBackend = (backendType: BackendTypeValue): string => {
    return backendPlaceholders[backendType] || "Enter your command here...";
  };

  return (
    <Dialog open={open} onOpenChange={handleOpenChange}>
      <DialogContent className="sm:max-w-[600px]">
        <DialogHeader>
          <DialogTitle>Parse Llama Server Command</DialogTitle>
          <DialogTitle>Parse Backend Command</DialogTitle>
          <DialogDescription>
            Paste your llama-server command to automatically populate the form fields
            Select your backend type and paste the command to automatically populate the form fields
          </DialogDescription>
        </DialogHeader>

        <div className="space-y-4">
          <div>
            <Label htmlFor="backend-type">Backend Type</Label>
            <select
              id="backend-type"
              value={backendType}
              onChange={(e) => setBackendType(e.target.value as BackendTypeValue)}
              className="flex h-10 w-full rounded-md border border-input bg-background px-3 py-2 text-sm ring-offset-background file:border-0 file:bg-transparent file:text-sm file:font-medium placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50"
            >
              <option value={BackendType.LLAMA_CPP}>Llama Server</option>
              <option value={BackendType.MLX_LM}>MLX LM</option>
              <option value={BackendType.VLLM}>vLLM</option>
            </select>
          </div>

          <div>
            <Label htmlFor="command">Command</Label>
            <textarea
              id="command"
              value={command}
              onChange={(e) => setCommand(e.target.value)}
              placeholder="llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096"
              placeholder={getPlaceholderForBackend(backendType)}
              className="w-full h-32 p-3 mt-2 border border-input rounded-md font-mono text-sm resize-vertical focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2"
            />
          </div>

@@ -29,7 +29,6 @@ const ZodFormField: React.FC<ZodFormFieldProps> = ({ fieldKey, value, onChange }
      <div className="grid gap-2">
        <Label htmlFor={fieldKey}>
          {config.label}
          {config.required && <span className="text-red-500 ml-1">*</span>}
        </Label>
        <select
          id={fieldKey}
@@ -39,6 +38,7 @@ const ZodFormField: React.FC<ZodFormFieldProps> = ({ fieldKey, value, onChange }
        >
          <option value={BackendType.LLAMA_CPP}>Llama Server</option>
          <option value={BackendType.MLX_LM}>MLX LM</option>
          <option value={BackendType.VLLM}>vLLM</option>
        </select>
        {config.description && (
          <p className="text-sm text-muted-foreground">{config.description}</p>
@@ -70,8 +70,7 @@ const ZodFormField: React.FC<ZodFormFieldProps> = ({ fieldKey, value, onChange }
      <div className="grid gap-2">
        <Label htmlFor={fieldKey}>
          {config.label}
          {config.required && <span className="text-red-500 ml-1">*</span>}
        </Label>
        </Label>
        <Input
          id={fieldKey}
          type="number"
@@ -97,8 +96,7 @@ const ZodFormField: React.FC<ZodFormFieldProps> = ({ fieldKey, value, onChange }
      <div className="grid gap-2">
        <Label htmlFor={fieldKey}>
          {config.label}
          {config.required && <span className="text-red-500 ml-1">*</span>}
        </Label>
        </Label>
        <Input
          id={fieldKey}
          type="text"
@@ -124,8 +122,7 @@ const ZodFormField: React.FC<ZodFormFieldProps> = ({ fieldKey, value, onChange }
      <div className="grid gap-2">
        <Label htmlFor={fieldKey}>
          {config.label}
          {config.required && <span className="text-red-500 ml-1">*</span>}
        </Label>
        </Label>
        <Input
          id={fieldKey}
          type="text"

webui/src/components/form/ArrayInput.tsx (new file, 62 lines)
@@ -0,0 +1,62 @@

import React from 'react'
import { Input } from '@/components/ui/input'
import { Label } from '@/components/ui/label'

interface ArrayInputProps {
  id: string
  label: string
  value: string[] | undefined
  onChange: (value: string[] | undefined) => void
  placeholder?: string
  description?: string
  disabled?: boolean
  className?: string
}

const ArrayInput: React.FC<ArrayInputProps> = ({
  id,
  label,
  value,
  onChange,
  placeholder = "item1, item2, item3",
  description,
  disabled = false,
  className
}) => {
  const handleChange = (inputValue: string) => {
    if (inputValue === '') {
      onChange(undefined)
      return
    }

    const arrayValue = inputValue
      .split(',')
      .map(s => s.trim())
      .filter(Boolean)

    onChange(arrayValue.length > 0 ? arrayValue : undefined)
  }

  return (
    <div className="grid gap-2">
      <Label htmlFor={id}>
        {label}
      </Label>
      <Input
        id={id}
        type="text"
        value={Array.isArray(value) ? value.join(', ') : ''}
        onChange={(e) => handleChange(e.target.value)}
        placeholder={placeholder}
        disabled={disabled}
        className={className}
      />
      {description && (
        <p className="text-sm text-muted-foreground">{description}</p>
      )}
      <p className="text-xs text-muted-foreground">Separate multiple values with commas</p>
    </div>
  )
}

export default ArrayInput
webui/src/components/form/CheckboxInput.tsx (new file, 42 lines)
@@ -0,0 +1,42 @@

import React from 'react'
import { Checkbox } from '@/components/ui/checkbox'
import { Label } from '@/components/ui/label'

interface CheckboxInputProps {
  id: string
  label: string
  value: boolean | undefined
  onChange: (value: boolean) => void
  description?: string
  disabled?: boolean
  className?: string
}

const CheckboxInput: React.FC<CheckboxInputProps> = ({
  id,
  label,
  value,
  onChange,
  description,
  disabled = false,
  className
}) => {
  return (
    <div className={`flex items-center space-x-2 ${className || ''}`}>
      <Checkbox
        id={id}
        checked={value === true}
        onCheckedChange={(checked) => onChange(!!checked)}
        disabled={disabled}
      />
      <Label htmlFor={id} className="text-sm font-normal">
        {label}
        {description && (
          <span className="text-muted-foreground ml-1">- {description}</span>
        )}
      </Label>
    </div>
  )
}

export default CheckboxInput
webui/src/components/form/NumberInput.tsx (new file, 60 lines)
@@ -0,0 +1,60 @@

import React from 'react'
import { Input } from '@/components/ui/input'
import { Label } from '@/components/ui/label'

interface NumberInputProps {
  id: string
  label: string
  value: number | undefined
  onChange: (value: number | undefined) => void
  placeholder?: string
  description?: string
  disabled?: boolean
  className?: string
}

const NumberInput: React.FC<NumberInputProps> = ({
  id,
  label,
  value,
  onChange,
  placeholder,
  description,
  disabled = false,
  className
}) => {
  const handleChange = (inputValue: string) => {
    if (inputValue === '') {
      onChange(undefined)
      return
    }

    const numValue = parseFloat(inputValue)
    if (!isNaN(numValue)) {
      onChange(numValue)
    }
  }

  return (
    <div className="grid gap-2">
      <Label htmlFor={id}>
        {label}
      </Label>
      <Input
        id={id}
        type="number"
        step="any"
        value={value !== undefined ? value : ''}
        onChange={(e) => handleChange(e.target.value)}
        placeholder={placeholder}
        disabled={disabled}
        className={className}
      />
      {description && (
        <p className="text-sm text-muted-foreground">{description}</p>
      )}
    </div>
  )
}

export default NumberInput
webui/src/components/form/SelectInput.tsx (new file, 55 lines)
@@ -0,0 +1,55 @@

import React from 'react'
import { Label } from '@/components/ui/label'

interface SelectOption {
  value: string
  label: string
}

interface SelectInputProps {
  id: string
  label: string
  value: string | undefined
  onChange: (value: string | undefined) => void
  options: SelectOption[]
  description?: string
  disabled?: boolean
  className?: string
}

const SelectInput: React.FC<SelectInputProps> = ({
  id,
  label,
  value,
  onChange,
  options,
  description,
  disabled = false,
  className
}) => {
  return (
    <div className="grid gap-2">
      <Label htmlFor={id}>
        {label}
      </Label>
      <select
        id={id}
        value={value || ''}
        onChange={(e) => onChange(e.target.value || undefined)}
        disabled={disabled}
        className={`flex h-10 w-full rounded-md border border-input bg-background px-3 py-2 text-sm ring-offset-background focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50 ${className || ''}`}
      >
        {options.map(option => (
          <option key={option.value} value={option.value}>
            {option.label}
          </option>
        ))}
      </select>
      {description && (
        <p className="text-sm text-muted-foreground">{description}</p>
      )}
    </div>
  )
}

export default SelectInput
webui/src/components/form/TextInput.tsx (new file, 47 lines)
@@ -0,0 +1,47 @@

import React from 'react'
import { Input } from '@/components/ui/input'
import { Label } from '@/components/ui/label'

interface TextInputProps {
  id: string
  label: string
  value: string | number | undefined
  onChange: (value: string | undefined) => void
  placeholder?: string
  description?: string
  disabled?: boolean
  className?: string
}

const TextInput: React.FC<TextInputProps> = ({
  id,
  label,
  value,
  onChange,
  placeholder,
  description,
  disabled = false,
  className
}) => {
  return (
    <div className="grid gap-2">
      <Label htmlFor={id}>
        {label}
      </Label>
      <Input
        id={id}
        type="text"
        value={typeof value === 'string' || typeof value === 'number' ? value : ''}
        onChange={(e) => onChange(e.target.value || undefined)}
        placeholder={placeholder}
        disabled={disabled}
        className={className}
      />
      {description && (
        <p className="text-sm text-muted-foreground">{description}</p>
      )}
    </div>
  )
}

export default TextInput
webui/src/components/instance/AdvancedInstanceFields.tsx (new file, 98 lines)
@@ -0,0 +1,98 @@

import React from 'react'
import type { CreateInstanceOptions } from '@/types/instance'
import { getAdvancedFields, basicFieldsConfig } from '@/lib/zodFormUtils'
import { getFieldType } from '@/schemas/instanceOptions'
import TextInput from '@/components/form/TextInput'
import NumberInput from '@/components/form/NumberInput'
import CheckboxInput from '@/components/form/CheckboxInput'
import ArrayInput from '@/components/form/ArrayInput'

interface AdvancedInstanceFieldsProps {
  formData: CreateInstanceOptions
  onChange: (key: keyof CreateInstanceOptions, value: any) => void
}

const AdvancedInstanceFields: React.FC<AdvancedInstanceFieldsProps> = ({
  formData,
  onChange
}) => {
  const advancedFields = getAdvancedFields()

  const renderField = (fieldKey: keyof CreateInstanceOptions) => {
    const config = basicFieldsConfig[fieldKey as string] || { label: fieldKey }
    const fieldType = getFieldType(fieldKey)

    switch (fieldType) {
      case 'boolean':
        return (
          <CheckboxInput
            key={fieldKey}
            id={fieldKey}
            label={config.label}
            value={formData[fieldKey] as boolean | undefined}
            onChange={(value) => onChange(fieldKey, value)}
            description={config.description}
          />
        )

      case 'number':
        return (
          <NumberInput
            key={fieldKey}
            id={fieldKey}
            label={config.label}
            value={formData[fieldKey] as number | undefined}
            onChange={(value) => onChange(fieldKey, value)}
            placeholder={config.placeholder}
            description={config.description}
          />
        )

      case 'array':
        return (
          <ArrayInput
            key={fieldKey}
            id={fieldKey}
            label={config.label}
            value={formData[fieldKey] as string[] | undefined}
            onChange={(value) => onChange(fieldKey, value)}
            placeholder={config.placeholder}
            description={config.description}
          />
        )

      default:
        return (
          <TextInput
            key={fieldKey}
            id={fieldKey}
            label={config.label}
            value={formData[fieldKey] as string | number | undefined}
            onChange={(value) => onChange(fieldKey, value)}
            placeholder={config.placeholder}
            description={config.description}
          />
        )
    }
  }

  // Filter out restart options and backend_options (handled separately)
  const fieldsToRender = advancedFields.filter(
    fieldKey => !['max_restarts', 'restart_delay', 'backend_options'].includes(fieldKey as string)
  )

  if (fieldsToRender.length === 0) {
    return null
  }

  return (
    <div className="space-y-4">
      <h4 className="text-md font-medium">Advanced Instance Configuration</h4>
      {fieldsToRender
        .sort()
        .map(renderField)}
    </div>
  )
}

export default AdvancedInstanceFields
webui/src/components/instance/AutoRestartConfiguration.tsx (new file, 53 lines)
@@ -0,0 +1,53 @@

import React from 'react'
import type { CreateInstanceOptions } from '@/types/instance'
import CheckboxInput from '@/components/form/CheckboxInput'
import NumberInput from '@/components/form/NumberInput'

interface AutoRestartConfigurationProps {
  formData: CreateInstanceOptions
  onChange: (key: keyof CreateInstanceOptions, value: any) => void
}

const AutoRestartConfiguration: React.FC<AutoRestartConfigurationProps> = ({
  formData,
  onChange
}) => {
  const isAutoRestartEnabled = formData.auto_restart === true

  return (
    <div className="space-y-4">
      <h3 className="text-lg font-medium">Auto Restart Configuration</h3>

      <CheckboxInput
        id="auto_restart"
        label="Auto Restart"
        value={formData.auto_restart}
        onChange={(value) => onChange('auto_restart', value)}
        description="Automatically restart the instance on failure"
      />

      {isAutoRestartEnabled && (
        <div className="ml-6 space-y-4 border-l-2 border-muted pl-4">
          <NumberInput
            id="max_restarts"
            label="Max Restarts"
            value={formData.max_restarts}
            onChange={(value) => onChange('max_restarts', value)}
            placeholder="3"
            description="Maximum number of restart attempts (0 = unlimited)"
          />
          <NumberInput
            id="restart_delay"
            label="Restart Delay (seconds)"
            value={formData.restart_delay}
            onChange={(value) => onChange('restart_delay', value)}
            placeholder="5"
            description="Delay in seconds before attempting restart"
          />
        </div>
      )}
    </div>
  )
}

export default AutoRestartConfiguration
54
webui/src/components/instance/BackendConfiguration.tsx
Normal file
54
webui/src/components/instance/BackendConfiguration.tsx
Normal file
@@ -0,0 +1,54 @@
import React from 'react'
import type { CreateInstanceOptions } from '@/types/instance'
import { getBasicBackendFields, getAdvancedBackendFields } from '@/lib/zodFormUtils'
import BackendFormField from '@/components/BackendFormField'

interface BackendConfigurationProps {
  formData: CreateInstanceOptions
  onBackendFieldChange: (key: string, value: any) => void
  showAdvanced?: boolean
}

const BackendConfiguration: React.FC<BackendConfigurationProps> = ({
  formData,
  onBackendFieldChange,
  showAdvanced = false
}) => {
  const basicBackendFields = getBasicBackendFields(formData.backend_type)
  const advancedBackendFields = getAdvancedBackendFields(formData.backend_type)

  return (
    <div className="space-y-4">
      <h3 className="text-lg font-medium">Backend Configuration</h3>

      {/* Basic backend fields */}
      {basicBackendFields.map((fieldKey) => (
        <BackendFormField
          key={fieldKey}
          fieldKey={fieldKey}
          value={(formData.backend_options as any)?.[fieldKey]}
          onChange={onBackendFieldChange}
        />
      ))}

      {/* Advanced backend fields */}
      {showAdvanced && advancedBackendFields.length > 0 && (
        <div className="space-y-4 pl-6 border-l-2 border-muted">
          <h4 className="text-md font-medium">Advanced Backend Configuration</h4>
          {advancedBackendFields
            .sort()
            .map((fieldKey) => (
              <BackendFormField
                key={fieldKey}
                fieldKey={fieldKey}
                value={(formData.backend_options as any)?.[fieldKey]}
                onChange={onBackendFieldChange}
              />
            ))}
        </div>
      )}
    </div>
  )
}

export default BackendConfiguration
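Because both field lists are derived from `formData.backend_type`, switching the backend in the parent form re-renders this component with a completely different set of inputs. A sketch of mounting it for a vLLM instance (the logging handler is a stand-in for real state management):

```tsx
import BackendConfiguration from '@/components/instance/BackendConfiguration'

// Hypothetical usage: basic vLLM fields plus the advanced section.
const Example = () => (
  <BackendConfiguration
    formData={{ backend_type: 'vllm', backend_options: { tensor_parallel_size: 2 } }}
    onBackendFieldChange={(key, value) => console.log('option changed:', key, value)}
    showAdvanced
  />
)
```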
99  webui/src/components/instance/BasicInstanceFields.tsx  Normal file
@@ -0,0 +1,99 @@
import React from 'react'
import { BackendType, type CreateInstanceOptions } from '@/types/instance'
import { getBasicFields, basicFieldsConfig } from '@/lib/zodFormUtils'
import { getFieldType } from '@/schemas/instanceOptions'
import TextInput from '@/components/form/TextInput'
import NumberInput from '@/components/form/NumberInput'
import CheckboxInput from '@/components/form/CheckboxInput'
import SelectInput from '@/components/form/SelectInput'

interface BasicInstanceFieldsProps {
  formData: CreateInstanceOptions
  onChange: (key: keyof CreateInstanceOptions, value: any) => void
}

const BasicInstanceFields: React.FC<BasicInstanceFieldsProps> = ({
  formData,
  onChange
}) => {
  const basicFields = getBasicFields()

  const renderField = (fieldKey: keyof CreateInstanceOptions) => {
    const config = basicFieldsConfig[fieldKey as string] || { label: fieldKey }
    const fieldType = getFieldType(fieldKey)

    // Special handling for backend_type field
    if (fieldKey === 'backend_type') {
      return (
        <SelectInput
          key={fieldKey}
          id={fieldKey}
          label={config.label}
          value={formData[fieldKey] || BackendType.LLAMA_CPP}
          onChange={(value) => onChange(fieldKey, value)}
          options={[
            { value: BackendType.LLAMA_CPP, label: 'Llama Server' },
            { value: BackendType.MLX_LM, label: 'MLX LM' },
            { value: BackendType.VLLM, label: 'vLLM' }
          ]}
          description={config.description}
        />
      )
    }

    // Render based on field type
    switch (fieldType) {
      case 'boolean':
        return (
          <CheckboxInput
            key={fieldKey}
            id={fieldKey}
            label={config.label}
            value={formData[fieldKey] as boolean | undefined}
            onChange={(value) => onChange(fieldKey, value)}
            description={config.description}
          />
        )

      case 'number':
        return (
          <NumberInput
            key={fieldKey}
            id={fieldKey}
            label={config.label}
            value={formData[fieldKey] as number | undefined}
            onChange={(value) => onChange(fieldKey, value)}
            placeholder={config.placeholder}
            description={config.description}
          />
        )

      default:
        return (
          <TextInput
            key={fieldKey}
            id={fieldKey}
            label={config.label}
            value={formData[fieldKey] as string | number | undefined}
            onChange={(value) => onChange(fieldKey, value)}
            placeholder={config.placeholder}
            description={config.description}
          />
        )
    }
  }

  // Filter out auto restart fields and backend_options (handled separately)
  const fieldsToRender = basicFields.filter(
    fieldKey => !['auto_restart', 'max_restarts', 'restart_delay', 'backend_options'].includes(fieldKey as string)
  )

  return (
    <div className="space-y-4">
      <h3 className="text-lg font-medium">Basic Configuration</h3>
      {fieldsToRender.map(renderField)}
    </div>
  )
}

export default BasicInstanceFields
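`renderField` dispatches purely on `getFieldType`, so adding a new top-level option to `CreateInstanceOptionsSchema` automatically yields a working input; only `backend_type` is special-cased into the three-entry select above. A sketch of the expected mappings, assuming the schema shapes shown later in this diff:

```ts
import { getFieldType } from '@/schemas/instanceOptions'

getFieldType('on_demand_start') // 'boolean' -> CheckboxInput
getFieldType('max_restarts')    // 'number'  -> NumberInput
```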
webui/src/lib/api.ts
@@ -1,4 +1,5 @@
 import type { CreateInstanceOptions, Instance } from "@/types/instance";
+import { handleApiError } from "./errorUtils";
 
 const API_BASE = "/api/v1";
 
@@ -30,25 +31,8 @@ async function apiCall<T>(
     headers,
   });
 
-  // Handle authentication errors
-  if (response.status === 401) {
-    throw new Error('Authentication required');
-  }
-
-  if (!response.ok) {
-    // Try to get error message from response
-    let errorMessage = `HTTP ${response.status}`;
-    try {
-      const errorText = await response.text();
-      if (errorText) {
-        errorMessage += `: ${errorText}`;
-      }
-    } catch {
-      // If we can't read the error, just use status
-    }
-
-    throw new Error(errorMessage);
-  }
+  // Handle errors using centralized error handler
+  await handleApiError(response);
 
   // Handle empty responses (like DELETE)
   if (response.status === 204) {
@@ -60,6 +44,14 @@ async function apiCall<T>(
     const text = await response.text();
     return text as T;
   } else {
+    // Handle empty responses for JSON endpoints
+    const contentLength = response.headers.get('content-length');
+    if (contentLength === '0' || contentLength === null) {
+      const text = await response.text();
+      if (text.trim() === '') {
+        return {} as T; // Return empty object for empty JSON responses
+      }
+    }
     const data = await response.json() as T;
     return data;
   }
@@ -101,6 +93,14 @@ export const backendsApi = {
       body: JSON.stringify({ command }),
     }),
   },
+  vllm: {
+    // POST /backends/vllm/parse-command
+    parseCommand: (command: string) =>
+      apiCall<CreateInstanceOptions>('/backends/vllm/parse-command', {
+        method: 'POST',
+        body: JSON.stringify({ command }),
+      }),
+  },
 };
 
 // Instance API functions
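The new `vllm` entry mirrors the existing llama.cpp parser endpoint, so the web UI can turn a pasted `vllm serve` command into structured instance options. A sketch (the command string is illustrative):

```ts
import { backendsApi } from '@/lib/api'

const options = await backendsApi.vllm.parseCommand(
  'vllm serve microsoft/DialoGPT-medium --tensor-parallel-size 2'
)
// Expected: options.backend_type === 'vllm' with backend_options populated from the flags.
```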
32  webui/src/lib/errorUtils.ts  Normal file
@@ -0,0 +1,32 @@
/**
 * Parses error response from API calls and returns a formatted error message
 */
export async function parseErrorResponse(response: Response): Promise<string> {
  let errorMessage = `HTTP ${response.status}`

  try {
    const errorText = await response.text()
    if (errorText) {
      errorMessage += `: ${errorText}`
    }
  } catch {
    // If we can't read the error, just use status
  }

  return errorMessage
}

/**
 * Handles common API call errors and throws appropriate Error objects
 */
export async function handleApiError(response: Response): Promise<void> {
  // Handle authentication errors
  if (response.status === 401) {
    throw new Error('Authentication required')
  }

  if (!response.ok) {
    const errorMessage = await parseErrorResponse(response)
    throw new Error(errorMessage)
  }
}
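With both helpers exported, any other fetch-based call site can reuse the same 401 and non-2xx handling that `apiCall` now delegates to. A minimal sketch of reuse outside `apiCall` (the `getJson` wrapper is hypothetical):

```ts
import { handleApiError } from '@/lib/errorUtils'

async function getJson<T>(url: string): Promise<T> {
  const response = await fetch(url)
  // Throws 'Authentication required' on 401, or 'HTTP <status>: <body>' on any other failure.
  await handleApiError(response)
  return await response.json() as T
}
```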
webui/src/lib/zodFormUtils.ts
@@ -2,13 +2,17 @@ import {
   type CreateInstanceOptions,
   type LlamaCppBackendOptions,
   type MlxBackendOptions,
+  type VllmBackendOptions,
   LlamaCppBackendOptionsSchema,
   MlxBackendOptionsSchema,
+  VllmBackendOptionsSchema,
   getAllFieldKeys,
   getAllLlamaCppFieldKeys,
   getAllMlxFieldKeys,
+  getAllVllmFieldKeys,
   getLlamaCppFieldType,
-  getMlxFieldType
+  getMlxFieldType,
+  getVllmFieldType
 } from '@/schemas/instanceOptions'
 
 // Instance-level basic fields (not backend-specific)
@@ -16,7 +20,6 @@ export const basicFieldsConfig: Record<string, {
   label: string
   description?: string
   placeholder?: string
-  required?: boolean
 }> = {
   auto_restart: {
     label: 'Auto Restart',
@@ -52,13 +55,11 @@ const basicLlamaCppFieldsConfig: Record<string, {
   label: string
   description?: string
   placeholder?: string
-  required?: boolean
 }> = {
   model: {
     label: 'Model Path',
     placeholder: '/path/to/model.gguf',
-    description: 'Path to the model file',
-    required: true
+    description: 'Path to the model file'
   },
   hf_repo: {
     label: 'Hugging Face Repository',
@@ -82,13 +83,11 @@ const basicMlxFieldsConfig: Record<string, {
   label: string
   description?: string
   placeholder?: string
-  required?: boolean
 }> = {
   model: {
     label: 'Model',
     placeholder: 'mlx-community/Mistral-7B-Instruct-v0.3-4bit',
-    description: 'The path to the MLX model weights, tokenizer, and config',
-    required: true
+    description: 'The path to the MLX model weights, tokenizer, and config'
   },
   temp: {
     label: 'Temperature',
@@ -117,11 +116,46 @@ const basicMlxFieldsConfig: Record<string, {
   }
 }
 
+// vLLM backend-specific basic fields
+const basicVllmFieldsConfig: Record<string, {
+  label: string
+  description?: string
+  placeholder?: string
+}> = {
+  model: {
+    label: 'Model',
+    placeholder: 'microsoft/DialoGPT-medium',
+    description: 'The name or path of the Hugging Face model to use'
+  },
+  tensor_parallel_size: {
+    label: 'Tensor Parallel Size',
+    placeholder: '1',
+    description: 'Number of GPUs to use for distributed serving'
+  },
+  gpu_memory_utilization: {
+    label: 'GPU Memory Utilization',
+    placeholder: '0.9',
+    description: 'The fraction of GPU memory to be used for the model executor'
+  }
+}
+
+// Backend field configuration lookup
+const backendFieldConfigs = {
+  mlx_lm: basicMlxFieldsConfig,
+  vllm: basicVllmFieldsConfig,
+  llama_cpp: basicLlamaCppFieldsConfig,
+} as const
+
+const backendFieldGetters = {
+  mlx_lm: getAllMlxFieldKeys,
+  vllm: getAllVllmFieldKeys,
+  llama_cpp: getAllLlamaCppFieldKeys,
+} as const
+
 function isBasicField(key: keyof CreateInstanceOptions): boolean {
   return key in basicFieldsConfig
 }
 
 
 export function getBasicFields(): (keyof CreateInstanceOptions)[] {
   return Object.keys(basicFieldsConfig) as (keyof CreateInstanceOptions)[]
 }
@@ -130,25 +164,18 @@ export function getAdvancedFields(): (keyof CreateInstanceOptions)[] {
   return getAllFieldKeys().filter(key => !isBasicField(key))
 }
 
 
 export function getBasicBackendFields(backendType?: string): string[] {
-  if (backendType === 'mlx_lm') {
-    return Object.keys(basicMlxFieldsConfig)
-  } else if (backendType === 'llama_cpp') {
-    return Object.keys(basicLlamaCppFieldsConfig)
-  }
-  // Default to LlamaCpp for backward compatibility
-  return Object.keys(basicLlamaCppFieldsConfig)
+  const normalizedType = (backendType || 'llama_cpp') as keyof typeof backendFieldConfigs
+  const config = backendFieldConfigs[normalizedType] || basicLlamaCppFieldsConfig
+  return Object.keys(config)
 }
 
 export function getAdvancedBackendFields(backendType?: string): string[] {
-  if (backendType === 'mlx_lm') {
-    return getAllMlxFieldKeys().filter(key => !(key in basicMlxFieldsConfig))
-  } else if (backendType === 'llama_cpp') {
-    return getAllLlamaCppFieldKeys().filter(key => !(key in basicLlamaCppFieldsConfig))
-  }
-  // Default to LlamaCpp for backward compatibility
-  return getAllLlamaCppFieldKeys().filter(key => !(key in basicLlamaCppFieldsConfig))
+  const normalizedType = (backendType || 'llama_cpp') as keyof typeof backendFieldGetters
+  const fieldGetter = backendFieldGetters[normalizedType] || getAllLlamaCppFieldKeys
+  const basicConfig = backendFieldConfigs[normalizedType] || basicLlamaCppFieldsConfig
+
+  return fieldGetter().filter(key => !(key in basicConfig))
 }
 
 // Combined backend fields config for use in BackendFormField
@@ -156,10 +183,10 @@ export const basicBackendFieldsConfig: Record<string, {
   label: string
   description?: string
   placeholder?: string
-  required?: boolean
 }> = {
   ...basicLlamaCppFieldsConfig,
-  ...basicMlxFieldsConfig
+  ...basicMlxFieldsConfig,
+  ...basicVllmFieldsConfig
 }
 
 // Get field type for any backend option (union type)
@@ -182,6 +209,15 @@ export function getBackendFieldType(key: string): 'text' | 'number' | 'boolean'
     // Schema might not be available
   }
 
+  // Try vLLM schema
+  try {
+    if (VllmBackendOptionsSchema.shape && key in VllmBackendOptionsSchema.shape) {
+      return getVllmFieldType(key as keyof VllmBackendOptions)
+    }
+  } catch {
+    // Schema might not be available
+  }
+
   // Default fallback
   return 'text'
 }
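The table-driven rewrite makes the basic/advanced split for each backend a pure data lookup; unknown or missing backend types still fall back to the llama.cpp field set. Expected behavior for vLLM, given the config above:

```ts
import { getBasicBackendFields, getAdvancedBackendFields } from '@/lib/zodFormUtils'

getBasicBackendFields('vllm')
// -> ['model', 'tensor_parallel_size', 'gpu_memory_utilization']

getAdvancedBackendFields('vllm')
// -> every other key of VllmBackendOptionsSchema, e.g. 'dtype', 'max_model_len', 'swap_space'
```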
4  webui/src/schemas/backends/index.ts  Normal file
@@ -0,0 +1,4 @@
// Re-export all backend schemas from one place
export * from './llamacpp'
export * from './mlx'
export * from './vllm'
192  webui/src/schemas/backends/llamacpp.ts  Normal file
@@ -0,0 +1,192 @@
import { z } from 'zod'

// Define the LlamaCpp backend options schema
export const LlamaCppBackendOptionsSchema = z.object({
  // Common params
  verbose_prompt: z.boolean().optional(),
  threads: z.number().optional(),
  threads_batch: z.number().optional(),
  cpu_mask: z.string().optional(),
  cpu_range: z.string().optional(),
  cpu_strict: z.number().optional(),
  prio: z.number().optional(),
  poll: z.number().optional(),
  cpu_mask_batch: z.string().optional(),
  cpu_range_batch: z.string().optional(),
  cpu_strict_batch: z.number().optional(),
  prio_batch: z.number().optional(),
  poll_batch: z.number().optional(),
  ctx_size: z.number().optional(),
  predict: z.number().optional(),
  batch_size: z.number().optional(),
  ubatch_size: z.number().optional(),
  keep: z.number().optional(),
  flash_attn: z.boolean().optional(),
  no_perf: z.boolean().optional(),
  escape: z.boolean().optional(),
  no_escape: z.boolean().optional(),
  rope_scaling: z.string().optional(),
  rope_scale: z.number().optional(),
  rope_freq_base: z.number().optional(),
  rope_freq_scale: z.number().optional(),
  yarn_orig_ctx: z.number().optional(),
  yarn_ext_factor: z.number().optional(),
  yarn_attn_factor: z.number().optional(),
  yarn_beta_slow: z.number().optional(),
  yarn_beta_fast: z.number().optional(),
  dump_kv_cache: z.boolean().optional(),
  no_kv_offload: z.boolean().optional(),
  cache_type_k: z.string().optional(),
  cache_type_v: z.string().optional(),
  defrag_thold: z.number().optional(),
  parallel: z.number().optional(),
  mlock: z.boolean().optional(),
  no_mmap: z.boolean().optional(),
  numa: z.string().optional(),
  device: z.string().optional(),
  override_tensor: z.array(z.string()).optional(),
  gpu_layers: z.number().optional(),
  split_mode: z.string().optional(),
  tensor_split: z.string().optional(),
  main_gpu: z.number().optional(),
  check_tensors: z.boolean().optional(),
  override_kv: z.array(z.string()).optional(),
  lora: z.array(z.string()).optional(),
  lora_scaled: z.array(z.string()).optional(),
  control_vector: z.array(z.string()).optional(),
  control_vector_scaled: z.array(z.string()).optional(),
  control_vector_layer_range: z.string().optional(),
  model: z.string().optional(),
  model_url: z.string().optional(),
  hf_repo: z.string().optional(),
  hf_repo_draft: z.string().optional(),
  hf_file: z.string().optional(),
  hf_repo_v: z.string().optional(),
  hf_file_v: z.string().optional(),
  hf_token: z.string().optional(),
  log_disable: z.boolean().optional(),
  log_file: z.string().optional(),
  log_colors: z.boolean().optional(),
  verbose: z.boolean().optional(),
  verbosity: z.number().optional(),
  log_prefix: z.boolean().optional(),
  log_timestamps: z.boolean().optional(),

  // Sampling params
  samplers: z.string().optional(),
  seed: z.number().optional(),
  sampling_seq: z.string().optional(),
  ignore_eos: z.boolean().optional(),
  temp: z.number().optional(),
  top_k: z.number().optional(),
  top_p: z.number().optional(),
  min_p: z.number().optional(),
  xtc_probability: z.number().optional(),
  xtc_threshold: z.number().optional(),
  typical: z.number().optional(),
  repeat_last_n: z.number().optional(),
  repeat_penalty: z.number().optional(),
  presence_penalty: z.number().optional(),
  frequency_penalty: z.number().optional(),
  dry_multiplier: z.number().optional(),
  dry_base: z.number().optional(),
  dry_allowed_length: z.number().optional(),
  dry_penalty_last_n: z.number().optional(),
  dry_sequence_breaker: z.array(z.string()).optional(),
  dynatemp_range: z.number().optional(),
  dynatemp_exp: z.number().optional(),
  mirostat: z.number().optional(),
  mirostat_lr: z.number().optional(),
  mirostat_ent: z.number().optional(),
  logit_bias: z.array(z.string()).optional(),
  grammar: z.string().optional(),
  grammar_file: z.string().optional(),
  json_schema: z.string().optional(),
  json_schema_file: z.string().optional(),

  // Example-specific params
  no_context_shift: z.boolean().optional(),
  special: z.boolean().optional(),
  no_warmup: z.boolean().optional(),
  spm_infill: z.boolean().optional(),
  pooling: z.string().optional(),
  cont_batching: z.boolean().optional(),
  no_cont_batching: z.boolean().optional(),
  mmproj: z.string().optional(),
  mmproj_url: z.string().optional(),
  no_mmproj: z.boolean().optional(),
  no_mmproj_offload: z.boolean().optional(),
  alias: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),
  path: z.string().optional(),
  no_webui: z.boolean().optional(),
  embedding: z.boolean().optional(),
  reranking: z.boolean().optional(),
  api_key: z.string().optional(),
  api_key_file: z.string().optional(),
  ssl_key_file: z.string().optional(),
  ssl_cert_file: z.string().optional(),
  chat_template_kwargs: z.string().optional(),
  timeout: z.number().optional(),
  threads_http: z.number().optional(),
  cache_reuse: z.number().optional(),
  metrics: z.boolean().optional(),
  slots: z.boolean().optional(),
  props: z.boolean().optional(),
  no_slots: z.boolean().optional(),
  slot_save_path: z.string().optional(),
  jinja: z.boolean().optional(),
  reasoning_format: z.string().optional(),
  reasoning_budget: z.number().optional(),
  chat_template: z.string().optional(),
  chat_template_file: z.string().optional(),
  no_prefill_assistant: z.boolean().optional(),
  slot_prompt_similarity: z.number().optional(),
  lora_init_without_apply: z.boolean().optional(),
  draft_max: z.number().optional(),
  draft_min: z.number().optional(),
  draft_p_min: z.number().optional(),
  ctx_size_draft: z.number().optional(),
  device_draft: z.string().optional(),
  gpu_layers_draft: z.number().optional(),
  model_draft: z.string().optional(),
  cache_type_k_draft: z.string().optional(),
  cache_type_v_draft: z.string().optional(),

  // Audio/TTS params
  model_vocoder: z.string().optional(),
  tts_use_guide_tokens: z.boolean().optional(),

  // Default model params
  embd_bge_small_en_default: z.boolean().optional(),
  embd_e5_small_en_default: z.boolean().optional(),
  embd_gte_small_default: z.boolean().optional(),
  fim_qwen_1_5b_default: z.boolean().optional(),
  fim_qwen_3b_default: z.boolean().optional(),
  fim_qwen_7b_default: z.boolean().optional(),
  fim_qwen_7b_spec: z.boolean().optional(),
  fim_qwen_14b_spec: z.boolean().optional(),
})

// Infer the TypeScript type from the schema
export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>

// Helper to get all LlamaCpp backend option field keys
export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
  return Object.keys(LlamaCppBackendOptionsSchema.shape) as (keyof LlamaCppBackendOptions)[]
}

// Get field type for LlamaCpp backend options
export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const fieldSchema = LlamaCppBackendOptionsSchema.shape[key]
  if (!fieldSchema) return 'text'

  // Handle ZodOptional wrapper
  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema

  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
  if (innerSchema instanceof z.ZodNumber) return 'number'
  if (innerSchema instanceof z.ZodArray) return 'array'
  return 'text' // ZodString and others default to text
}
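`getLlamaCppFieldType` drives which form control the UI renders: it unwraps the `.optional()` layer and inspects the inner Zod type. A few expected mappings from the schema above:

```ts
import { getLlamaCppFieldType } from '@/schemas/backends/llamacpp'

getLlamaCppFieldType('flash_attn') // 'boolean'
getLlamaCppFieldType('ctx_size')   // 'number'
getLlamaCppFieldType('lora')       // 'array' (z.array(z.string()))
getLlamaCppFieldType('hf_repo')    // 'text'  (strings fall through to the default)
```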
51  webui/src/schemas/backends/mlx.ts  Normal file
@@ -0,0 +1,51 @@
import { z } from 'zod'

// Define the MLX backend options schema
export const MlxBackendOptionsSchema = z.object({
  // Basic connection options
  model: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),

  // Model and adapter options
  adapter_path: z.string().optional(),
  draft_model: z.string().optional(),
  num_draft_tokens: z.number().optional(),
  trust_remote_code: z.boolean().optional(),

  // Logging and templates
  log_level: z.enum(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']).optional(),
  chat_template: z.string().optional(),
  use_default_chat_template: z.boolean().optional(),
  chat_template_args: z.string().optional(), // JSON string

  // Sampling defaults
  temp: z.number().optional(), // Note: MLX uses "temp" not "temperature"
  top_p: z.number().optional(),
  top_k: z.number().optional(),
  min_p: z.number().optional(),
  max_tokens: z.number().optional(),
})

// Infer the TypeScript type from the schema
export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>

// Helper to get all MLX backend option field keys
export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
  return Object.keys(MlxBackendOptionsSchema.shape) as (keyof MlxBackendOptions)[]
}

// Get field type for MLX backend options
export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const fieldSchema = MlxBackendOptionsSchema.shape[key]
  if (!fieldSchema) return 'text'

  // Handle ZodOptional wrapper
  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema

  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
  if (innerSchema instanceof z.ZodNumber) return 'number'
  if (innerSchema instanceof z.ZodArray) return 'array'
  if (innerSchema instanceof z.ZodEnum) return 'text' // Enum treated as text/select
  return 'text' // ZodString and others default to text
}
150  webui/src/schemas/backends/vllm.ts  Normal file
@@ -0,0 +1,150 @@
import { z } from 'zod'

// Define the vLLM backend options schema
export const VllmBackendOptionsSchema = z.object({
  // Basic connection options (auto-assigned by llamactl)
  host: z.string().optional(),
  port: z.number().optional(),

  // Model and engine configuration
  model: z.string().optional(),
  tokenizer: z.string().optional(),
  skip_tokenizer_init: z.boolean().optional(),
  revision: z.string().optional(),
  code_revision: z.string().optional(),
  tokenizer_revision: z.string().optional(),
  tokenizer_mode: z.string().optional(),
  trust_remote_code: z.boolean().optional(),
  download_dir: z.string().optional(),
  load_format: z.string().optional(),
  config_format: z.string().optional(),
  dtype: z.string().optional(),
  kv_cache_dtype: z.string().optional(),
  quantization_param_path: z.string().optional(),
  seed: z.number().optional(),
  max_model_len: z.number().optional(),
  guided_decoding_backend: z.string().optional(),
  distributed_executor_backend: z.string().optional(),
  worker_use_ray: z.boolean().optional(),
  ray_workers_use_nsight: z.boolean().optional(),

  // Performance and serving configuration
  block_size: z.number().optional(),
  enable_prefix_caching: z.boolean().optional(),
  disable_sliding_window: z.boolean().optional(),
  use_v2_block_manager: z.boolean().optional(),
  num_lookahead_slots: z.number().optional(),
  swap_space: z.number().optional(),
  cpu_offload_gb: z.number().optional(),
  gpu_memory_utilization: z.number().optional(),
  num_gpu_blocks_override: z.number().optional(),
  max_num_batched_tokens: z.number().optional(),
  max_num_seqs: z.number().optional(),
  max_logprobs: z.number().optional(),
  disable_log_stats: z.boolean().optional(),
  quantization: z.string().optional(),
  rope_scaling: z.string().optional(),
  rope_theta: z.number().optional(),
  enforce_eager: z.boolean().optional(),
  max_context_len_to_capture: z.number().optional(),
  max_seq_len_to_capture: z.number().optional(),
  disable_custom_all_reduce: z.boolean().optional(),
  tokenizer_pool_size: z.number().optional(),
  tokenizer_pool_type: z.string().optional(),
  tokenizer_pool_extra_config: z.string().optional(),
  enable_lora_bias: z.boolean().optional(),
  lora_extra_vocab_size: z.number().optional(),
  lora_rank: z.number().optional(),
  prompt_lookback_distance: z.number().optional(),
  preemption_mode: z.string().optional(),

  // Distributed and parallel processing
  tensor_parallel_size: z.number().optional(),
  pipeline_parallel_size: z.number().optional(),
  max_parallel_loading_workers: z.number().optional(),
  disable_async_output_proc: z.boolean().optional(),
  worker_class: z.string().optional(),
  enabled_lora_modules: z.string().optional(),
  max_lora_rank: z.number().optional(),
  fully_sharded_loras: z.boolean().optional(),
  lora_modules: z.string().optional(),
  prompt_adapters: z.string().optional(),
  max_prompt_adapter_token: z.number().optional(),
  device: z.string().optional(),
  scheduler_delay: z.number().optional(),
  enable_chunked_prefill: z.boolean().optional(),
  speculative_model: z.string().optional(),
  speculative_model_quantization: z.string().optional(),
  speculative_revision: z.string().optional(),
  speculative_max_model_len: z.number().optional(),
  speculative_disable_by_batch_size: z.number().optional(),
  ngpt_speculative_length: z.number().optional(),
  speculative_disable_mqa: z.boolean().optional(),
  model_loader_extra_config: z.string().optional(),
  ignore_patterns: z.string().optional(),
  preloaded_lora_modules: z.string().optional(),

  // OpenAI server specific options
  uds: z.string().optional(),
  uvicorn_log_level: z.string().optional(),
  response_role: z.string().optional(),
  ssl_keyfile: z.string().optional(),
  ssl_certfile: z.string().optional(),
  ssl_ca_certs: z.string().optional(),
  ssl_cert_reqs: z.number().optional(),
  root_path: z.string().optional(),
  middleware: z.array(z.string()).optional(),
  return_tokens_as_token_ids: z.boolean().optional(),
  disable_frontend_multiprocessing: z.boolean().optional(),
  enable_auto_tool_choice: z.boolean().optional(),
  tool_call_parser: z.string().optional(),
  tool_server: z.string().optional(),
  chat_template: z.string().optional(),
  chat_template_content_format: z.string().optional(),
  allow_credentials: z.boolean().optional(),
  allowed_origins: z.array(z.string()).optional(),
  allowed_methods: z.array(z.string()).optional(),
  allowed_headers: z.array(z.string()).optional(),
  api_key: z.array(z.string()).optional(),
  enable_log_outputs: z.boolean().optional(),
  enable_token_usage: z.boolean().optional(),
  enable_async_engine_debug: z.boolean().optional(),
  engine_use_ray: z.boolean().optional(),
  disable_log_requests: z.boolean().optional(),
  max_log_len: z.number().optional(),

  // Additional engine configuration
  task: z.string().optional(),
  multi_modal_config: z.string().optional(),
  limit_mm_per_prompt: z.string().optional(),
  enable_sleep_mode: z.boolean().optional(),
  enable_chunking_request: z.boolean().optional(),
  compilation_config: z.string().optional(),
  disable_sliding_window_mask: z.boolean().optional(),
  enable_trtllm_engine_latency: z.boolean().optional(),
  override_pooling_config: z.string().optional(),
  override_neuron_config: z.string().optional(),
  override_kv_cache_align_size: z.number().optional(),
})

// Infer the TypeScript type from the schema
export type VllmBackendOptions = z.infer<typeof VllmBackendOptionsSchema>

// Helper to get all vLLM backend option field keys
export function getAllVllmFieldKeys(): (keyof VllmBackendOptions)[] {
  return Object.keys(VllmBackendOptionsSchema.shape) as (keyof VllmBackendOptions)[]
}

// Get field type for vLLM backend options
export function getVllmFieldType(key: keyof VllmBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const fieldSchema = VllmBackendOptionsSchema.shape[key]
  if (!fieldSchema) return 'text'

  // Handle ZodOptional wrapper
  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema

  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
  if (innerSchema instanceof z.ZodNumber) return 'number'
  if (innerSchema instanceof z.ZodArray) return 'array'
  return 'text' // ZodString and others default to text
}
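Since every key is optional, the schema is useful mainly for type-checking and for catching malformed values before a create request; note that a plain `z.object` silently strips unknown keys rather than rejecting them. A validation sketch:

```ts
import { VllmBackendOptionsSchema } from '@/schemas/backends/vllm'

const result = VllmBackendOptionsSchema.safeParse({
  model: 'microsoft/DialoGPT-medium',
  tensor_parallel_size: 2,
  gpu_memory_utilization: 0.9,
})
if (result.success) {
  // result.data is typed as VllmBackendOptions
} else {
  console.error(result.error.issues)
}
```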
webui/src/schemas/instanceOptions.ts
@@ -1,206 +1,27 @@
 import { BackendType } from '@/types/instance'
 import { z } from 'zod'
 
-// Define the LlamaCpp backend options schema
-export const LlamaCppBackendOptionsSchema = z.object({
-  /* ...removed verbatim; identical to the schema now in webui/src/schemas/backends/llamacpp.ts above... */
-})
-
-// Define the MLX backend options schema
-export const MlxBackendOptionsSchema = z.object({
-  /* ...removed verbatim; identical to the schema now in webui/src/schemas/backends/mlx.ts above... */
-})
+// Import backend schemas from separate files
+import {
+  LlamaCppBackendOptionsSchema,
+  type LlamaCppBackendOptions,
+  getAllLlamaCppFieldKeys,
+  getLlamaCppFieldType,
+  MlxBackendOptionsSchema,
+  type MlxBackendOptions,
+  getAllMlxFieldKeys,
+  getMlxFieldType,
+  VllmBackendOptionsSchema,
+  type VllmBackendOptions,
+  getAllVllmFieldKeys,
+  getVllmFieldType
+} from './backends'
+
+// Backend options union
+export const BackendOptionsSchema = z.union([
+  LlamaCppBackendOptionsSchema,
+  MlxBackendOptionsSchema,
+  VllmBackendOptionsSchema,
+])
 
 // Define the main create instance options schema
@@ -213,13 +34,27 @@ export const CreateInstanceOptionsSchema = z.object({
   on_demand_start: z.boolean().optional(),
 
   // Backend configuration
-  backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM]).optional(),
+  backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM, BackendType.VLLM]).optional(),
   backend_options: BackendOptionsSchema.optional(),
 })
 
+// Re-export types and schemas from backend files
+export {
+  LlamaCppBackendOptionsSchema,
+  MlxBackendOptionsSchema,
+  VllmBackendOptionsSchema,
+  type LlamaCppBackendOptions,
+  type MlxBackendOptions,
+  type VllmBackendOptions,
+  getAllLlamaCppFieldKeys,
+  getAllMlxFieldKeys,
+  getAllVllmFieldKeys,
+  getLlamaCppFieldType,
+  getMlxFieldType,
+  getVllmFieldType
+}
+
 // Infer the TypeScript types from the schemas
-export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>
-export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>
 export type BackendOptions = z.infer<typeof BackendOptionsSchema>
 export type CreateInstanceOptions = z.infer<typeof CreateInstanceOptionsSchema>
 
@@ -228,16 +63,6 @@ export function getAllFieldKeys(): (keyof CreateInstanceOptions)[] {
   return Object.keys(CreateInstanceOptionsSchema.shape) as (keyof CreateInstanceOptions)[]
 }
 
-// Helper to get all LlamaCpp backend option field keys
-export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
-  return Object.keys(LlamaCppBackendOptionsSchema.shape) as (keyof LlamaCppBackendOptions)[]
-}
-
-// Helper to get all MLX backend option field keys
-export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
-  return Object.keys(MlxBackendOptionsSchema.shape) as (keyof MlxBackendOptions)[]
-}
-
 // Get field type from Zod schema
 export function getFieldType(key: keyof CreateInstanceOptions): 'text' | 'number' | 'boolean' | 'array' | 'object' {
   const fieldSchema = CreateInstanceOptionsSchema.shape[key]
@@ -252,32 +77,3 @@ export function getFieldType(key: keyof CreateInstanceOptions): 'text' | 'number
   if (innerSchema instanceof z.ZodObject) return 'object'
   return 'text' // ZodString and others default to text
 }
-
-// Get field type for LlamaCpp backend options
-export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
-  const fieldSchema = LlamaCppBackendOptionsSchema.shape[key]
-  if (!fieldSchema) return 'text'
-
-  // Handle ZodOptional wrapper
-  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
-
-  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
-  if (innerSchema instanceof z.ZodNumber) return 'number'
-  if (innerSchema instanceof z.ZodArray) return 'array'
-  return 'text' // ZodString and others default to text
-}
-
-// Get field type for MLX backend options
-export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
-  const fieldSchema = MlxBackendOptionsSchema.shape[key]
-  if (!fieldSchema) return 'text'
-
-  // Handle ZodOptional wrapper
-  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
-
-  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
-  if (innerSchema instanceof z.ZodNumber) return 'number'
-  if (innerSchema instanceof z.ZodArray) return 'array'
-  if (innerSchema instanceof z.ZodEnum) return 'text' // Enum treated as text/select
-  return 'text' // ZodString and others default to text
-}
webui/src/types/instance.ts
@@ -5,6 +5,7 @@ export { type CreateInstanceOptions } from '@/schemas/instanceOptions'
 export const BackendType = {
   LLAMA_CPP: 'llama_cpp',
   MLX_LM: 'mlx_lm',
+  VLLM: 'vllm',
   // MLX_VLM: 'mlx_vlm', // Future expansion
 } as const
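End to end, this new enum member is what ties the UI, schema, and API together: the select input stores it, `CreateInstanceOptionsSchema` accepts it, and the server routes the instance to the vLLM backend. A final sketch of a payload that now validates:

```ts
import { BackendType } from '@/types/instance'
import { CreateInstanceOptionsSchema } from '@/schemas/instanceOptions'

const payload = CreateInstanceOptionsSchema.parse({
  backend_type: BackendType.VLLM, // 'vllm'
  backend_options: { model: 'microsoft/DialoGPT-medium', tensor_parallel_size: 2 },
})
```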