diff --git a/README.md b/README.md index 2a24520..31c827c 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ### 🔗 Universal Compatibility - **OpenAI API Compatible**: Drop-in replacement - route requests by model name -- **Multi-Backend Support**: Native support for both llama.cpp and MLX (Apple Silicon optimized) +- **Multi-Backend Support**: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM ### 🌐 User-Friendly Interface - **Web Dashboard**: Modern React UI for visual management (unlike CLI-only tools) @@ -31,6 +31,7 @@ # 1. Install backend (one-time setup) # For llama.cpp: https://github.com/ggml-org/llama.cpp#quick-start # For MLX on macOS: pip install mlx-lm +# For vLLM: pip install vllm # 2. Download and run llamactl LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') @@ -47,7 +48,7 @@ llamactl ### Create and manage instances via web dashboard: 1. Open http://localhost:8080 2. Click "Create Instance" -3. Choose backend type (llama.cpp or MLX) +3. Choose backend type (llama.cpp, MLX, or vLLM) 4. Set model path and backend-specific options 5. Start or stop the instance @@ -63,6 +64,11 @@ curl -X POST localhost:8080/api/v1/instances/my-mlx-model \ -H "Authorization: Bearer your-key" \ -d '{"backend_type": "mlx_lm", "backend_options": {"model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit"}}' +# Create vLLM instance +curl -X POST localhost:8080/api/v1/instances/my-vllm-model \ + -H "Authorization: Bearer your-key" \ + -d '{"backend_type": "vllm", "backend_options": {"model": "microsoft/DialoGPT-medium", "tensor_parallel_size": 2}}' + # Use with OpenAI SDK curl -X POST localhost:8080/v1/chat/completions \ -H "Authorization: Bearer your-key" \ @@ -121,6 +127,21 @@ source mlx-env/bin/activate pip install mlx-lm ``` +**For vLLM backend:** +You need vLLM installed: + +```bash +# Install via pip (requires Python 3.8+, GPU required) +pip install vllm + +# Or in a virtual environment (recommended) +python -m venv vllm-env +source vllm-env/bin/activate +pip install vllm + +# For production deployments, consider container-based installation +``` + ## Configuration llamactl works out of the box with sensible defaults. 
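If vLLM was installed into a virtual environment as recommended above, point llamactl at that environment's `vllm` binary via the `vllm_executable` setting or the `LLAMACTL_VLLM_EXECUTABLE` environment variable introduced in this change. A minimal sketch, assuming the `vllm-env` virtual environment from the install step (adjust the path for your setup):

```bash
# Point llamactl at the vllm binary inside the virtual environment
# (path assumes the vllm-env created in the install step above)
export LLAMACTL_VLLM_EXECUTABLE="$PWD/vllm-env/bin/vllm"

# Start llamactl; instances with backend_type "vllm" will launch this executable
llamactl
```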
@@ -135,6 +156,7 @@ server: backends: llama_executable: llama-server # Path to llama-server executable mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable + vllm_executable: vllm # Path to vllm executable instances: port_range: [8000, 9000] # Port range for instances diff --git a/apidocs/docs.go b/apidocs/docs.go index 7ea502e..93edced 100644 --- a/apidocs/docs.go +++ b/apidocs/docs.go @@ -19,6 +19,159 @@ const docTemplate = `{ "host": "{{.Host}}", "basePath": "{{.BasePath}}", "paths": { + "/backends/llama-cpp/parse-command": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Parses a llama-server command string into instance options", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "backends" + ], + "summary": "Parse llama-server command", + "parameters": [ + { + "description": "Command to parse", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/server.ParseCommandRequest" + } + } + ], + "responses": { + "200": { + "description": "Parsed options", + "schema": { + "$ref": "#/definitions/instance.CreateInstanceOptions" + } + }, + "400": { + "description": "Invalid request or command", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + } + } + } + }, + "/backends/mlx/parse-command": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Parses MLX-LM server command string into instance options", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "backends" + ], + "summary": "Parse mlx_lm.server command", + "parameters": [ + { + "description": "Command to parse", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/server.ParseCommandRequest" + } + } + ], + "responses": { + "200": { + "description": "Parsed options", + "schema": { + "$ref": "#/definitions/instance.CreateInstanceOptions" + } + }, + "400": { + "description": "Invalid request or command", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + } + } + } + }, + "/backends/vllm/parse-command": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Parses a vLLM serve command string into instance options", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "backends" + ], + "summary": "Parse vllm serve command", + "parameters": [ + { + "description": "Command to parse", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/server.ParseCommandRequest" + } + } + ], + "responses": { + "200": { + "description": "Parsed options", + "schema": { + "$ref": "#/definitions/instance.CreateInstanceOptions" + } + }, + "400": { + "description": "Invalid request or command", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + } + } + } + }, "/instances": { "get": { "security": [ @@ -681,522 +834,46 @@ const docTemplate = `{ } }, "definitions": { + "backends.BackendType": { + "type": "string", + "enum": [ + "llama_cpp", + "mlx_lm", + "vllm" + ], + "x-enum-varnames": [ + "BackendTypeLlamaCpp", + "BackendTypeMlxLm", + "BackendTypeVllm" + ] + }, "instance.CreateInstanceOptions": { "type": "object", "properties": { - "alias": { - 
"type": "string" - }, - "api_key": { - "type": "string" - }, - "api_key_file": { - "type": "string" - }, "auto_restart": { "description": "Auto restart", "type": "boolean" }, - "batch_size": { - "type": "integer" + "backend_options": { + "type": "object", + "additionalProperties": {} }, - "cache_reuse": { - "type": "integer" - }, - "cache_type_k": { - "type": "string" - }, - "cache_type_k_draft": { - "type": "string" - }, - "cache_type_v": { - "type": "string" - }, - "cache_type_v_draft": { - "type": "string" - }, - "chat_template": { - "type": "string" - }, - "chat_template_file": { - "type": "string" - }, - "chat_template_kwargs": { - "type": "string" - }, - "check_tensors": { - "type": "boolean" - }, - "cont_batching": { - "type": "boolean" - }, - "control_vector": { - "type": "array", - "items": { - "type": "string" - } - }, - "control_vector_layer_range": { - "type": "string" - }, - "control_vector_scaled": { - "type": "array", - "items": { - "type": "string" - } - }, - "cpu_mask": { - "type": "string" - }, - "cpu_mask_batch": { - "type": "string" - }, - "cpu_range": { - "type": "string" - }, - "cpu_range_batch": { - "type": "string" - }, - "cpu_strict": { - "type": "integer" - }, - "cpu_strict_batch": { - "type": "integer" - }, - "ctx_size": { - "type": "integer" - }, - "ctx_size_draft": { - "type": "integer" - }, - "defrag_thold": { - "type": "number" - }, - "device": { - "type": "string" - }, - "device_draft": { - "type": "string" - }, - "draft_max": { - "type": "integer" - }, - "draft_min": { - "type": "integer" - }, - "draft_p_min": { - "type": "number" - }, - "dry_allowed_length": { - "type": "integer" - }, - "dry_base": { - "type": "number" - }, - "dry_multiplier": { - "type": "number" - }, - "dry_penalty_last_n": { - "type": "integer" - }, - "dry_sequence_breaker": { - "type": "array", - "items": { - "type": "string" - } - }, - "dump_kv_cache": { - "type": "boolean" - }, - "dynatemp_exp": { - "type": "number" - }, - "dynatemp_range": { - "type": "number" - }, - "embd_bge_small_en_default": { - "description": "Default model params", - "type": "boolean" - }, - "embd_e5_small_en_default": { - "type": "boolean" - }, - "embd_gte_small_default": { - "type": "boolean" - }, - "embedding": { - "type": "boolean" - }, - "escape": { - "type": "boolean" - }, - "fim_qwen_14b_spec": { - "type": "boolean" - }, - "fim_qwen_1_5b_default": { - "type": "boolean" - }, - "fim_qwen_3b_default": { - "type": "boolean" - }, - "fim_qwen_7b_default": { - "type": "boolean" - }, - "fim_qwen_7b_spec": { - "type": "boolean" - }, - "flash_attn": { - "type": "boolean" - }, - "frequency_penalty": { - "type": "number" - }, - "gpu_layers": { - "type": "integer" - }, - "gpu_layers_draft": { - "type": "integer" - }, - "grammar": { - "type": "string" - }, - "grammar_file": { - "type": "string" - }, - "hf_file": { - "type": "string" - }, - "hf_file_v": { - "type": "string" - }, - "hf_repo": { - "type": "string" - }, - "hf_repo_draft": { - "type": "string" - }, - "hf_repo_v": { - "type": "string" - }, - "hf_token": { - "type": "string" - }, - "host": { - "type": "string" + "backend_type": { + "$ref": "#/definitions/backends.BackendType" }, "idle_timeout": { "description": "Idle timeout", "type": "integer" }, - "ignore_eos": { - "type": "boolean" - }, - "jinja": { - "type": "boolean" - }, - "json_schema": { - "type": "string" - }, - "json_schema_file": { - "type": "string" - }, - "keep": { - "type": "integer" - }, - "log_colors": { - "type": "boolean" - }, - "log_disable": { - "type": "boolean" - }, - "log_file": { - 
"type": "string" - }, - "log_prefix": { - "type": "boolean" - }, - "log_timestamps": { - "type": "boolean" - }, - "logit_bias": { - "type": "array", - "items": { - "type": "string" - } - }, - "lora": { - "type": "array", - "items": { - "type": "string" - } - }, - "lora_init_without_apply": { - "type": "boolean" - }, - "lora_scaled": { - "type": "array", - "items": { - "type": "string" - } - }, - "main_gpu": { - "type": "integer" - }, "max_restarts": { "type": "integer" }, - "metrics": { - "type": "boolean" - }, - "min_p": { - "type": "number" - }, - "mirostat": { - "type": "integer" - }, - "mirostat_ent": { - "type": "number" - }, - "mirostat_lr": { - "type": "number" - }, - "mlock": { - "type": "boolean" - }, - "mmproj": { - "type": "string" - }, - "mmproj_url": { - "type": "string" - }, - "model": { - "type": "string" - }, - "model_draft": { - "type": "string" - }, - "model_url": { - "type": "string" - }, - "model_vocoder": { - "description": "Audio/TTS params", - "type": "string" - }, - "no_cont_batching": { - "type": "boolean" - }, - "no_context_shift": { - "description": "Example-specific params", - "type": "boolean" - }, - "no_escape": { - "type": "boolean" - }, - "no_kv_offload": { - "type": "boolean" - }, - "no_mmap": { - "type": "boolean" - }, - "no_mmproj": { - "type": "boolean" - }, - "no_mmproj_offload": { - "type": "boolean" - }, - "no_perf": { - "type": "boolean" - }, - "no_prefill_assistant": { - "type": "boolean" - }, - "no_slots": { - "type": "boolean" - }, - "no_warmup": { - "type": "boolean" - }, - "no_webui": { - "type": "boolean" - }, - "numa": { - "type": "string" - }, "on_demand_start": { "description": "On demand start", "type": "boolean" }, - "override_kv": { - "type": "array", - "items": { - "type": "string" - } - }, - "override_tensor": { - "type": "array", - "items": { - "type": "string" - } - }, - "parallel": { - "type": "integer" - }, - "path": { - "type": "string" - }, - "poll": { - "type": "integer" - }, - "poll_batch": { - "type": "integer" - }, - "pooling": { - "type": "string" - }, - "port": { - "type": "integer" - }, - "predict": { - "type": "integer" - }, - "presence_penalty": { - "type": "number" - }, - "prio": { - "type": "integer" - }, - "prio_batch": { - "type": "integer" - }, - "props": { - "type": "boolean" - }, - "reasoning_budget": { - "type": "integer" - }, - "reasoning_format": { - "type": "string" - }, - "repeat_last_n": { - "type": "integer" - }, - "repeat_penalty": { - "type": "number" - }, - "reranking": { - "type": "boolean" - }, "restart_delay": { - "type": "integer" - }, - "rope_freq_base": { - "type": "number" - }, - "rope_freq_scale": { - "type": "number" - }, - "rope_scale": { - "type": "number" - }, - "rope_scaling": { - "type": "string" - }, - "samplers": { - "description": "Sampling params", - "type": "string" - }, - "sampling_seq": { - "type": "string" - }, - "seed": { - "type": "integer" - }, - "slot_prompt_similarity": { - "type": "number" - }, - "slot_save_path": { - "type": "string" - }, - "slots": { - "type": "boolean" - }, - "special": { - "type": "boolean" - }, - "split_mode": { - "type": "string" - }, - "spm_infill": { - "type": "boolean" - }, - "ssl_cert_file": { - "type": "string" - }, - "ssl_key_file": { - "type": "string" - }, - "temp": { - "type": "number" - }, - "tensor_split": { - "type": "string" - }, - "threads": { - "type": "integer" - }, - "threads_batch": { - "type": "integer" - }, - "threads_http": { - "type": "integer" - }, - "timeout": { - "type": "integer" - }, - "top_k": { - "type": "integer" - }, - 
"top_p": { - "type": "number" - }, - "tts_use_guide_tokens": { - "type": "boolean" - }, - "typical": { - "type": "number" - }, - "ubatch_size": { - "type": "integer" - }, - "verbose": { - "type": "boolean" - }, - "verbose_prompt": { - "description": "Common params", - "type": "boolean" - }, - "verbosity": { - "type": "integer" - }, - "xtc_probability": { - "type": "number" - }, - "xtc_threshold": { - "type": "number" - }, - "yarn_attn_factor": { - "type": "number" - }, - "yarn_beta_fast": { - "type": "number" - }, - "yarn_beta_slow": { - "type": "number" - }, - "yarn_ext_factor": { - "type": "number" - }, - "yarn_orig_ctx": { + "description": "seconds", "type": "integer" } } @@ -1264,6 +941,14 @@ const docTemplate = `{ "type": "string" } } + }, + "server.ParseCommandRequest": { + "type": "object", + "properties": { + "command": { + "type": "string" + } + } } } }` diff --git a/apidocs/swagger.json b/apidocs/swagger.json index be8d193..dc7f1c8 100644 --- a/apidocs/swagger.json +++ b/apidocs/swagger.json @@ -12,6 +12,159 @@ }, "basePath": "/api/v1", "paths": { + "/backends/llama-cpp/parse-command": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Parses a llama-server command string into instance options", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "backends" + ], + "summary": "Parse llama-server command", + "parameters": [ + { + "description": "Command to parse", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/server.ParseCommandRequest" + } + } + ], + "responses": { + "200": { + "description": "Parsed options", + "schema": { + "$ref": "#/definitions/instance.CreateInstanceOptions" + } + }, + "400": { + "description": "Invalid request or command", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + } + } + } + }, + "/backends/mlx/parse-command": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Parses MLX-LM server command string into instance options", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "backends" + ], + "summary": "Parse mlx_lm.server command", + "parameters": [ + { + "description": "Command to parse", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/server.ParseCommandRequest" + } + } + ], + "responses": { + "200": { + "description": "Parsed options", + "schema": { + "$ref": "#/definitions/instance.CreateInstanceOptions" + } + }, + "400": { + "description": "Invalid request or command", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + } + } + } + }, + "/backends/vllm/parse-command": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Parses a vLLM serve command string into instance options", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "backends" + ], + "summary": "Parse vllm serve command", + "parameters": [ + { + "description": "Command to parse", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/server.ParseCommandRequest" + } + } + ], + "responses": { + "200": { + "description": "Parsed options", + "schema": { + "$ref": "#/definitions/instance.CreateInstanceOptions" + 
} + }, + "400": { + "description": "Invalid request or command", + "schema": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + } + } + } + }, "/instances": { "get": { "security": [ @@ -674,522 +827,46 @@ } }, "definitions": { + "backends.BackendType": { + "type": "string", + "enum": [ + "llama_cpp", + "mlx_lm", + "vllm" + ], + "x-enum-varnames": [ + "BackendTypeLlamaCpp", + "BackendTypeMlxLm", + "BackendTypeVllm" + ] + }, "instance.CreateInstanceOptions": { "type": "object", "properties": { - "alias": { - "type": "string" - }, - "api_key": { - "type": "string" - }, - "api_key_file": { - "type": "string" - }, "auto_restart": { "description": "Auto restart", "type": "boolean" }, - "batch_size": { - "type": "integer" + "backend_options": { + "type": "object", + "additionalProperties": {} }, - "cache_reuse": { - "type": "integer" - }, - "cache_type_k": { - "type": "string" - }, - "cache_type_k_draft": { - "type": "string" - }, - "cache_type_v": { - "type": "string" - }, - "cache_type_v_draft": { - "type": "string" - }, - "chat_template": { - "type": "string" - }, - "chat_template_file": { - "type": "string" - }, - "chat_template_kwargs": { - "type": "string" - }, - "check_tensors": { - "type": "boolean" - }, - "cont_batching": { - "type": "boolean" - }, - "control_vector": { - "type": "array", - "items": { - "type": "string" - } - }, - "control_vector_layer_range": { - "type": "string" - }, - "control_vector_scaled": { - "type": "array", - "items": { - "type": "string" - } - }, - "cpu_mask": { - "type": "string" - }, - "cpu_mask_batch": { - "type": "string" - }, - "cpu_range": { - "type": "string" - }, - "cpu_range_batch": { - "type": "string" - }, - "cpu_strict": { - "type": "integer" - }, - "cpu_strict_batch": { - "type": "integer" - }, - "ctx_size": { - "type": "integer" - }, - "ctx_size_draft": { - "type": "integer" - }, - "defrag_thold": { - "type": "number" - }, - "device": { - "type": "string" - }, - "device_draft": { - "type": "string" - }, - "draft_max": { - "type": "integer" - }, - "draft_min": { - "type": "integer" - }, - "draft_p_min": { - "type": "number" - }, - "dry_allowed_length": { - "type": "integer" - }, - "dry_base": { - "type": "number" - }, - "dry_multiplier": { - "type": "number" - }, - "dry_penalty_last_n": { - "type": "integer" - }, - "dry_sequence_breaker": { - "type": "array", - "items": { - "type": "string" - } - }, - "dump_kv_cache": { - "type": "boolean" - }, - "dynatemp_exp": { - "type": "number" - }, - "dynatemp_range": { - "type": "number" - }, - "embd_bge_small_en_default": { - "description": "Default model params", - "type": "boolean" - }, - "embd_e5_small_en_default": { - "type": "boolean" - }, - "embd_gte_small_default": { - "type": "boolean" - }, - "embedding": { - "type": "boolean" - }, - "escape": { - "type": "boolean" - }, - "fim_qwen_14b_spec": { - "type": "boolean" - }, - "fim_qwen_1_5b_default": { - "type": "boolean" - }, - "fim_qwen_3b_default": { - "type": "boolean" - }, - "fim_qwen_7b_default": { - "type": "boolean" - }, - "fim_qwen_7b_spec": { - "type": "boolean" - }, - "flash_attn": { - "type": "boolean" - }, - "frequency_penalty": { - "type": "number" - }, - "gpu_layers": { - "type": "integer" - }, - "gpu_layers_draft": { - "type": "integer" - }, - "grammar": { - "type": "string" - }, - "grammar_file": { - "type": "string" - }, - "hf_file": { - "type": "string" - }, - "hf_file_v": { - "type": "string" - }, - "hf_repo": { - "type": "string" - }, - "hf_repo_draft": { - "type": "string" - }, - "hf_repo_v": { - "type": 
"string" - }, - "hf_token": { - "type": "string" - }, - "host": { - "type": "string" + "backend_type": { + "$ref": "#/definitions/backends.BackendType" }, "idle_timeout": { "description": "Idle timeout", "type": "integer" }, - "ignore_eos": { - "type": "boolean" - }, - "jinja": { - "type": "boolean" - }, - "json_schema": { - "type": "string" - }, - "json_schema_file": { - "type": "string" - }, - "keep": { - "type": "integer" - }, - "log_colors": { - "type": "boolean" - }, - "log_disable": { - "type": "boolean" - }, - "log_file": { - "type": "string" - }, - "log_prefix": { - "type": "boolean" - }, - "log_timestamps": { - "type": "boolean" - }, - "logit_bias": { - "type": "array", - "items": { - "type": "string" - } - }, - "lora": { - "type": "array", - "items": { - "type": "string" - } - }, - "lora_init_without_apply": { - "type": "boolean" - }, - "lora_scaled": { - "type": "array", - "items": { - "type": "string" - } - }, - "main_gpu": { - "type": "integer" - }, "max_restarts": { "type": "integer" }, - "metrics": { - "type": "boolean" - }, - "min_p": { - "type": "number" - }, - "mirostat": { - "type": "integer" - }, - "mirostat_ent": { - "type": "number" - }, - "mirostat_lr": { - "type": "number" - }, - "mlock": { - "type": "boolean" - }, - "mmproj": { - "type": "string" - }, - "mmproj_url": { - "type": "string" - }, - "model": { - "type": "string" - }, - "model_draft": { - "type": "string" - }, - "model_url": { - "type": "string" - }, - "model_vocoder": { - "description": "Audio/TTS params", - "type": "string" - }, - "no_cont_batching": { - "type": "boolean" - }, - "no_context_shift": { - "description": "Example-specific params", - "type": "boolean" - }, - "no_escape": { - "type": "boolean" - }, - "no_kv_offload": { - "type": "boolean" - }, - "no_mmap": { - "type": "boolean" - }, - "no_mmproj": { - "type": "boolean" - }, - "no_mmproj_offload": { - "type": "boolean" - }, - "no_perf": { - "type": "boolean" - }, - "no_prefill_assistant": { - "type": "boolean" - }, - "no_slots": { - "type": "boolean" - }, - "no_warmup": { - "type": "boolean" - }, - "no_webui": { - "type": "boolean" - }, - "numa": { - "type": "string" - }, "on_demand_start": { "description": "On demand start", "type": "boolean" }, - "override_kv": { - "type": "array", - "items": { - "type": "string" - } - }, - "override_tensor": { - "type": "array", - "items": { - "type": "string" - } - }, - "parallel": { - "type": "integer" - }, - "path": { - "type": "string" - }, - "poll": { - "type": "integer" - }, - "poll_batch": { - "type": "integer" - }, - "pooling": { - "type": "string" - }, - "port": { - "type": "integer" - }, - "predict": { - "type": "integer" - }, - "presence_penalty": { - "type": "number" - }, - "prio": { - "type": "integer" - }, - "prio_batch": { - "type": "integer" - }, - "props": { - "type": "boolean" - }, - "reasoning_budget": { - "type": "integer" - }, - "reasoning_format": { - "type": "string" - }, - "repeat_last_n": { - "type": "integer" - }, - "repeat_penalty": { - "type": "number" - }, - "reranking": { - "type": "boolean" - }, "restart_delay": { - "type": "integer" - }, - "rope_freq_base": { - "type": "number" - }, - "rope_freq_scale": { - "type": "number" - }, - "rope_scale": { - "type": "number" - }, - "rope_scaling": { - "type": "string" - }, - "samplers": { - "description": "Sampling params", - "type": "string" - }, - "sampling_seq": { - "type": "string" - }, - "seed": { - "type": "integer" - }, - "slot_prompt_similarity": { - "type": "number" - }, - "slot_save_path": { - "type": "string" - }, - 
"slots": { - "type": "boolean" - }, - "special": { - "type": "boolean" - }, - "split_mode": { - "type": "string" - }, - "spm_infill": { - "type": "boolean" - }, - "ssl_cert_file": { - "type": "string" - }, - "ssl_key_file": { - "type": "string" - }, - "temp": { - "type": "number" - }, - "tensor_split": { - "type": "string" - }, - "threads": { - "type": "integer" - }, - "threads_batch": { - "type": "integer" - }, - "threads_http": { - "type": "integer" - }, - "timeout": { - "type": "integer" - }, - "top_k": { - "type": "integer" - }, - "top_p": { - "type": "number" - }, - "tts_use_guide_tokens": { - "type": "boolean" - }, - "typical": { - "type": "number" - }, - "ubatch_size": { - "type": "integer" - }, - "verbose": { - "type": "boolean" - }, - "verbose_prompt": { - "description": "Common params", - "type": "boolean" - }, - "verbosity": { - "type": "integer" - }, - "xtc_probability": { - "type": "number" - }, - "xtc_threshold": { - "type": "number" - }, - "yarn_attn_factor": { - "type": "number" - }, - "yarn_beta_fast": { - "type": "number" - }, - "yarn_beta_slow": { - "type": "number" - }, - "yarn_ext_factor": { - "type": "number" - }, - "yarn_orig_ctx": { + "description": "seconds", "type": "integer" } } @@ -1257,6 +934,14 @@ "type": "string" } } + }, + "server.ParseCommandRequest": { + "type": "object", + "properties": { + "command": { + "type": "string" + } + } } } } \ No newline at end of file diff --git a/apidocs/swagger.yaml b/apidocs/swagger.yaml index bc6e4ec..89b53fd 100644 --- a/apidocs/swagger.yaml +++ b/apidocs/swagger.yaml @@ -1,352 +1,35 @@ basePath: /api/v1 definitions: + backends.BackendType: + enum: + - llama_cpp + - mlx_lm + - vllm + type: string + x-enum-varnames: + - BackendTypeLlamaCpp + - BackendTypeMlxLm + - BackendTypeVllm instance.CreateInstanceOptions: properties: - alias: - type: string - api_key: - type: string - api_key_file: - type: string auto_restart: description: Auto restart type: boolean - batch_size: - type: integer - cache_reuse: - type: integer - cache_type_k: - type: string - cache_type_k_draft: - type: string - cache_type_v: - type: string - cache_type_v_draft: - type: string - chat_template: - type: string - chat_template_file: - type: string - chat_template_kwargs: - type: string - check_tensors: - type: boolean - cont_batching: - type: boolean - control_vector: - items: - type: string - type: array - control_vector_layer_range: - type: string - control_vector_scaled: - items: - type: string - type: array - cpu_mask: - type: string - cpu_mask_batch: - type: string - cpu_range: - type: string - cpu_range_batch: - type: string - cpu_strict: - type: integer - cpu_strict_batch: - type: integer - ctx_size: - type: integer - ctx_size_draft: - type: integer - defrag_thold: - type: number - device: - type: string - device_draft: - type: string - draft_max: - type: integer - draft_min: - type: integer - draft_p_min: - type: number - dry_allowed_length: - type: integer - dry_base: - type: number - dry_multiplier: - type: number - dry_penalty_last_n: - type: integer - dry_sequence_breaker: - items: - type: string - type: array - dump_kv_cache: - type: boolean - dynatemp_exp: - type: number - dynatemp_range: - type: number - embd_bge_small_en_default: - description: Default model params - type: boolean - embd_e5_small_en_default: - type: boolean - embd_gte_small_default: - type: boolean - embedding: - type: boolean - escape: - type: boolean - fim_qwen_1_5b_default: - type: boolean - fim_qwen_3b_default: - type: boolean - fim_qwen_7b_default: - type: boolean - 
fim_qwen_7b_spec: - type: boolean - fim_qwen_14b_spec: - type: boolean - flash_attn: - type: boolean - frequency_penalty: - type: number - gpu_layers: - type: integer - gpu_layers_draft: - type: integer - grammar: - type: string - grammar_file: - type: string - hf_file: - type: string - hf_file_v: - type: string - hf_repo: - type: string - hf_repo_draft: - type: string - hf_repo_v: - type: string - hf_token: - type: string - host: - type: string + backend_options: + additionalProperties: {} + type: object + backend_type: + $ref: '#/definitions/backends.BackendType' idle_timeout: description: Idle timeout type: integer - ignore_eos: - type: boolean - jinja: - type: boolean - json_schema: - type: string - json_schema_file: - type: string - keep: - type: integer - log_colors: - type: boolean - log_disable: - type: boolean - log_file: - type: string - log_prefix: - type: boolean - log_timestamps: - type: boolean - logit_bias: - items: - type: string - type: array - lora: - items: - type: string - type: array - lora_init_without_apply: - type: boolean - lora_scaled: - items: - type: string - type: array - main_gpu: - type: integer max_restarts: type: integer - metrics: - type: boolean - min_p: - type: number - mirostat: - type: integer - mirostat_ent: - type: number - mirostat_lr: - type: number - mlock: - type: boolean - mmproj: - type: string - mmproj_url: - type: string - model: - type: string - model_draft: - type: string - model_url: - type: string - model_vocoder: - description: Audio/TTS params - type: string - no_cont_batching: - type: boolean - no_context_shift: - description: Example-specific params - type: boolean - no_escape: - type: boolean - no_kv_offload: - type: boolean - no_mmap: - type: boolean - no_mmproj: - type: boolean - no_mmproj_offload: - type: boolean - no_perf: - type: boolean - no_prefill_assistant: - type: boolean - no_slots: - type: boolean - no_warmup: - type: boolean - no_webui: - type: boolean - numa: - type: string on_demand_start: description: On demand start type: boolean - override_kv: - items: - type: string - type: array - override_tensor: - items: - type: string - type: array - parallel: - type: integer - path: - type: string - poll: - type: integer - poll_batch: - type: integer - pooling: - type: string - port: - type: integer - predict: - type: integer - presence_penalty: - type: number - prio: - type: integer - prio_batch: - type: integer - props: - type: boolean - reasoning_budget: - type: integer - reasoning_format: - type: string - repeat_last_n: - type: integer - repeat_penalty: - type: number - reranking: - type: boolean restart_delay: - type: integer - rope_freq_base: - type: number - rope_freq_scale: - type: number - rope_scale: - type: number - rope_scaling: - type: string - samplers: - description: Sampling params - type: string - sampling_seq: - type: string - seed: - type: integer - slot_prompt_similarity: - type: number - slot_save_path: - type: string - slots: - type: boolean - special: - type: boolean - split_mode: - type: string - spm_infill: - type: boolean - ssl_cert_file: - type: string - ssl_key_file: - type: string - temp: - type: number - tensor_split: - type: string - threads: - type: integer - threads_batch: - type: integer - threads_http: - type: integer - timeout: - type: integer - top_k: - type: integer - top_p: - type: number - tts_use_guide_tokens: - type: boolean - typical: - type: number - ubatch_size: - type: integer - verbose: - type: boolean - verbose_prompt: - description: Common params - type: boolean - verbosity: - 
type: integer - xtc_probability: - type: number - xtc_threshold: - type: number - yarn_attn_factor: - type: number - yarn_beta_fast: - type: number - yarn_beta_slow: - type: number - yarn_ext_factor: - type: number - yarn_orig_ctx: + description: seconds type: integer type: object instance.InstanceStatus: @@ -391,6 +74,11 @@ definitions: object: type: string type: object + server.ParseCommandRequest: + properties: + command: + type: string + type: object info: contact: {} description: llamactl is a control server for managing Llama Server instances. @@ -400,6 +88,102 @@ info: title: llamactl API version: "1.0" paths: + /backends/llama-cpp/parse-command: + post: + consumes: + - application/json + description: Parses a llama-server command string into instance options + parameters: + - description: Command to parse + in: body + name: request + required: true + schema: + $ref: '#/definitions/server.ParseCommandRequest' + produces: + - application/json + responses: + "200": + description: Parsed options + schema: + $ref: '#/definitions/instance.CreateInstanceOptions' + "400": + description: Invalid request or command + schema: + additionalProperties: + type: string + type: object + "500": + description: Internal Server Error + schema: + additionalProperties: + type: string + type: object + security: + - ApiKeyAuth: [] + summary: Parse llama-server command + tags: + - backends + /backends/mlx/parse-command: + post: + consumes: + - application/json + description: Parses MLX-LM server command string into instance options + parameters: + - description: Command to parse + in: body + name: request + required: true + schema: + $ref: '#/definitions/server.ParseCommandRequest' + produces: + - application/json + responses: + "200": + description: Parsed options + schema: + $ref: '#/definitions/instance.CreateInstanceOptions' + "400": + description: Invalid request or command + schema: + additionalProperties: + type: string + type: object + security: + - ApiKeyAuth: [] + summary: Parse mlx_lm.server command + tags: + - backends + /backends/vllm/parse-command: + post: + consumes: + - application/json + description: Parses a vLLM serve command string into instance options + parameters: + - description: Command to parse + in: body + name: request + required: true + schema: + $ref: '#/definitions/server.ParseCommandRequest' + produces: + - application/json + responses: + "200": + description: Parsed options + schema: + $ref: '#/definitions/instance.CreateInstanceOptions' + "400": + description: Invalid request or command + schema: + additionalProperties: + type: string + type: object + security: + - ApiKeyAuth: [] + summary: Parse vllm serve command + tags: + - backends /instances: get: description: Returns a list of all instances managed by the server diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md index f8003ef..4100492 100644 --- a/docs/getting-started/configuration.md +++ b/docs/getting-started/configuration.md @@ -22,6 +22,7 @@ server: backends: llama_executable: llama-server # Path to llama-server executable mlx_lm_executable: mlx_lm.server # Path to mlx_lm.server executable + vllm_executable: vllm # Path to vllm executable instances: port_range: [8000, 9000] # Port range for instances @@ -94,11 +95,13 @@ server: backends: llama_executable: "llama-server" # Path to llama-server executable (default: "llama-server") mlx_lm_executable: "mlx_lm.server" # Path to mlx_lm.server executable (default: "mlx_lm.server") + vllm_executable: "vllm" # Path to vllm executable 
(default: "vllm") ``` **Environment Variables:** - `LLAMACTL_LLAMA_EXECUTABLE` - Path to llama-server executable - `LLAMACTL_MLX_LM_EXECUTABLE` - Path to mlx_lm.server executable +- `LLAMACTL_VLLM_EXECUTABLE` - Path to vllm executable ### Instance Configuration diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index a3ceae6..6f52fff 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -37,6 +37,22 @@ pip install mlx-lm Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.) +**For vLLM backend:** + +vLLM provides high-throughput distributed serving for LLMs. Install vLLM: + +```bash +# Install via pip (requires Python 3.8+, GPU required) +pip install vllm + +# Or in a virtual environment (recommended) +python -m venv vllm-env +source vllm-env/bin/activate +pip install vllm + +# For production deployments, consider container-based installation +``` + ## Installation Methods ### Option 1: Download Binary (Recommended) diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 4de1065..20d8aa8 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -29,8 +29,9 @@ You should see the Llamactl web interface. 1. Click the "Add Instance" button 2. Fill in the instance configuration: - **Name**: Give your instance a descriptive name - - **Model Path**: Path to your Llama.cpp model file - - **Additional Options**: Any extra Llama.cpp parameters + - **Backend Type**: Choose from llama.cpp, MLX, or vLLM + - **Model**: Model path or identifier for your chosen backend + - **Additional Options**: Backend-specific parameters 3. Click "Create Instance" @@ -43,17 +44,46 @@ Once created, you can: - **View logs** by clicking the logs button - **Stop** the instance when needed -## Example Configuration +## Example Configurations -Here's a basic example configuration for a Llama 2 model: +Here are basic example configurations for each backend: +**llama.cpp backend:** ```json { "name": "llama2-7b", - "model_path": "/path/to/llama-2-7b-chat.gguf", - "options": { + "backend_type": "llama_cpp", + "backend_options": { + "model": "/path/to/llama-2-7b-chat.gguf", "threads": 4, - "context_size": 2048 + "ctx_size": 2048, + "gpu_layers": 32 + } +} +``` + +**MLX backend (macOS only):** +```json +{ + "name": "mistral-mlx", + "backend_type": "mlx_lm", + "backend_options": { + "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit", + "temp": 0.7, + "max_tokens": 2048 + } +} +``` + +**vLLM backend:** +```json +{ + "name": "dialogpt-vllm", + "backend_type": "vllm", + "backend_options": { + "model": "microsoft/DialoGPT-medium", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 } } ``` @@ -66,12 +96,14 @@ You can also manage instances via the REST API: # List all instances curl http://localhost:8080/api/instances -# Create a new instance -curl -X POST http://localhost:8080/api/instances \ +# Create a new llama.cpp instance +curl -X POST http://localhost:8080/api/instances/my-model \ -H "Content-Type: application/json" \ -d '{ - "name": "my-model", - "model_path": "/path/to/model.gguf", + "backend_type": "llama_cpp", + "backend_options": { + "model": "/path/to/model.gguf" + } }' # Start an instance diff --git a/docs/user-guide/api-reference.md b/docs/user-guide/api-reference.md index 3f99e53..27189e3 100644 --- a/docs/user-guide/api-reference.md +++ b/docs/user-guide/api-reference.md @@ -170,7 +170,7 @@ POST /api/v1/instances/{name}/start 
```json { "name": "llama2-7b", - "status": "starting", + "status": "running", "created": 1705312200 } ``` @@ -191,7 +191,7 @@ POST /api/v1/instances/{name}/stop ```json { "name": "llama2-7b", - "status": "stopping", + "status": "stopped", "created": 1705312200 } ``` @@ -208,7 +208,7 @@ POST /api/v1/instances/{name}/restart ```json { "name": "llama2-7b", - "status": "restarting", + "status": "running", "created": 1705312200 } ``` @@ -316,9 +316,9 @@ The server routes requests to the appropriate instance based on the `model` fiel ## Instance Status Values -Instances can have the following status values: -- `stopped`: Instance is not running -- `running`: Instance is running and ready to accept requests +Instances can have the following status values: +- `stopped`: Instance is not running +- `running`: Instance is running and ready to accept requests - `failed`: Instance failed to start or crashed ## Error Responses @@ -401,6 +401,102 @@ curl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \ }' ``` +## Backend-Specific Endpoints + +### Parse Commands + +Llamactl provides endpoints to parse command strings from different backends into instance configuration options. + +#### Parse Llama.cpp Command + +Parse a llama-server command string into instance options. + +```http +POST /api/v1/backends/llama-cpp/parse-command +``` + +**Request Body:** +```json +{ + "command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080" +} +``` + +**Response:** +```json +{ + "backend_type": "llama_cpp", + "llama_server_options": { + "model": "/path/to/model.gguf", + "ctx_size": 2048, + "port": 8080 + } +} +``` + +#### Parse MLX-LM Command + +Parse an MLX-LM server command string into instance options. + +```http +POST /api/v1/backends/mlx/parse-command +``` + +**Request Body:** +```json +{ + "command": "mlx_lm.server --model /path/to/model --port 8080" +} +``` + +**Response:** +```json +{ + "backend_type": "mlx_lm", + "mlx_server_options": { + "model": "/path/to/model", + "port": 8080 + } +} +``` + +#### Parse vLLM Command + +Parse a vLLM serve command string into instance options. + +```http +POST /api/v1/backends/vllm/parse-command +``` + +**Request Body:** +```json +{ + "command": "vllm serve /path/to/model --port 8080" +} +``` + +**Response:** +```json +{ + "backend_type": "vllm", + "vllm_server_options": { + "model": "/path/to/model", + "port": 8080 + } +} +``` + +**Error Responses for Parse Commands:** +- `400 Bad Request`: Invalid request body, empty command, or parse error +- `500 Internal Server Error`: Encoding error + +## Auto-Generated Documentation + +The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation: + +1. Install the swag tool: `go install github.com/swaggo/swag/cmd/swag@latest` +2. Generate docs: `swag init -g cmd/server/main.go -o apidocs` + ## Swagger Documentation If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at: diff --git a/docs/user-guide/managing-instances.md b/docs/user-guide/managing-instances.md index 186670c..e094d42 100644 --- a/docs/user-guide/managing-instances.md +++ b/docs/user-guide/managing-instances.md @@ -1,6 +1,6 @@ # Managing Instances -Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API. +Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API. 
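The parse-command endpoints documented in the API reference above offer a quick way to bootstrap an instance configuration from a command you already run by hand. A hedged sketch, assuming the default port 8080 and a placeholder management API key (`your-key`):

```bash
# Parse an existing vllm serve command into instance options
curl -X POST http://localhost:8080/api/v1/backends/vllm/parse-command \
  -H "Authorization: Bearer your-key" \
  -H "Content-Type: application/json" \
  -d '{"command": "vllm serve microsoft/DialoGPT-medium --tensor-parallel-size 2"}'

# The response contains the parsed backend type and options, which can be
# reused in the instance creation requests shown below.
```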
## Overview @@ -42,9 +42,11 @@ Each instance is displayed as a card showing: 3. **Choose Backend Type**: - **llama.cpp**: For GGUF models using llama-server - **MLX**: For MLX-optimized models (macOS only) + - **vLLM**: For distributed serving and high-throughput inference 4. Configure model source: - **For llama.cpp**: GGUF model path or HuggingFace repo - **For MLX**: MLX model path or identifier (e.g., `mlx-community/Mistral-7B-Instruct-v0.3-4bit`) + - **For vLLM**: HuggingFace model identifier (e.g., `microsoft/DialoGPT-medium`) 5. Configure optional instance management settings: - **Auto Restart**: Automatically restart instance on failure - **Max Restarts**: Maximum number of restart attempts @@ -54,6 +56,7 @@ Each instance is displayed as a card showing: 6. Configure backend-specific options: - **llama.cpp**: Threads, context size, GPU layers, port, etc. - **MLX**: Temperature, top-p, adapter path, Python environment, etc. + - **vLLM**: Tensor parallel size, GPU memory utilization, quantization, etc. 7. Click **"Create"** to save the instance ### Via API @@ -87,6 +90,20 @@ curl -X POST http://localhost:8080/api/instances/my-mlx-instance \ "max_restarts": 3 }' +# Create vLLM instance +curl -X POST http://localhost:8080/api/instances/my-vllm-instance \ + -H "Content-Type: application/json" \ + -d '{ + "backend_type": "vllm", + "backend_options": { + "model": "microsoft/DialoGPT-medium", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 + }, + "auto_restart": true, + "on_demand_start": true + }' + # Create llama.cpp instance with HuggingFace model curl -X POST http://localhost:8080/api/instances/gemma-3-27b \ -H "Content-Type: application/json" \ @@ -179,16 +196,17 @@ curl -X DELETE http://localhost:8080/api/instances/{name} ## Instance Proxy -Llamactl proxies all requests to the underlying backend instances (llama-server or MLX). +Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM). ```bash # Get instance details curl http://localhost:8080/api/instances/{name}/proxy/ ``` -Both backends provide OpenAI-compatible endpoints. Check the respective documentation: +All backends provide OpenAI-compatible endpoints. 
Check the respective documentation: - [llama-server docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) - [MLX-LM docs](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md) +- [vLLM docs](https://docs.vllm.ai/en/latest/) ### Instance Health diff --git a/pkg/backends/backend.go b/pkg/backends/backend.go index 0270945..802fec2 100644 --- a/pkg/backends/backend.go +++ b/pkg/backends/backend.go @@ -5,5 +5,6 @@ type BackendType string const ( BackendTypeLlamaCpp BackendType = "llama_cpp" BackendTypeMlxLm BackendType = "mlx_lm" + BackendTypeVllm BackendType = "vllm" // BackendTypeMlxVlm BackendType = "mlx_vlm" // Future expansion ) diff --git a/pkg/backends/builder.go b/pkg/backends/builder.go new file mode 100644 index 0000000..23c3bb1 --- /dev/null +++ b/pkg/backends/builder.go @@ -0,0 +1,70 @@ +package backends + +import ( + "reflect" + "strconv" + "strings" +) + +// BuildCommandArgs converts a struct to command line arguments +func BuildCommandArgs(options any, multipleFlags map[string]bool) []string { + var args []string + + v := reflect.ValueOf(options).Elem() + t := v.Type() + + for i := 0; i < v.NumField(); i++ { + field := v.Field(i) + fieldType := t.Field(i) + + if !field.CanInterface() { + continue + } + + jsonTag := fieldType.Tag.Get("json") + if jsonTag == "" || jsonTag == "-" { + continue + } + + // Get flag name from JSON tag + flagName := strings.Split(jsonTag, ",")[0] + flagName = strings.ReplaceAll(flagName, "_", "-") + + switch field.Kind() { + case reflect.Bool: + if field.Bool() { + args = append(args, "--"+flagName) + } + case reflect.Int: + if field.Int() != 0 { + args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10)) + } + case reflect.Float64: + if field.Float() != 0 { + args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64)) + } + case reflect.String: + if field.String() != "" { + args = append(args, "--"+flagName, field.String()) + } + case reflect.Slice: + if field.Type().Elem().Kind() == reflect.String && field.Len() > 0 { + if multipleFlags[flagName] { + // Multiple flags: --flag value1 --flag value2 + for j := 0; j < field.Len(); j++ { + args = append(args, "--"+flagName, field.Index(j).String()) + } + } else { + // Comma-separated: --flag value1,value2 + var values []string + for j := 0; j < field.Len(); j++ { + values = append(values, field.Index(j).String()) + } + args = append(args, "--"+flagName, strings.Join(values, ",")) + } + } + } + } + + return args +} diff --git a/pkg/backends/llamacpp/llama.go b/pkg/backends/llamacpp/llama.go index c838141..f2a7d31 100644 --- a/pkg/backends/llamacpp/llama.go +++ b/pkg/backends/llamacpp/llama.go @@ -2,9 +2,9 @@ package llamacpp import ( "encoding/json" + "llamactl/pkg/backends" "reflect" "strconv" - "strings" ) type LlamaServerOptions struct { @@ -315,62 +315,44 @@ func (o *LlamaServerOptions) UnmarshalJSON(data []byte) error { // BuildCommandArgs converts InstanceOptions to command line arguments func (o *LlamaServerOptions) BuildCommandArgs() []string { - var args []string + // Llama uses multiple flags for arrays by default (not comma-separated) + multipleFlags := map[string]bool{ + "override-tensor": true, + "override-kv": true, + "lora": true, + "lora-scaled": true, + "control-vector": true, + "control-vector-scaled": true, + "dry-sequence-breaker": true, + "logit-bias": true, + } + return backends.BuildCommandArgs(o, multipleFlags) +} - v := reflect.ValueOf(o).Elem() - t := v.Type() - - for i := 0; i < v.NumField(); i++ { - field := 
v.Field(i) - fieldType := t.Field(i) - - // Skip unexported fields - if !field.CanInterface() { - continue - } - - // Get the JSON tag to determine the flag name - jsonTag := fieldType.Tag.Get("json") - if jsonTag == "" || jsonTag == "-" { - continue - } - - // Remove ",omitempty" from the tag - flagName := jsonTag - if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 { - flagName = jsonTag[:commaIndex] - } - - // Convert snake_case to kebab-case for CLI flags - flagName = strings.ReplaceAll(flagName, "_", "-") - - // Add the appropriate arguments based on field type and value - switch field.Kind() { - case reflect.Bool: - if field.Bool() { - args = append(args, "--"+flagName) - } - case reflect.Int: - if field.Int() != 0 { - args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10)) - } - case reflect.Float64: - if field.Float() != 0 { - args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64)) - } - case reflect.String: - if field.String() != "" { - args = append(args, "--"+flagName, field.String()) - } - case reflect.Slice: - if field.Type().Elem().Kind() == reflect.String { - // Handle []string fields - for j := 0; j < field.Len(); j++ { - args = append(args, "--"+flagName, field.Index(j).String()) - } - } - } +// ParseLlamaCommand parses a llama-server command string into LlamaServerOptions +// Supports multiple formats: +// 1. Full command: "llama-server --model file.gguf" +// 2. Full path: "/usr/local/bin/llama-server --model file.gguf" +// 3. Args only: "--model file.gguf --gpu-layers 32" +// 4. Multiline commands with backslashes +func ParseLlamaCommand(command string) (*LlamaServerOptions, error) { + executableNames := []string{"llama-server"} + var subcommandNames []string // Llama has no subcommands + multiValuedFlags := map[string]bool{ + "override_tensor": true, + "override_kv": true, + "lora": true, + "lora_scaled": true, + "control_vector": true, + "control_vector_scaled": true, + "dry_sequence_breaker": true, + "logit_bias": true, } - return args + var llamaOptions LlamaServerOptions + if err := backends.ParseCommand(command, executableNames, subcommandNames, multiValuedFlags, &llamaOptions); err != nil { + return nil, err + } + + return &llamaOptions, nil } diff --git a/pkg/backends/llamacpp/llama_test.go b/pkg/backends/llamacpp/llama_test.go index 9c1162e..c779320 100644 --- a/pkg/backends/llamacpp/llama_test.go +++ b/pkg/backends/llamacpp/llama_test.go @@ -378,6 +378,121 @@ func TestUnmarshalJSON_ArrayFields(t *testing.T) { } } +func TestParseLlamaCommand(t *testing.T) { + tests := []struct { + name string + command string + expectErr bool + }{ + { + name: "basic command", + command: "llama-server --model /path/to/model.gguf --gpu-layers 32", + expectErr: false, + }, + { + name: "args only", + command: "--model /path/to/model.gguf --ctx-size 4096", + expectErr: false, + }, + { + name: "mixed flag formats", + command: "llama-server --model=/path/model.gguf --gpu-layers 16 --verbose", + expectErr: false, + }, + { + name: "quoted strings", + command: `llama-server --model test.gguf --api-key "sk-1234567890abcdef"`, + expectErr: false, + }, + { + name: "empty command", + command: "", + expectErr: true, + }, + { + name: "unterminated quote", + command: `llama-server --model test.gguf --api-key "unterminated`, + expectErr: true, + }, + { + name: "malformed flag", + command: "llama-server ---model test.gguf", + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := 
llamacpp.ParseLlamaCommand(tt.command) + + if tt.expectErr { + if err == nil { + t.Errorf("expected error but got none") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result == nil { + t.Errorf("expected result but got nil") + } + }) + } +} + +func TestParseLlamaCommandValues(t *testing.T) { + command := "llama-server --model /test/model.gguf --gpu-layers 32 --temp 0.7 --verbose --no-mmap" + result, err := llamacpp.ParseLlamaCommand(command) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if result.Model != "/test/model.gguf" { + t.Errorf("expected model '/test/model.gguf', got '%s'", result.Model) + } + + if result.GPULayers != 32 { + t.Errorf("expected gpu_layers 32, got %d", result.GPULayers) + } + + if result.Temperature != 0.7 { + t.Errorf("expected temperature 0.7, got %f", result.Temperature) + } + + if !result.Verbose { + t.Errorf("expected verbose to be true") + } + + if !result.NoMmap { + t.Errorf("expected no_mmap to be true") + } +} + +func TestParseLlamaCommandArrays(t *testing.T) { + command := "llama-server --model test.gguf --lora adapter1.bin --lora=adapter2.bin" + result, err := llamacpp.ParseLlamaCommand(command) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.Lora) != 2 { + t.Errorf("expected 2 lora adapters, got %d", len(result.Lora)) + } + + expected := []string{"adapter1.bin", "adapter2.bin"} + for i, v := range expected { + if result.Lora[i] != v { + t.Errorf("expected lora[%d]=%s got %s", i, v, result.Lora[i]) + } + } +} + // Helper functions func contains(slice []string, item string) bool { return slices.Contains(slice, item) diff --git a/pkg/backends/llamacpp/parser.go b/pkg/backends/llamacpp/parser.go deleted file mode 100644 index d94b0ed..0000000 --- a/pkg/backends/llamacpp/parser.go +++ /dev/null @@ -1,286 +0,0 @@ -package llamacpp - -import ( - "encoding/json" - "errors" - "fmt" - "path/filepath" - "regexp" - "strconv" - "strings" -) - -// ParseLlamaCommand parses a llama-server command string into LlamaServerOptions -// Supports multiple formats: -// 1. Full command: "llama-server --model file.gguf" -// 2. Full path: "/usr/local/bin/llama-server --model file.gguf" -// 3. Args only: "--model file.gguf --gpu-layers 32" -// 4. Multiline commands with backslashes -func ParseLlamaCommand(command string) (*LlamaServerOptions, error) { - // 1. Normalize the command - handle multiline with backslashes - trimmed := normalizeMultilineCommand(command) - if trimmed == "" { - return nil, fmt.Errorf("command cannot be empty") - } - - // 2. Extract arguments from command - args, err := extractArgumentsFromCommand(trimmed) - if err != nil { - return nil, err - } - - // 3. Parse arguments into map - options := make(map[string]any) - - // Known multi-valued flags (snake_case form) - multiValued := map[string]struct{}{ - "override_tensor": {}, - "override_kv": {}, - "lora": {}, - "lora_scaled": {}, - "control_vector": {}, - "control_vector_scaled": {}, - "dry_sequence_breaker": {}, - "logit_bias": {}, - } - - i := 0 - for i < len(args) { - arg := args[i] - - if !strings.HasPrefix(arg, "-") { // skip positional / stray values - i++ - continue - } - - // Reject malformed flags with more than two leading dashes (e.g. 
---model) to surface user mistakes - if strings.HasPrefix(arg, "---") { - return nil, fmt.Errorf("malformed flag: %s", arg) - } - - // Unified parsing for --flag=value vs --flag value - var rawFlag, rawValue string - hasEquals := false - if strings.Contains(arg, "=") { - parts := strings.SplitN(arg, "=", 2) - rawFlag = parts[0] - rawValue = parts[1] // may be empty string - hasEquals = true - } else { - rawFlag = arg - } - - flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-") - flagName := strings.ReplaceAll(flagCore, "-", "_") - - // Detect value if not in equals form - valueProvided := hasEquals - if !hasEquals { - if i+1 < len(args) && !isFlag(args[i+1]) { // next token is value - rawValue = args[i+1] - valueProvided = true - } - } - - // Determine if multi-valued flag - _, isMulti := multiValued[flagName] - - // Normalization helper: ensure slice for multi-valued flags - appendValue := func(valStr string) { - if existing, ok := options[flagName]; ok { - // Existing value; ensure slice semantics for multi-valued flags or repeated occurrences - if slice, ok := existing.([]string); ok { - options[flagName] = append(slice, valStr) - return - } - // Convert scalar to slice - options[flagName] = []string{fmt.Sprintf("%v", existing), valStr} - return - } - // First value - if isMulti { - options[flagName] = []string{valStr} - } else { - // We'll parse type below for single-valued flags - options[flagName] = valStr - } - } - - if valueProvided { - // Use raw token for multi-valued flags; else allow typed parsing - appendValue(rawValue) - if !isMulti { // convert to typed value if scalar - if strVal, ok := options[flagName].(string); ok { // still scalar - options[flagName] = parseValue(strVal) - } - } - // Advance index: if we consumed a following token as value (non equals form), skip it - if !hasEquals && i+1 < len(args) && rawValue == args[i+1] { - i += 2 - } else { - i++ - } - continue - } - - // Boolean flag (no value) - options[flagName] = true - i++ - } - - // 4. Convert to LlamaServerOptions using existing UnmarshalJSON - jsonData, err := json.Marshal(options) - if err != nil { - return nil, fmt.Errorf("failed to marshal parsed options: %w", err) - } - - var llamaOptions LlamaServerOptions - if err := json.Unmarshal(jsonData, &llamaOptions); err != nil { - return nil, fmt.Errorf("failed to parse command options: %w", err) - } - - // 5. 
Return LlamaServerOptions - return &llamaOptions, nil -} - -// parseValue attempts to parse a string value into the most appropriate type -func parseValue(value string) any { - // Surrounding matching quotes (single or double) - if l := len(value); l >= 2 { - if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') { - value = value[1 : l-1] - } - } - - lower := strings.ToLower(value) - if lower == "true" { - return true - } - if lower == "false" { - return false - } - - if intVal, err := strconv.Atoi(value); err == nil { - return intVal - } - if floatVal, err := strconv.ParseFloat(value, 64); err == nil { - return floatVal - } - return value -} - -// normalizeMultilineCommand handles multiline commands with backslashes -func normalizeMultilineCommand(command string) string { - // Handle escaped newlines (backslash followed by newline) - re := regexp.MustCompile(`\\\s*\n\s*`) - normalized := re.ReplaceAllString(command, " ") - - // Clean up extra whitespace - re = regexp.MustCompile(`\s+`) - normalized = re.ReplaceAllString(normalized, " ") - - return strings.TrimSpace(normalized) -} - -// extractArgumentsFromCommand extracts arguments from various command formats -func extractArgumentsFromCommand(command string) ([]string, error) { - // Split command into tokens respecting quotes - tokens, err := splitCommandTokens(command) - if err != nil { - return nil, err - } - - if len(tokens) == 0 { - return nil, fmt.Errorf("no command tokens found") - } - - // Check if first token looks like an executable - firstToken := tokens[0] - - // Case 1: Full path to executable (contains path separator or ends with llama-server) - if strings.Contains(firstToken, string(filepath.Separator)) || - strings.HasSuffix(filepath.Base(firstToken), "llama-server") { - return tokens[1:], nil // Return everything except the executable - } - - // Case 2: Just "llama-server" command - if strings.ToLower(firstToken) == "llama-server" { - return tokens[1:], nil // Return everything except the command - } - - // Case 3: Arguments only (starts with a flag) - if strings.HasPrefix(firstToken, "-") { - return tokens, nil // Return all tokens as arguments - } - - // Case 4: Unknown format - might be a different executable name - // Be permissive and assume it's the executable - return tokens[1:], nil -} - -// splitCommandTokens splits a command string into tokens, respecting quotes -func splitCommandTokens(command string) ([]string, error) { - var tokens []string - var current strings.Builder - inQuotes := false - quoteChar := byte(0) - escaped := false - - for i := 0; i < len(command); i++ { - c := command[i] - - if escaped { - current.WriteByte(c) - escaped = false - continue - } - - if c == '\\' { - escaped = true - current.WriteByte(c) - continue - } - - if !inQuotes && (c == '"' || c == '\'') { - inQuotes = true - quoteChar = c - current.WriteByte(c) - } else if inQuotes && c == quoteChar { - inQuotes = false - quoteChar = 0 - current.WriteByte(c) - } else if !inQuotes && (c == ' ' || c == '\t') { - if current.Len() > 0 { - tokens = append(tokens, current.String()) - current.Reset() - } - } else { - current.WriteByte(c) - } - } - - if inQuotes { - return nil, errors.New("unterminated quoted string") - } - - if current.Len() > 0 { - tokens = append(tokens, current.String()) - } - - return tokens, nil -} - -// isFlag determines if a string is a command line flag or a value -// Handles the special case where negative numbers should be treated as values, not flags -func isFlag(arg string) bool { - 
if !strings.HasPrefix(arg, "-") { - return false - } - - // Special case: if it's a negative number, treat it as a value - if _, err := strconv.ParseFloat(arg, 64); err == nil { - return false - } - - return true -} diff --git a/pkg/backends/llamacpp/parser_test.go b/pkg/backends/llamacpp/parser_test.go deleted file mode 100644 index 60e6a19..0000000 --- a/pkg/backends/llamacpp/parser_test.go +++ /dev/null @@ -1,413 +0,0 @@ -package llamacpp - -import ( - "testing" -) - -func TestParseLlamaCommand(t *testing.T) { - tests := []struct { - name string - command string - expectErr bool - }{ - { - name: "basic command with model", - command: "llama-server --model /path/to/model.gguf", - expectErr: false, - }, - { - name: "command with multiple flags", - command: "llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096", - expectErr: false, - }, - { - name: "command with short flags", - command: "llama-server -m /path/to/model.gguf -ngl 32 -c 4096", - expectErr: false, - }, - { - name: "command with equals format", - command: "llama-server --model=/path/to/model.gguf --gpu-layers=32", - expectErr: false, - }, - { - name: "command with boolean flags", - command: "llama-server --model /path/to/model.gguf --verbose --no-mmap", - expectErr: false, - }, - { - name: "empty command", - command: "", - expectErr: true, - }, - { - name: "case insensitive command", - command: "LLAMA-SERVER --model /path/to/model.gguf", - expectErr: false, - }, - // New test cases for improved functionality - { - name: "args only without llama-server", - command: "--model /path/to/model.gguf --gpu-layers 32", - expectErr: false, - }, - { - name: "full path to executable", - command: "/usr/local/bin/llama-server --model /path/to/model.gguf", - expectErr: false, - }, - { - name: "negative number handling", - command: "llama-server --gpu-layers -1 --model test.gguf", - expectErr: false, - }, - { - name: "multiline command with backslashes", - command: "llama-server --model /path/to/model.gguf \\\n --ctx-size 4096 \\\n --batch-size 512", - expectErr: false, - }, - { - name: "quoted string with special characters", - command: `llama-server --model test.gguf --chat-template "{% for message in messages %}{{ message.role }}: {{ message.content }}\n{% endfor %}"`, - expectErr: false, - }, - { - name: "unterminated quoted string", - command: `llama-server --model test.gguf --chat-template "unterminated quote`, - expectErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := ParseLlamaCommand(tt.command) - - if tt.expectErr { - if err == nil { - t.Errorf("expected error but got none") - } - return - } - - if err != nil { - t.Errorf("unexpected error: %v", err) - return - } - - if result == nil { - t.Errorf("expected result but got nil") - return - } - }) - } -} - -func TestParseLlamaCommandSpecificValues(t *testing.T) { - // Test specific value parsing - command := "llama-server --model /test/model.gguf --gpu-layers 32 --ctx-size 4096 --verbose" - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.Model != "/test/model.gguf" { - t.Errorf("expected model '/test/model.gguf', got '%s'", result.Model) - } - - if result.GPULayers != 32 { - t.Errorf("expected gpu_layers 32, got %d", result.GPULayers) - } - - if result.CtxSize != 4096 { - t.Errorf("expected ctx_size 4096, got %d", result.CtxSize) - } - - if !result.Verbose { - t.Errorf("expected verbose to be true, got %v", result.Verbose) - } -} - -func 
TestParseLlamaCommandArrayFlags(t *testing.T) { - // Test array flag handling (critical for lora, override-tensor, etc.) - command := "llama-server --model test.gguf --lora adapter1.bin --lora adapter2.bin" - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if len(result.Lora) != 2 { - t.Errorf("expected 2 lora adapters, got %d", len(result.Lora)) - } - - if result.Lora[0] != "adapter1.bin" || result.Lora[1] != "adapter2.bin" { - t.Errorf("expected lora adapters [adapter1.bin, adapter2.bin], got %v", result.Lora) - } -} - -func TestParseLlamaCommandMixedFormats(t *testing.T) { - // Test mixing --flag=value and --flag value formats - command := "llama-server --model=/path/model.gguf --gpu-layers 16 --batch-size=512 --verbose" - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.Model != "/path/model.gguf" { - t.Errorf("expected model '/path/model.gguf', got '%s'", result.Model) - } - - if result.GPULayers != 16 { - t.Errorf("expected gpu_layers 16, got %d", result.GPULayers) - } - - if result.BatchSize != 512 { - t.Errorf("expected batch_size 512, got %d", result.BatchSize) - } - - if !result.Verbose { - t.Errorf("expected verbose to be true, got %v", result.Verbose) - } -} - -func TestParseLlamaCommandTypeConversion(t *testing.T) { - // Test that values are converted to appropriate types - command := "llama-server --model test.gguf --temp 0.7 --top-k 40 --no-mmap" - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.Temperature != 0.7 { - t.Errorf("expected temperature 0.7, got %f", result.Temperature) - } - - if result.TopK != 40 { - t.Errorf("expected top_k 40, got %d", result.TopK) - } - - if !result.NoMmap { - t.Errorf("expected no_mmap to be true, got %v", result.NoMmap) - } -} - -func TestParseLlamaCommandArgsOnly(t *testing.T) { - // Test parsing arguments without llama-server command - command := "--model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096" - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.Model != "/path/to/model.gguf" { - t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model) - } - - if result.GPULayers != 32 { - t.Errorf("expected gpu_layers 32, got %d", result.GPULayers) - } - - if result.CtxSize != 4096 { - t.Errorf("expected ctx_size 4096, got %d", result.CtxSize) - } -} - -func TestParseLlamaCommandFullPath(t *testing.T) { - // Test full path to executable - command := "/usr/local/bin/llama-server --model test.gguf --gpu-layers 16" - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.Model != "test.gguf" { - t.Errorf("expected model 'test.gguf', got '%s'", result.Model) - } - - if result.GPULayers != 16 { - t.Errorf("expected gpu_layers 16, got %d", result.GPULayers) - } -} - -func TestParseLlamaCommandNegativeNumbers(t *testing.T) { - // Test negative number parsing - command := "llama-server --model test.gguf --gpu-layers -1 --seed -12345" - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.GPULayers != -1 { - t.Errorf("expected gpu_layers -1, got %d", result.GPULayers) - } - - if result.Seed != -12345 { - t.Errorf("expected seed -12345, got %d", result.Seed) - } -} - -func TestParseLlamaCommandMultiline(t *testing.T) { - // Test 
multiline command with backslashes - command := `llama-server --model /path/to/model.gguf \ - --ctx-size 4096 \ - --batch-size 512 \ - --gpu-layers 32` - - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.Model != "/path/to/model.gguf" { - t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model) - } - - if result.CtxSize != 4096 { - t.Errorf("expected ctx_size 4096, got %d", result.CtxSize) - } - - if result.BatchSize != 512 { - t.Errorf("expected batch_size 512, got %d", result.BatchSize) - } - - if result.GPULayers != 32 { - t.Errorf("expected gpu_layers 32, got %d", result.GPULayers) - } -} - -func TestParseLlamaCommandQuotedStrings(t *testing.T) { - // Test quoted strings with special characters - command := `llama-server --model test.gguf --api-key "sk-1234567890abcdef" --chat-template "User: {user}\nAssistant: "` - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result.Model != "test.gguf" { - t.Errorf("expected model 'test.gguf', got '%s'", result.Model) - } - - if result.APIKey != "sk-1234567890abcdef" { - t.Errorf("expected api_key 'sk-1234567890abcdef', got '%s'", result.APIKey) - } - - expectedTemplate := "User: {user}\\nAssistant: " - if result.ChatTemplate != expectedTemplate { - t.Errorf("expected chat_template '%s', got '%s'", expectedTemplate, result.ChatTemplate) - } -} - -func TestParseLlamaCommandUnslothExample(t *testing.T) { - // Test with realistic unsloth-style command - command := `llama-server --model /path/to/model.gguf \ - --ctx-size 4096 \ - --batch-size 512 \ - --gpu-layers -1 \ - --temp 0.7 \ - --repeat-penalty 1.1 \ - --top-k 40 \ - --top-p 0.95 \ - --host 0.0.0.0 \ - --port 8000 \ - --api-key "sk-1234567890abcdef"` - - result, err := ParseLlamaCommand(command) - - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - // Verify key fields - if result.Model != "/path/to/model.gguf" { - t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model) - } - - if result.CtxSize != 4096 { - t.Errorf("expected ctx_size 4096, got %d", result.CtxSize) - } - - if result.BatchSize != 512 { - t.Errorf("expected batch_size 512, got %d", result.BatchSize) - } - - if result.GPULayers != -1 { - t.Errorf("expected gpu_layers -1, got %d", result.GPULayers) - } - - if result.Temperature != 0.7 { - t.Errorf("expected temperature 0.7, got %f", result.Temperature) - } - - if result.RepeatPenalty != 1.1 { - t.Errorf("expected repeat_penalty 1.1, got %f", result.RepeatPenalty) - } - - if result.TopK != 40 { - t.Errorf("expected top_k 40, got %d", result.TopK) - } - - if result.TopP != 0.95 { - t.Errorf("expected top_p 0.95, got %f", result.TopP) - } - - if result.Host != "0.0.0.0" { - t.Errorf("expected host '0.0.0.0', got '%s'", result.Host) - } - - if result.Port != 8000 { - t.Errorf("expected port 8000, got %d", result.Port) - } - - if result.APIKey != "sk-1234567890abcdef" { - t.Errorf("expected api_key 'sk-1234567890abcdef', got '%s'", result.APIKey) - } -} - -// Focused additional edge case tests (kept minimal per guidance) -func TestParseLlamaCommandSingleQuotedValue(t *testing.T) { - cmd := "llama-server --model 'my model.gguf' --alias 'Test Alias'" - result, err := ParseLlamaCommand(cmd) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if result.Model != "my model.gguf" { - t.Errorf("expected model 'my model.gguf', got '%s'", result.Model) - } - if result.Alias != "Test Alias" { - 
t.Errorf("expected alias 'Test Alias', got '%s'", result.Alias) - } -} - -func TestParseLlamaCommandMixedArrayForms(t *testing.T) { - // Same multi-value flag using --flag value and --flag=value forms - cmd := "llama-server --lora adapter1.bin --lora=adapter2.bin --lora adapter3.bin" - result, err := ParseLlamaCommand(cmd) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if len(result.Lora) != 3 { - t.Fatalf("expected 3 lora values, got %d (%v)", len(result.Lora), result.Lora) - } - expected := []string{"adapter1.bin", "adapter2.bin", "adapter3.bin"} - for i, v := range expected { - if result.Lora[i] != v { - t.Errorf("expected lora[%d]=%s got %s", i, v, result.Lora[i]) - } - } -} - -func TestParseLlamaCommandMalformedFlag(t *testing.T) { - cmd := "llama-server ---model test.gguf" - _, err := ParseLlamaCommand(cmd) - if err == nil { - t.Fatalf("expected error for malformed flag but got none") - } -} diff --git a/pkg/backends/mlx/mlx.go b/pkg/backends/mlx/mlx.go index c3324d2..3b83681 100644 --- a/pkg/backends/mlx/mlx.go +++ b/pkg/backends/mlx/mlx.go @@ -1,205 +1,56 @@ package mlx import ( - "encoding/json" - "reflect" - "strconv" + "llamactl/pkg/backends" ) type MlxServerOptions struct { // Basic connection options - Model string `json:"model,omitempty"` - Host string `json:"host,omitempty"` - Port int `json:"port,omitempty"` - + Model string `json:"model,omitempty"` + Host string `json:"host,omitempty"` + Port int `json:"port,omitempty"` + // Model and adapter options AdapterPath string `json:"adapter_path,omitempty"` DraftModel string `json:"draft_model,omitempty"` NumDraftTokens int `json:"num_draft_tokens,omitempty"` TrustRemoteCode bool `json:"trust_remote_code,omitempty"` - + // Logging and templates - LogLevel string `json:"log_level,omitempty"` - ChatTemplate string `json:"chat_template,omitempty"` - UseDefaultChatTemplate bool `json:"use_default_chat_template,omitempty"` - ChatTemplateArgs string `json:"chat_template_args,omitempty"` // JSON string - + LogLevel string `json:"log_level,omitempty"` + ChatTemplate string `json:"chat_template,omitempty"` + UseDefaultChatTemplate bool `json:"use_default_chat_template,omitempty"` + ChatTemplateArgs string `json:"chat_template_args,omitempty"` // JSON string + // Sampling defaults - Temp float64 `json:"temp,omitempty"` // Note: MLX uses "temp" not "temperature" - TopP float64 `json:"top_p,omitempty"` - TopK int `json:"top_k,omitempty"` - MinP float64 `json:"min_p,omitempty"` - MaxTokens int `json:"max_tokens,omitempty"` -} - -// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names -func (o *MlxServerOptions) UnmarshalJSON(data []byte) error { - // First unmarshal into a map to handle multiple field names - var raw map[string]any - if err := json.Unmarshal(data, &raw); err != nil { - return err - } - - // Create a temporary struct for standard unmarshaling - type tempOptions MlxServerOptions - temp := tempOptions{} - - // Standard unmarshal first - if err := json.Unmarshal(data, &temp); err != nil { - return err - } - - // Copy to our struct - *o = MlxServerOptions(temp) - - // Handle alternative field names - fieldMappings := map[string]string{ - // Basic connection options - "m": "model", - "host": "host", - "port": "port", -// "python_path": "python_path", // removed - - // Model and adapter options - "adapter-path": "adapter_path", - "draft-model": "draft_model", - "num-draft-tokens": "num_draft_tokens", - "trust-remote-code": "trust_remote_code", - - // Logging and templates - "log-level": 
"log_level", - "chat-template": "chat_template", - "use-default-chat-template": "use_default_chat_template", - "chat-template-args": "chat_template_args", - - // Sampling defaults - "temperature": "temp", // Support both temp and temperature - "top-p": "top_p", - "top-k": "top_k", - "min-p": "min_p", - "max-tokens": "max_tokens", - } - - // Process alternative field names - for altName, canonicalName := range fieldMappings { - if value, exists := raw[altName]; exists { - // Use reflection to set the field value - v := reflect.ValueOf(o).Elem() - field := v.FieldByNameFunc(func(fieldName string) bool { - field, _ := v.Type().FieldByName(fieldName) - jsonTag := field.Tag.Get("json") - return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName - }) - - if field.IsValid() && field.CanSet() { - switch field.Kind() { - case reflect.Int: - if intVal, ok := value.(float64); ok { - field.SetInt(int64(intVal)) - } else if strVal, ok := value.(string); ok { - if intVal, err := strconv.Atoi(strVal); err == nil { - field.SetInt(int64(intVal)) - } - } - case reflect.Float64: - if floatVal, ok := value.(float64); ok { - field.SetFloat(floatVal) - } else if strVal, ok := value.(string); ok { - if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil { - field.SetFloat(floatVal) - } - } - case reflect.String: - if strVal, ok := value.(string); ok { - field.SetString(strVal) - } - case reflect.Bool: - if boolVal, ok := value.(bool); ok { - field.SetBool(boolVal) - } - } - } - } - } - - return nil -} - -// NewMlxServerOptions creates MlxServerOptions with MLX defaults -func NewMlxServerOptions() *MlxServerOptions { - return &MlxServerOptions{ - Host: "127.0.0.1", // MLX default (different from llama-server) - Port: 8080, // MLX default - NumDraftTokens: 3, // MLX default for speculative decoding - LogLevel: "INFO", // MLX default - Temp: 0.0, // MLX default - TopP: 1.0, // MLX default - TopK: 0, // MLX default (disabled) - MinP: 0.0, // MLX default (disabled) - MaxTokens: 512, // MLX default - ChatTemplateArgs: "{}", // MLX default (empty JSON object) - } + Temp float64 `json:"temp,omitempty"` + TopP float64 `json:"top_p,omitempty"` + TopK int `json:"top_k,omitempty"` + MinP float64 `json:"min_p,omitempty"` + MaxTokens int `json:"max_tokens,omitempty"` } // BuildCommandArgs converts to command line arguments func (o *MlxServerOptions) BuildCommandArgs() []string { - var args []string - - // Required and basic options - if o.Model != "" { - args = append(args, "--model", o.Model) + multipleFlags := map[string]bool{} // MLX doesn't currently have []string fields + return backends.BuildCommandArgs(o, multipleFlags) +} + +// ParseMlxCommand parses a mlx_lm.server command string into MlxServerOptions +// Supports multiple formats: +// 1. Full command: "mlx_lm.server --model model/path" +// 2. Full path: "/usr/local/bin/mlx_lm.server --model model/path" +// 3. Args only: "--model model/path --host 0.0.0.0" +// 4. 
Multiline commands with backslashes +func ParseMlxCommand(command string) (*MlxServerOptions, error) { + executableNames := []string{"mlx_lm.server"} + var subcommandNames []string // MLX has no subcommands + multiValuedFlags := map[string]bool{} // MLX has no multi-valued flags + + var mlxOptions MlxServerOptions + if err := backends.ParseCommand(command, executableNames, subcommandNames, multiValuedFlags, &mlxOptions); err != nil { + return nil, err } - if o.Host != "" { - args = append(args, "--host", o.Host) - } - if o.Port != 0 { - args = append(args, "--port", strconv.Itoa(o.Port)) - } - - // Model and adapter options - if o.AdapterPath != "" { - args = append(args, "--adapter-path", o.AdapterPath) - } - if o.DraftModel != "" { - args = append(args, "--draft-model", o.DraftModel) - } - if o.NumDraftTokens != 0 { - args = append(args, "--num-draft-tokens", strconv.Itoa(o.NumDraftTokens)) - } - if o.TrustRemoteCode { - args = append(args, "--trust-remote-code") - } - - // Logging and templates - if o.LogLevel != "" { - args = append(args, "--log-level", o.LogLevel) - } - if o.ChatTemplate != "" { - args = append(args, "--chat-template", o.ChatTemplate) - } - if o.UseDefaultChatTemplate { - args = append(args, "--use-default-chat-template") - } - if o.ChatTemplateArgs != "" { - args = append(args, "--chat-template-args", o.ChatTemplateArgs) - } - - // Sampling defaults - if o.Temp != 0 { - args = append(args, "--temp", strconv.FormatFloat(o.Temp, 'f', -1, 64)) - } - if o.TopP != 0 { - args = append(args, "--top-p", strconv.FormatFloat(o.TopP, 'f', -1, 64)) - } - if o.TopK != 0 { - args = append(args, "--top-k", strconv.Itoa(o.TopK)) - } - if o.MinP != 0 { - args = append(args, "--min-p", strconv.FormatFloat(o.MinP, 'f', -1, 64)) - } - if o.MaxTokens != 0 { - args = append(args, "--max-tokens", strconv.Itoa(o.MaxTokens)) - } - - return args -} \ No newline at end of file + + return &mlxOptions, nil +} diff --git a/pkg/backends/mlx/mlx_test.go b/pkg/backends/mlx/mlx_test.go new file mode 100644 index 0000000..8baeb5c --- /dev/null +++ b/pkg/backends/mlx/mlx_test.go @@ -0,0 +1,157 @@ +package mlx_test + +import ( + "llamactl/pkg/backends/mlx" + "testing" +) + +func TestParseMlxCommand(t *testing.T) { + tests := []struct { + name string + command string + expectErr bool + }{ + { + name: "basic command", + command: "mlx_lm.server --model /path/to/model --host 0.0.0.0", + expectErr: false, + }, + { + name: "args only", + command: "--model /path/to/model --port 8080", + expectErr: false, + }, + { + name: "mixed flag formats", + command: "mlx_lm.server --model=/path/model --temp=0.7 --trust-remote-code", + expectErr: false, + }, + { + name: "quoted strings", + command: `mlx_lm.server --model test.mlx --chat-template "User: {user}\nAssistant: "`, + expectErr: false, + }, + { + name: "empty command", + command: "", + expectErr: true, + }, + { + name: "unterminated quote", + command: `mlx_lm.server --model test.mlx --chat-template "unterminated`, + expectErr: true, + }, + { + name: "malformed flag", + command: "mlx_lm.server ---model test.mlx", + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := mlx.ParseMlxCommand(tt.command) + + if tt.expectErr { + if err == nil { + t.Errorf("expected error but got none") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result == nil { + t.Errorf("expected result but got nil") + } + }) + } +} + +func TestParseMlxCommandValues(t *testing.T) { + command := 
"mlx_lm.server --model /test/model.mlx --port 8080 --temp 0.7 --trust-remote-code --log-level DEBUG" + result, err := mlx.ParseMlxCommand(command) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if result.Model != "/test/model.mlx" { + t.Errorf("expected model '/test/model.mlx', got '%s'", result.Model) + } + + if result.Port != 8080 { + t.Errorf("expected port 8080, got %d", result.Port) + } + + if result.Temp != 0.7 { + t.Errorf("expected temp 0.7, got %f", result.Temp) + } + + if !result.TrustRemoteCode { + t.Errorf("expected trust_remote_code to be true") + } + + if result.LogLevel != "DEBUG" { + t.Errorf("expected log_level 'DEBUG', got '%s'", result.LogLevel) + } +} + +func TestBuildCommandArgs(t *testing.T) { + options := &mlx.MlxServerOptions{ + Model: "/test/model.mlx", + Host: "127.0.0.1", + Port: 8080, + Temp: 0.7, + TopP: 0.9, + TopK: 40, + MaxTokens: 2048, + TrustRemoteCode: true, + LogLevel: "DEBUG", + ChatTemplate: "custom template", + } + + args := options.BuildCommandArgs() + + // Check that all expected flags are present + expectedFlags := map[string]string{ + "--model": "/test/model.mlx", + "--host": "127.0.0.1", + "--port": "8080", + "--log-level": "DEBUG", + "--chat-template": "custom template", + "--temp": "0.7", + "--top-p": "0.9", + "--top-k": "40", + "--max-tokens": "2048", + } + + for i := 0; i < len(args); i++ { + if args[i] == "--trust-remote-code" { + continue // Boolean flag with no value + } + if args[i] == "--use-default-chat-template" { + continue // Boolean flag with no value + } + + if expectedValue, exists := expectedFlags[args[i]]; exists && i+1 < len(args) { + if args[i+1] != expectedValue { + t.Errorf("expected %s to have value %s, got %s", args[i], expectedValue, args[i+1]) + } + } + } + + // Check boolean flags + foundTrustRemoteCode := false + for _, arg := range args { + if arg == "--trust-remote-code" { + foundTrustRemoteCode = true + } + } + if !foundTrustRemoteCode { + t.Errorf("expected --trust-remote-code flag to be present") + } +} diff --git a/pkg/backends/mlx/parser.go b/pkg/backends/mlx/parser.go deleted file mode 100644 index 96b04a9..0000000 --- a/pkg/backends/mlx/parser.go +++ /dev/null @@ -1,254 +0,0 @@ -package mlx - -import ( - "encoding/json" - "fmt" - "path/filepath" - "regexp" - "strconv" - "strings" -) - -// ParseMlxCommand parses a mlx_lm.server command string into MlxServerOptions -// Supports multiple formats: -// 1. Full command: "mlx_lm.server --model model/path" -// 2. Full path: "/usr/local/bin/mlx_lm.server --model model/path" -// 3. Args only: "--model model/path --host 0.0.0.0" -// 4. Multiline commands with backslashes -func ParseMlxCommand(command string) (*MlxServerOptions, error) { - // 1. Normalize the command - handle multiline with backslashes - trimmed := normalizeMultilineCommand(command) - if trimmed == "" { - return nil, fmt.Errorf("command cannot be empty") - } - - // 2. Extract arguments from command - args, err := extractArgumentsFromCommand(trimmed) - if err != nil { - return nil, err - } - - // 3. Parse arguments into map - options := make(map[string]any) - - i := 0 - for i < len(args) { - arg := args[i] - - if !strings.HasPrefix(arg, "-") { // skip positional / stray values - i++ - continue - } - - // Reject malformed flags with more than two leading dashes (e.g. 
---model) to surface user mistakes - if strings.HasPrefix(arg, "---") { - return nil, fmt.Errorf("malformed flag: %s", arg) - } - - // Unified parsing for --flag=value vs --flag value - var rawFlag, rawValue string - hasEquals := false - if strings.Contains(arg, "=") { - parts := strings.SplitN(arg, "=", 2) - rawFlag = parts[0] - rawValue = parts[1] // may be empty string - hasEquals = true - } else { - rawFlag = arg - } - - flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-") - flagName := strings.ReplaceAll(flagCore, "-", "_") - - // Detect value if not in equals form - valueProvided := hasEquals - if !hasEquals { - if i+1 < len(args) && !isFlag(args[i+1]) { // next token is value - rawValue = args[i+1] - valueProvided = true - } - } - - if valueProvided { - // MLX-specific validation for certain flags - if flagName == "log_level" && !isValidLogLevel(rawValue) { - return nil, fmt.Errorf("invalid log level: %s", rawValue) - } - - options[flagName] = parseValue(rawValue) - - // Advance index: if we consumed a following token as value (non equals form), skip it - if !hasEquals && i+1 < len(args) && rawValue == args[i+1] { - i += 2 - } else { - i++ - } - continue - } - - // Boolean flag (no value) - MLX specific boolean flags - if flagName == "trust_remote_code" || flagName == "use_default_chat_template" { - options[flagName] = true - } else { - options[flagName] = true - } - i++ - } - - // 4. Convert to MlxServerOptions using existing UnmarshalJSON - jsonData, err := json.Marshal(options) - if err != nil { - return nil, fmt.Errorf("failed to marshal parsed options: %w", err) - } - - var mlxOptions MlxServerOptions - if err := json.Unmarshal(jsonData, &mlxOptions); err != nil { - return nil, fmt.Errorf("failed to parse command options: %w", err) - } - - // 5. 
Return MlxServerOptions - return &mlxOptions, nil -} - -// isValidLogLevel validates MLX log levels -func isValidLogLevel(level string) bool { - validLevels := []string{"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"} - for _, valid := range validLevels { - if level == valid { - return true - } - } - return false -} - -// parseValue attempts to parse a string value into the most appropriate type -func parseValue(value string) any { - // Surrounding matching quotes (single or double) - if l := len(value); l >= 2 { - if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') { - value = value[1 : l-1] - } - } - - lower := strings.ToLower(value) - if lower == "true" { - return true - } - if lower == "false" { - return false - } - - if intVal, err := strconv.Atoi(value); err == nil { - return intVal - } - if floatVal, err := strconv.ParseFloat(value, 64); err == nil { - return floatVal - } - return value -} - -// normalizeMultilineCommand handles multiline commands with backslashes -func normalizeMultilineCommand(command string) string { - // Handle escaped newlines (backslash followed by newline) - re := regexp.MustCompile(`\\\s*\n\s*`) - normalized := re.ReplaceAllString(command, " ") - - // Clean up extra whitespace - re = regexp.MustCompile(`\s+`) - normalized = re.ReplaceAllString(normalized, " ") - - return strings.TrimSpace(normalized) -} - -// extractArgumentsFromCommand extracts arguments from various command formats -func extractArgumentsFromCommand(command string) ([]string, error) { - // Split command into tokens respecting quotes - tokens, err := splitCommandTokens(command) - if err != nil { - return nil, err - } - - if len(tokens) == 0 { - return nil, fmt.Errorf("no command tokens found") - } - - // Check if first token looks like an executable - firstToken := tokens[0] - - // Case 1: Full path to executable (contains path separator or ends with mlx_lm.server) - if strings.Contains(firstToken, string(filepath.Separator)) || - strings.HasSuffix(filepath.Base(firstToken), "mlx_lm.server") { - return tokens[1:], nil // Return everything except the executable - } - - // Case 2: Just "mlx_lm.server" command - if strings.ToLower(firstToken) == "mlx_lm.server" { - return tokens[1:], nil // Return everything except the command - } - - // Case 3: Arguments only (starts with a flag) - if strings.HasPrefix(firstToken, "-") { - return tokens, nil // Return all tokens as arguments - } - - // Case 4: Unknown format - might be a different executable name - // Be permissive and assume it's the executable - return tokens[1:], nil -} - -// splitCommandTokens splits a command string into tokens, respecting quotes -func splitCommandTokens(command string) ([]string, error) { - var tokens []string - var current strings.Builder - inQuotes := false - quoteChar := byte(0) - escaped := false - - for i := 0; i < len(command); i++ { - c := command[i] - - if escaped { - current.WriteByte(c) - escaped = false - continue - } - - if c == '\\' { - escaped = true - current.WriteByte(c) - continue - } - - if !inQuotes && (c == '"' || c == '\'') { - inQuotes = true - quoteChar = c - current.WriteByte(c) - } else if inQuotes && c == quoteChar { - inQuotes = false - quoteChar = 0 - current.WriteByte(c) - } else if !inQuotes && (c == ' ' || c == '\t' || c == '\n') { - if current.Len() > 0 { - tokens = append(tokens, current.String()) - current.Reset() - } - } else { - current.WriteByte(c) - } - } - - if inQuotes { - return nil, fmt.Errorf("unclosed quote in command") - } - - if current.Len() 
> 0 { - tokens = append(tokens, current.String()) - } - - return tokens, nil -} - -// isFlag checks if a string looks like a command line flag -func isFlag(s string) bool { - return strings.HasPrefix(s, "-") -} \ No newline at end of file diff --git a/pkg/backends/parser.go b/pkg/backends/parser.go new file mode 100644 index 0000000..df585c9 --- /dev/null +++ b/pkg/backends/parser.go @@ -0,0 +1,213 @@ +package backends + +import ( + "encoding/json" + "fmt" + "path/filepath" + "regexp" + "strconv" + "strings" +) + +// ParseCommand parses a command string into a target struct +func ParseCommand(command string, executableNames []string, subcommandNames []string, multiValuedFlags map[string]bool, target any) error { + // Normalize multiline commands + command = normalizeCommand(command) + if command == "" { + return fmt.Errorf("command cannot be empty") + } + + // Extract arguments and positional model + args, modelFromPositional, err := extractArgs(command, executableNames, subcommandNames) + if err != nil { + return err + } + + // Parse flags into map + options, err := parseFlags(args, multiValuedFlags) + if err != nil { + return err + } + + // If we found a positional model and no --model flag was provided, set the model + if modelFromPositional != "" { + if _, hasModelFlag := options["model"]; !hasModelFlag { + options["model"] = modelFromPositional + } + } + + // Convert to target struct via JSON + jsonData, err := json.Marshal(options) + if err != nil { + return fmt.Errorf("failed to marshal options: %w", err) + } + + if err := json.Unmarshal(jsonData, target); err != nil { + return fmt.Errorf("failed to unmarshal to target: %w", err) + } + + return nil +} + +// normalizeCommand handles multiline commands with backslashes +func normalizeCommand(command string) string { + re := regexp.MustCompile(`\\\s*\n\s*`) + normalized := re.ReplaceAllString(command, " ") + re = regexp.MustCompile(`\s+`) + return strings.TrimSpace(re.ReplaceAllString(normalized, " ")) +} + +// extractArgs extracts arguments from command, removing executable and subcommands +// Returns: args, modelFromPositional, error +func extractArgs(command string, executableNames []string, subcommandNames []string) ([]string, string, error) { + // Check for unterminated quotes + if strings.Count(command, `"`)%2 != 0 || strings.Count(command, `'`)%2 != 0 { + return nil, "", fmt.Errorf("unterminated quoted string") + } + + tokens := strings.Fields(command) + if len(tokens) == 0 { + return nil, "", fmt.Errorf("no tokens found") + } + + // Skip executable + start := 0 + firstToken := tokens[0] + + // Check for executable name (with or without path) + if strings.Contains(firstToken, string(filepath.Separator)) { + baseName := filepath.Base(firstToken) + for _, execName := range executableNames { + if strings.HasSuffix(strings.ToLower(baseName), strings.ToLower(execName)) { + start = 1 + break + } + } + } else { + for _, execName := range executableNames { + if strings.EqualFold(firstToken, execName) { + start = 1 + break + } + } + } + + // Skip subcommand if present + if start < len(tokens) { + for _, subCmd := range subcommandNames { + if strings.EqualFold(tokens[start], subCmd) { + start++ + break + } + } + } + + // Handle case where command starts with subcommand (no executable) + if start == 0 { + for _, subCmd := range subcommandNames { + if strings.EqualFold(firstToken, subCmd) { + start = 1 + break + } + } + } + + args := tokens[start:] + + // Extract first positional argument (model) if present and not a flag + var 
modelFromPositional string + if len(args) > 0 && !strings.HasPrefix(args[0], "-") { + modelFromPositional = args[0] + args = args[1:] // Remove the model from args to process remaining flags + } + + return args, modelFromPositional, nil +} + +// parseFlags parses command line flags into a map +func parseFlags(args []string, multiValuedFlags map[string]bool) (map[string]any, error) { + options := make(map[string]any) + + for i := 0; i < len(args); i++ { + arg := args[i] + + if !strings.HasPrefix(arg, "-") { + continue + } + + // Check for malformed flags (more than two leading dashes) + if strings.HasPrefix(arg, "---") { + return nil, fmt.Errorf("malformed flag: %s", arg) + } + + // Get flag name and value + var flagName, value string + var hasValue bool + + if strings.Contains(arg, "=") { + parts := strings.SplitN(arg, "=", 2) + flagName = strings.TrimLeft(parts[0], "-") + value = parts[1] + hasValue = true + } else { + flagName = strings.TrimLeft(arg, "-") + if i+1 < len(args) && !strings.HasPrefix(args[i+1], "-") { + value = args[i+1] + hasValue = true + i++ // Skip next arg since we consumed it + } + } + + // Convert kebab-case to snake_case for JSON + flagName = strings.ReplaceAll(flagName, "-", "_") + + if hasValue { + // Handle multi-valued flags + if multiValuedFlags[flagName] { + if existing, ok := options[flagName].([]string); ok { + options[flagName] = append(existing, value) + } else { + options[flagName] = []string{value} + } + } else { + options[flagName] = parseValue(value) + } + } else { + // Boolean flag + options[flagName] = true + } + } + + return options, nil +} + +// parseValue converts string to appropriate type +func parseValue(value string) any { + // Remove quotes + if len(value) >= 2 { + if (value[0] == '"' && value[len(value)-1] == '"') || (value[0] == '\'' && value[len(value)-1] == '\'') { + value = value[1 : len(value)-1] + } + } + + // Try boolean + switch strings.ToLower(value) { + case "true": + return true + case "false": + return false + } + + // Try integer + if intVal, err := strconv.Atoi(value); err == nil { + return intVal + } + + // Try float + if floatVal, err := strconv.ParseFloat(value, 64); err == nil { + return floatVal + } + + // Return as string + return value +} diff --git a/pkg/backends/vllm/vllm.go b/pkg/backends/vllm/vllm.go new file mode 100644 index 0000000..7811c4c --- /dev/null +++ b/pkg/backends/vllm/vllm.go @@ -0,0 +1,189 @@ +package vllm + +import ( + "llamactl/pkg/backends" +) + +type VllmServerOptions struct { + // Basic connection options (auto-assigned by llamactl) + Host string `json:"host,omitempty"` + Port int `json:"port,omitempty"` + + // Model and engine configuration + Model string `json:"model,omitempty"` + Tokenizer string `json:"tokenizer,omitempty"` + SkipTokenizerInit bool `json:"skip_tokenizer_init,omitempty"` + Revision string `json:"revision,omitempty"` + CodeRevision string `json:"code_revision,omitempty"` + TokenizerRevision string `json:"tokenizer_revision,omitempty"` + TokenizerMode string `json:"tokenizer_mode,omitempty"` + TrustRemoteCode bool `json:"trust_remote_code,omitempty"` + DownloadDir string `json:"download_dir,omitempty"` + LoadFormat string `json:"load_format,omitempty"` + ConfigFormat string `json:"config_format,omitempty"` + Dtype string `json:"dtype,omitempty"` + KVCacheDtype string `json:"kv_cache_dtype,omitempty"` + QuantizationParamPath string `json:"quantization_param_path,omitempty"` + Seed int `json:"seed,omitempty"` + MaxModelLen int `json:"max_model_len,omitempty"` + GuidedDecodingBackend 
string `json:"guided_decoding_backend,omitempty"` + DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"` + WorkerUseRay bool `json:"worker_use_ray,omitempty"` + RayWorkersUseNSight bool `json:"ray_workers_use_nsight,omitempty"` + + // Performance and serving configuration + BlockSize int `json:"block_size,omitempty"` + EnablePrefixCaching bool `json:"enable_prefix_caching,omitempty"` + DisableSlidingWindow bool `json:"disable_sliding_window,omitempty"` + UseV2BlockManager bool `json:"use_v2_block_manager,omitempty"` + NumLookaheadSlots int `json:"num_lookahead_slots,omitempty"` + SwapSpace int `json:"swap_space,omitempty"` + CPUOffloadGB int `json:"cpu_offload_gb,omitempty"` + GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"` + NumGPUBlocksOverride int `json:"num_gpu_blocks_override,omitempty"` + MaxNumBatchedTokens int `json:"max_num_batched_tokens,omitempty"` + MaxNumSeqs int `json:"max_num_seqs,omitempty"` + MaxLogprobs int `json:"max_logprobs,omitempty"` + DisableLogStats bool `json:"disable_log_stats,omitempty"` + Quantization string `json:"quantization,omitempty"` + RopeScaling string `json:"rope_scaling,omitempty"` + RopeTheta float64 `json:"rope_theta,omitempty"` + EnforceEager bool `json:"enforce_eager,omitempty"` + MaxContextLenToCapture int `json:"max_context_len_to_capture,omitempty"` + MaxSeqLenToCapture int `json:"max_seq_len_to_capture,omitempty"` + DisableCustomAllReduce bool `json:"disable_custom_all_reduce,omitempty"` + TokenizerPoolSize int `json:"tokenizer_pool_size,omitempty"` + TokenizerPoolType string `json:"tokenizer_pool_type,omitempty"` + TokenizerPoolExtraConfig string `json:"tokenizer_pool_extra_config,omitempty"` + EnableLoraBias bool `json:"enable_lora_bias,omitempty"` + LoraExtraVocabSize int `json:"lora_extra_vocab_size,omitempty"` + LoraRank int `json:"lora_rank,omitempty"` + PromptLookbackDistance int `json:"prompt_lookback_distance,omitempty"` + PreemptionMode string `json:"preemption_mode,omitempty"` + + // Distributed and parallel processing + TensorParallelSize int `json:"tensor_parallel_size,omitempty"` + PipelineParallelSize int `json:"pipeline_parallel_size,omitempty"` + MaxParallelLoadingWorkers int `json:"max_parallel_loading_workers,omitempty"` + DisableAsyncOutputProc bool `json:"disable_async_output_proc,omitempty"` + WorkerClass string `json:"worker_class,omitempty"` + EnabledLoraModules string `json:"enabled_lora_modules,omitempty"` + MaxLoraRank int `json:"max_lora_rank,omitempty"` + FullyShardedLoras bool `json:"fully_sharded_loras,omitempty"` + LoraModules string `json:"lora_modules,omitempty"` + PromptAdapters string `json:"prompt_adapters,omitempty"` + MaxPromptAdapterToken int `json:"max_prompt_adapter_token,omitempty"` + Device string `json:"device,omitempty"` + SchedulerDelay float64 `json:"scheduler_delay,omitempty"` + EnableChunkedPrefill bool `json:"enable_chunked_prefill,omitempty"` + SpeculativeModel string `json:"speculative_model,omitempty"` + SpeculativeModelQuantization string `json:"speculative_model_quantization,omitempty"` + SpeculativeRevision string `json:"speculative_revision,omitempty"` + SpeculativeMaxModelLen int `json:"speculative_max_model_len,omitempty"` + SpeculativeDisableByBatchSize int `json:"speculative_disable_by_batch_size,omitempty"` + NgptSpeculativeLength int `json:"ngpt_speculative_length,omitempty"` + SpeculativeDisableMqa bool `json:"speculative_disable_mqa,omitempty"` + ModelLoaderExtraConfig string `json:"model_loader_extra_config,omitempty"` + 
IgnorePatterns string `json:"ignore_patterns,omitempty"` + PreloadedLoraModules string `json:"preloaded_lora_modules,omitempty"` + + // OpenAI server specific options + UDS string `json:"uds,omitempty"` + UvicornLogLevel string `json:"uvicorn_log_level,omitempty"` + ResponseRole string `json:"response_role,omitempty"` + SSLKeyfile string `json:"ssl_keyfile,omitempty"` + SSLCertfile string `json:"ssl_certfile,omitempty"` + SSLCACerts string `json:"ssl_ca_certs,omitempty"` + SSLCertReqs int `json:"ssl_cert_reqs,omitempty"` + RootPath string `json:"root_path,omitempty"` + Middleware []string `json:"middleware,omitempty"` + ReturnTokensAsTokenIDS bool `json:"return_tokens_as_token_ids,omitempty"` + DisableFrontendMultiprocessing bool `json:"disable_frontend_multiprocessing,omitempty"` + EnableAutoToolChoice bool `json:"enable_auto_tool_choice,omitempty"` + ToolCallParser string `json:"tool_call_parser,omitempty"` + ToolServer string `json:"tool_server,omitempty"` + ChatTemplate string `json:"chat_template,omitempty"` + ChatTemplateContentFormat string `json:"chat_template_content_format,omitempty"` + AllowCredentials bool `json:"allow_credentials,omitempty"` + AllowedOrigins []string `json:"allowed_origins,omitempty"` + AllowedMethods []string `json:"allowed_methods,omitempty"` + AllowedHeaders []string `json:"allowed_headers,omitempty"` + APIKey []string `json:"api_key,omitempty"` + EnableLogOutputs bool `json:"enable_log_outputs,omitempty"` + EnableTokenUsage bool `json:"enable_token_usage,omitempty"` + EnableAsyncEngineDebug bool `json:"enable_async_engine_debug,omitempty"` + EngineUseRay bool `json:"engine_use_ray,omitempty"` + DisableLogRequests bool `json:"disable_log_requests,omitempty"` + MaxLogLen int `json:"max_log_len,omitempty"` + + // Additional engine configuration + Task string `json:"task,omitempty"` + MultiModalConfig string `json:"multi_modal_config,omitempty"` + LimitMmPerPrompt string `json:"limit_mm_per_prompt,omitempty"` + EnableSleepMode bool `json:"enable_sleep_mode,omitempty"` + EnableChunkingRequest bool `json:"enable_chunking_request,omitempty"` + CompilationConfig string `json:"compilation_config,omitempty"` + DisableSlidingWindowMask bool `json:"disable_sliding_window_mask,omitempty"` + EnableTRTLLMEngineLatency bool `json:"enable_trtllm_engine_latency,omitempty"` + OverridePoolingConfig string `json:"override_pooling_config,omitempty"` + OverrideNeuronConfig string `json:"override_neuron_config,omitempty"` + OverrideKVCacheALIGNSize int `json:"override_kv_cache_align_size,omitempty"` +} + +// BuildCommandArgs converts VllmServerOptions to command line arguments +// Note: This does NOT include the "serve" subcommand, that's handled at the instance level +// For vLLM, the model parameter is passed as a positional argument, not a --model flag +func (o *VllmServerOptions) BuildCommandArgs() []string { + var args []string + + // Add model as positional argument if specified + if o.Model != "" { + args = append(args, o.Model) + } + + // Create a copy of the options without the Model field to avoid including it as --model flag + optionsCopy := *o + optionsCopy.Model = "" // Clear model field so it won't be included as a flag + + multipleFlags := map[string]bool{ + "api-key": true, + "allowed-origins": true, + "allowed-methods": true, + "allowed-headers": true, + "middleware": true, + } + + // Build the rest of the arguments as flags + flagArgs := backends.BuildCommandArgs(&optionsCopy, multipleFlags) + args = append(args, flagArgs...) 
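// Illustrative sketch only (mirrors the expectations in vllm_test.go): for options with
// Model "microsoft/DialoGPT-medium", TensorParallelSize 2, EnableLogOutputs true and two
// AllowedOrigins entries, the returned slice starts with the model as a positional argument,
// followed by the remaining options rendered as flags, e.g. "--tensor-parallel-size" "2",
// a bare "--enable-log-outputs", and one "--allowed-origins" flag per origin.
// No "--model" flag is emitted; exact flag ordering depends on backends.BuildCommandArgs.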
+ + return args +} + +// ParseVllmCommand parses a vLLM serve command string into VllmServerOptions +// Supports multiple formats: +// 1. Full command: "vllm serve --model MODEL_NAME --other-args" +// 2. Full path: "/usr/local/bin/vllm serve --model MODEL_NAME" +// 3. Serve only: "serve --model MODEL_NAME --other-args" +// 4. Args only: "--model MODEL_NAME --other-args" +// 5. Multiline commands with backslashes +func ParseVllmCommand(command string) (*VllmServerOptions, error) { + executableNames := []string{"vllm"} + subcommandNames := []string{"serve"} + multiValuedFlags := map[string]bool{ + "middleware": true, + "api_key": true, + "allowed_origins": true, + "allowed_methods": true, + "allowed_headers": true, + "lora_modules": true, + "prompt_adapters": true, + } + + var vllmOptions VllmServerOptions + if err := backends.ParseCommand(command, executableNames, subcommandNames, multiValuedFlags, &vllmOptions); err != nil { + return nil, err + } + + return &vllmOptions, nil +} diff --git a/pkg/backends/vllm/vllm_test.go b/pkg/backends/vllm/vllm_test.go new file mode 100644 index 0000000..ea13496 --- /dev/null +++ b/pkg/backends/vllm/vllm_test.go @@ -0,0 +1,153 @@ +package vllm_test + +import ( + "llamactl/pkg/backends/vllm" + "slices" + "testing" +) + +func TestParseVllmCommand(t *testing.T) { + tests := []struct { + name string + command string + expectErr bool + }{ + { + name: "basic vllm serve command", + command: "vllm serve microsoft/DialoGPT-medium", + expectErr: false, + }, + { + name: "serve only command", + command: "serve microsoft/DialoGPT-medium", + expectErr: false, + }, + { + name: "positional model with flags", + command: "vllm serve microsoft/DialoGPT-medium --tensor-parallel-size 2", + expectErr: false, + }, + { + name: "model with path", + command: "vllm serve /path/to/model --gpu-memory-utilization 0.8", + expectErr: false, + }, + { + name: "empty command", + command: "", + expectErr: true, + }, + { + name: "unterminated quote", + command: `vllm serve "unterminated`, + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := vllm.ParseVllmCommand(tt.command) + + if tt.expectErr { + if err == nil { + t.Errorf("expected error but got none") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result == nil { + t.Errorf("expected result but got nil") + } + }) + } +} + +func TestParseVllmCommandValues(t *testing.T) { + command := "vllm serve test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs" + result, err := vllm.ParseVllmCommand(command) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if result.Model != "test-model" { + t.Errorf("expected model 'test-model', got '%s'", result.Model) + } + if result.TensorParallelSize != 4 { + t.Errorf("expected tensor_parallel_size 4, got %d", result.TensorParallelSize) + } + if result.GPUMemoryUtilization != 0.8 { + t.Errorf("expected gpu_memory_utilization 0.8, got %f", result.GPUMemoryUtilization) + } + if !result.EnableLogOutputs { + t.Errorf("expected enable_log_outputs true, got %v", result.EnableLogOutputs) + } +} + +func TestBuildCommandArgs(t *testing.T) { + options := vllm.VllmServerOptions{ + Model: "microsoft/DialoGPT-medium", + Port: 8080, + Host: "localhost", + TensorParallelSize: 2, + GPUMemoryUtilization: 0.8, + EnableLogOutputs: true, + AllowedOrigins: []string{"http://localhost:3000", "https://example.com"}, + } + + args := options.BuildCommandArgs() + + // Check that 
model is the first positional argument (not a --model flag) + if len(args) == 0 || args[0] != "microsoft/DialoGPT-medium" { + t.Errorf("Expected model 'microsoft/DialoGPT-medium' as first positional argument, got args: %v", args) + } + + // Check that --model flag is NOT present (since model should be positional) + if contains(args, "--model") { + t.Errorf("Found --model flag, but model should be positional argument in args: %v", args) + } + + // Check other flags + if !containsFlagWithValue(args, "--tensor-parallel-size", "2") { + t.Errorf("Expected --tensor-parallel-size 2 not found in %v", args) + } + if !contains(args, "--enable-log-outputs") { + t.Errorf("Expected --enable-log-outputs not found in %v", args) + } + if !contains(args, "--host") { + t.Errorf("Expected --host not found in %v", args) + } + if !contains(args, "--port") { + t.Errorf("Expected --port not found in %v", args) + } + + // Check array handling (multiple flags) + allowedOriginsCount := 0 + for i := range args { + if args[i] == "--allowed-origins" { + allowedOriginsCount++ + } + } + if allowedOriginsCount != 2 { + t.Errorf("Expected 2 --allowed-origins flags, got %d", allowedOriginsCount) + } +} + +// Helper functions +func contains(slice []string, item string) bool { + return slices.Contains(slice, item) +} + +func containsFlagWithValue(args []string, flag, value string) bool { + for i, arg := range args { + if arg == flag && i+1 < len(args) && args[i+1] == value { + return true + } + } + return false +} diff --git a/pkg/config/config.go b/pkg/config/config.go index 28087db..504ecc3 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -17,6 +17,9 @@ type BackendConfig struct { // Path to mlx_lm executable (MLX-LM backend) MLXLMExecutable string `yaml:"mlx_lm_executable"` + + // Path to vllm executable (vLLM backend) + VllmExecutable string `yaml:"vllm_executable"` } // AppConfig represents the configuration for llamactl @@ -122,6 +125,7 @@ func LoadConfig(configPath string) (AppConfig, error) { Backends: BackendConfig{ LlamaExecutable: "llama-server", MLXLMExecutable: "mlx_lm.server", + VllmExecutable: "vllm", }, Instances: InstancesConfig{ PortRange: [2]int{8000, 9000}, @@ -246,6 +250,9 @@ func loadEnvVars(cfg *AppConfig) { if mlxLMExec := os.Getenv("LLAMACTL_MLX_LM_EXECUTABLE"); mlxLMExec != "" { cfg.Backends.MLXLMExecutable = mlxLMExec } + if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" { + cfg.Backends.VllmExecutable = vllmExec + } if autoRestart := os.Getenv("LLAMACTL_DEFAULT_AUTO_RESTART"); autoRestart != "" { if b, err := strconv.ParseBool(autoRestart); err == nil { cfg.Instances.DefaultAutoRestart = b diff --git a/pkg/instance/instance.go b/pkg/instance/instance.go index c0e5060..e1509a8 100644 --- a/pkg/instance/instance.go +++ b/pkg/instance/instance.go @@ -105,6 +105,10 @@ func (i *Process) GetPort() int { if i.options.MlxServerOptions != nil { return i.options.MlxServerOptions.Port } + case backends.BackendTypeVllm: + if i.options.VllmServerOptions != nil { + return i.options.VllmServerOptions.Port + } } } return 0 @@ -123,6 +127,10 @@ func (i *Process) GetHost() string { if i.options.MlxServerOptions != nil { return i.options.MlxServerOptions.Host } + case backends.BackendTypeVllm: + if i.options.VllmServerOptions != nil { + return i.options.VllmServerOptions.Host + } } } return "" @@ -176,6 +184,11 @@ func (i *Process) GetProxy() (*httputil.ReverseProxy, error) { host = i.options.MlxServerOptions.Host port = i.options.MlxServerOptions.Port } + case 
backends.BackendTypeVllm: + if i.options.VllmServerOptions != nil { + host = i.options.VllmServerOptions.Host + port = i.options.VllmServerOptions.Port + } } targetURL, err := url.Parse(fmt.Sprintf("http://%s:%d", host, port)) diff --git a/pkg/instance/lifecycle.go b/pkg/instance/lifecycle.go index 04c5fba..9eab260 100644 --- a/pkg/instance/lifecycle.go +++ b/pkg/instance/lifecycle.go @@ -52,6 +52,8 @@ func (i *Process) Start() error { executable = i.globalBackendSettings.LlamaExecutable case backends.BackendTypeMlxLm: executable = i.globalBackendSettings.MLXLMExecutable + case backends.BackendTypeVllm: + executable = i.globalBackendSettings.VllmExecutable default: return fmt.Errorf("unsupported backend type: %s", i.options.BackendType) } @@ -200,6 +202,11 @@ func (i *Process) WaitForHealthy(timeout int) error { host = opts.MlxServerOptions.Host port = opts.MlxServerOptions.Port } + case backends.BackendTypeVllm: + if opts.VllmServerOptions != nil { + host = opts.VllmServerOptions.Host + port = opts.VllmServerOptions.Port + } } if host == "" { host = "localhost" diff --git a/pkg/instance/options.go b/pkg/instance/options.go index 2b1437f..2e0b2fd 100644 --- a/pkg/instance/options.go +++ b/pkg/instance/options.go @@ -6,6 +6,7 @@ import ( "llamactl/pkg/backends" "llamactl/pkg/backends/llamacpp" "llamactl/pkg/backends/mlx" + "llamactl/pkg/backends/vllm" "llamactl/pkg/config" "log" ) @@ -26,6 +27,7 @@ type CreateInstanceOptions struct { // Backend-specific options LlamaServerOptions *llamacpp.LlamaServerOptions `json:"-"` MlxServerOptions *mlx.MlxServerOptions `json:"-"` + VllmServerOptions *vllm.VllmServerOptions `json:"-"` } // UnmarshalJSON implements custom JSON unmarshaling for CreateInstanceOptions @@ -63,12 +65,24 @@ func (c *CreateInstanceOptions) UnmarshalJSON(data []byte) error { if err != nil { return fmt.Errorf("failed to marshal backend options: %w", err) } - + c.MlxServerOptions = &mlx.MlxServerOptions{} if err := json.Unmarshal(optionsData, c.MlxServerOptions); err != nil { return fmt.Errorf("failed to unmarshal MLX options: %w", err) } } + case backends.BackendTypeVllm: + if c.BackendOptions != nil { + optionsData, err := json.Marshal(c.BackendOptions) + if err != nil { + return fmt.Errorf("failed to marshal backend options: %w", err) + } + + c.VllmServerOptions = &vllm.VllmServerOptions{} + if err := json.Unmarshal(optionsData, c.VllmServerOptions); err != nil { + return fmt.Errorf("failed to unmarshal vLLM options: %w", err) + } + } default: return fmt.Errorf("unknown backend type: %s", c.BackendType) } @@ -114,6 +128,20 @@ func (c *CreateInstanceOptions) MarshalJSON() ([]byte, error) { return nil, fmt.Errorf("failed to unmarshal to map: %w", err) } + aux.BackendOptions = backendOpts + } + case backends.BackendTypeVllm: + if c.VllmServerOptions != nil { + data, err := json.Marshal(c.VllmServerOptions) + if err != nil { + return nil, fmt.Errorf("failed to marshal vLLM server options: %w", err) + } + + var backendOpts map[string]any + if err := json.Unmarshal(data, &backendOpts); err != nil { + return nil, fmt.Errorf("failed to unmarshal to map: %w", err) + } + aux.BackendOptions = backendOpts } } @@ -171,6 +199,13 @@ func (c *CreateInstanceOptions) BuildCommandArgs() []string { if c.MlxServerOptions != nil { return c.MlxServerOptions.BuildCommandArgs() } + case backends.BackendTypeVllm: + if c.VllmServerOptions != nil { + // Prepend "serve" as first argument + args := []string{"serve"} + args = append(args, c.VllmServerOptions.BuildCommandArgs()...) 
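// Sketch of the net effect: for a vLLM instance the returned argv looks like
// ["serve", "<model>", "--flag", "value", ...]; lifecycle.Start then launches the
// configured VllmExecutable with these arguments, so the spawned process is
// effectively "vllm serve <model> ...".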
+ return args + } } return []string{} } diff --git a/pkg/manager/operations.go b/pkg/manager/operations.go index 1354481..b3c0d13 100644 --- a/pkg/manager/operations.go +++ b/pkg/manager/operations.go @@ -264,6 +264,10 @@ func (im *instanceManager) getPortFromOptions(options *instance.CreateInstanceOp if options.MlxServerOptions != nil { return options.MlxServerOptions.Port } + case backends.BackendTypeVllm: + if options.VllmServerOptions != nil { + return options.VllmServerOptions.Port + } } return 0 } @@ -279,6 +283,10 @@ func (im *instanceManager) setPortInOptions(options *instance.CreateInstanceOpti if options.MlxServerOptions != nil { options.MlxServerOptions.Port = port } + case backends.BackendTypeVllm: + if options.VllmServerOptions != nil { + options.VllmServerOptions.Port = port + } } } diff --git a/pkg/server/handlers.go b/pkg/server/handlers.go index c4932b2..0d74851 100644 --- a/pkg/server/handlers.go +++ b/pkg/server/handlers.go @@ -8,6 +8,7 @@ import ( "llamactl/pkg/backends" "llamactl/pkg/backends/llamacpp" "llamactl/pkg/backends/mlx" + "llamactl/pkg/backends/vllm" "llamactl/pkg/config" "llamactl/pkg/instance" "llamactl/pkg/manager" @@ -732,7 +733,60 @@ func (h *Handler) ParseMlxCommand() http.HandlerFunc { BackendType: backendType, MlxServerOptions: mlxOptions, } - + + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(options); err != nil { + writeError(w, http.StatusInternalServerError, "encode_error", err.Error()) + } + } +} + +// ParseVllmCommand godoc +// @Summary Parse vllm serve command +// @Description Parses a vLLM serve command string into instance options +// @Tags backends +// @Security ApiKeyAuth +// @Accept json +// @Produce json +// @Param request body ParseCommandRequest true "Command to parse" +// @Success 200 {object} instance.CreateInstanceOptions "Parsed options" +// @Failure 400 {object} map[string]string "Invalid request or command" +// @Router /backends/vllm/parse-command [post] +func (h *Handler) ParseVllmCommand() http.HandlerFunc { + type errorResponse struct { + Error string `json:"error"` + Details string `json:"details,omitempty"` + } + writeError := func(w http.ResponseWriter, status int, code, details string) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(errorResponse{Error: code, Details: details}) + } + return func(w http.ResponseWriter, r *http.Request) { + var req ParseCommandRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + writeError(w, http.StatusBadRequest, "invalid_request", "Invalid JSON body") + return + } + + if strings.TrimSpace(req.Command) == "" { + writeError(w, http.StatusBadRequest, "invalid_command", "Command cannot be empty") + return + } + + vllmOptions, err := vllm.ParseVllmCommand(req.Command) + if err != nil { + writeError(w, http.StatusBadRequest, "parse_error", err.Error()) + return + } + + backendType := backends.BackendTypeVllm + + options := &instance.CreateInstanceOptions{ + BackendType: backendType, + VllmServerOptions: vllmOptions, + } + w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(options); err != nil { writeError(w, http.StatusInternalServerError, "encode_error", err.Error()) diff --git a/pkg/server/routes.go b/pkg/server/routes.go index aa31e1f..898b574 100644 --- a/pkg/server/routes.go +++ b/pkg/server/routes.go @@ -58,6 +58,9 @@ func SetupRouter(handler *Handler) *chi.Mux { r.Route("/mlx", func(r chi.Router) { r.Post("/parse-command", 
handler.ParseMlxCommand()) }) + r.Route("/vllm", func(r chi.Router) { + r.Post("/parse-command", handler.ParseVllmCommand()) + }) }) // Instance management endpoints diff --git a/pkg/validation/validation.go b/pkg/validation/validation.go index eff1dd3..638e5d2 100644 --- a/pkg/validation/validation.go +++ b/pkg/validation/validation.go @@ -46,6 +46,8 @@ func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error { return validateLlamaCppOptions(options) case backends.BackendTypeMlxLm: return validateMlxOptions(options) + case backends.BackendTypeVllm: + return validateVllmOptions(options) default: return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType)) } @@ -88,6 +90,25 @@ func validateMlxOptions(options *instance.CreateInstanceOptions) error { return nil } +// validateVllmOptions validates vLLM backend specific options +func validateVllmOptions(options *instance.CreateInstanceOptions) error { + if options.VllmServerOptions == nil { + return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend")) + } + + // Use reflection to check all string fields for injection patterns + if err := validateStructStrings(options.VllmServerOptions, ""); err != nil { + return err + } + + // Basic network validation for port + if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 { + return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port)) + } + + return nil +} + // validateStructStrings recursively validates all string fields in a struct func validateStructStrings(v any, fieldPath string) error { val := reflect.ValueOf(v) diff --git a/webui/src/components/BackendBadge.tsx b/webui/src/components/BackendBadge.tsx new file mode 100644 index 0000000..a50cb4d --- /dev/null +++ b/webui/src/components/BackendBadge.tsx @@ -0,0 +1,65 @@ +import React from "react"; +import { Badge } from "@/components/ui/badge"; +import { BackendType, type BackendTypeValue } from "@/types/instance"; +import { Cpu, Zap, Server } from "lucide-react"; + +interface BackendBadgeProps { + backend?: BackendTypeValue; +} + +const BackendBadge: React.FC = ({ backend }) => { + if (!backend) { + return null; + } + + const getIcon = () => { + switch (backend) { + case BackendType.LLAMA_CPP: + return ; + case BackendType.MLX_LM: + return ; + case BackendType.VLLM: + return ; + default: + return ; + } + }; + + const getText = () => { + switch (backend) { + case BackendType.LLAMA_CPP: + return "llama.cpp"; + case BackendType.MLX_LM: + return "MLX"; + case BackendType.VLLM: + return "vLLM"; + default: + return backend; + } + }; + + const getVariant = () => { + switch (backend) { + case BackendType.LLAMA_CPP: + return "secondary"; + case BackendType.MLX_LM: + return "outline"; + case BackendType.VLLM: + return "default"; + default: + return "secondary"; + } + }; + + return ( + + {getIcon()} + {getText()} + + ); +}; + +export default BackendBadge; \ No newline at end of file diff --git a/webui/src/components/BackendFormField.tsx b/webui/src/components/BackendFormField.tsx index 3dd7af0..e66fedd 100644 --- a/webui/src/components/BackendFormField.tsx +++ b/webui/src/components/BackendFormField.tsx @@ -45,7 +45,6 @@ const BackendFormField: React.FC = ({ fieldKey, value, on
[Extraction dropped the JSX from the remaining BackendFormField.tsx hunks and from the instance card markup; the surviving fragments show the card header ({instance.name} plus the conditional health badge) being regrouped so that the new BackendBadge renders alongside the health badge.]
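Because the card's own markup did not survive, the following is a minimal sketch of how such a header could compose the new badge. The InstanceCardHeader component, its props, and the options?.backend_type access are illustrative assumptions; only BackendBadge, the Instance type, and instance.name come from the diff itself.

```tsx
// Illustrative sketch only: InstanceCardHeader and its props are hypothetical,
// and the options?.backend_type access is assumed. BackendBadge, the Instance
// type, and instance.name are taken from the diff above.
import React from "react";
import BackendBadge from "@/components/BackendBadge";
import { type Instance } from "@/types/instance";

interface InstanceCardHeaderProps {
  instance: Instance;
  running: boolean;
  healthBadge?: React.ReactNode; // the pre-existing health badge element
}

const InstanceCardHeader: React.FC<InstanceCardHeaderProps> = ({
  instance,
  running,
  healthBadge,
}) => (
  <div className="flex items-center gap-2">
    <span className="font-medium">{instance.name}</span>
    {/* New: show which backend this instance runs on */}
    <BackendBadge backend={instance.options?.backend_type} />
    {running && healthBadge}
  </div>
);

export default InstanceCardHeader;
```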
diff --git a/webui/src/components/InstanceDialog.tsx b/webui/src/components/InstanceDialog.tsx index 919ef52..8111348 100644 --- a/webui/src/components/InstanceDialog.tsx +++ b/webui/src/components/InstanceDialog.tsx @@ -11,11 +11,13 @@ import { DialogTitle, } from "@/components/ui/dialog"; import { BackendType, type CreateInstanceOptions, type Instance } from "@/types/instance"; -import { getBasicFields, getAdvancedFields, getBasicBackendFields, getAdvancedBackendFields } from "@/lib/zodFormUtils"; +import { getAdvancedFields, getAdvancedBackendFields } from "@/lib/zodFormUtils"; import { ChevronDown, ChevronRight, Terminal } from "lucide-react"; -import ZodFormField from "@/components/ZodFormField"; -import BackendFormField from "@/components/BackendFormField"; import ParseCommandDialog from "@/components/ParseCommandDialog"; +import AutoRestartConfiguration from "@/components/instance/AutoRestartConfiguration"; +import BasicInstanceFields from "@/components/instance/BasicInstanceFields"; +import BackendConfiguration from "@/components/instance/BackendConfiguration"; +import AdvancedInstanceFields from "@/components/instance/AdvancedInstanceFields"; interface InstanceDialogProps { open: boolean; @@ -39,9 +41,7 @@ const InstanceDialog: React.FC = ({ const [showParseDialog, setShowParseDialog] = useState(false); // Get field lists dynamically from the type - const basicFields = getBasicFields(); const advancedFields = getAdvancedFields(); - const basicBackendFields = getBasicBackendFields(formData.backend_type); const advancedBackendFields = getAdvancedBackendFields(formData.backend_type); // Reset form when dialog opens/closes or when instance changes @@ -163,8 +163,6 @@ const InstanceDialog: React.FC = ({ setShowParseDialog(false); }; - // Check if auto_restart is enabled - const isAutoRestartEnabled = formData.auto_restart === true; // Save button label logic let saveButtonLabel = "Create Instance"; @@ -212,70 +210,23 @@ const InstanceDialog: React.FC = ({
[InstanceDialog.tsx render body: the JSX itself was lost in extraction. The surviving comments, headings, and helper references show the inline sections being replaced by the newly imported components: the "Auto Restart Configuration" block (the auto-restart toggle plus the restart options shown only when isAutoRestartEnabled) gives way to AutoRestartConfiguration; the "Basic Configuration" block, which mapped basicFields (excluding auto_restart, max_restarts, restart_delay, and backend_options) through ZodFormField, gives way to BasicInstanceFields; the "Backend Configuration" block, which mapped basicBackendFields through BackendFormField, gives way to BackendConfiguration; and inside the showAdvanced branch, the "Advanced Instance Configuration" and "Advanced Backend Configuration" blocks (filtered, sorted advancedFields and advancedBackendFields) give way to AdvancedInstanceFields.]
)}
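Since the replaced markup above survives only in summary, here is a hedged sketch of what the refactored dialog body plausibly composes to. Every prop name below (formData, onChange, the showAdvanced wiring) is an assumption, as the extracted components' interfaces are not shown in this diff.

```tsx
// Hedged sketch of the refactored dialog body. The four components exist per the
// new imports, but the prop names below are assumptions; their real interfaces
// are not part of this diff.
import React from "react";
import AutoRestartConfiguration from "@/components/instance/AutoRestartConfiguration";
import BasicInstanceFields from "@/components/instance/BasicInstanceFields";
import BackendConfiguration from "@/components/instance/BackendConfiguration";
import AdvancedInstanceFields from "@/components/instance/AdvancedInstanceFields";
import { type CreateInstanceOptions } from "@/types/instance";

interface DialogBodySketchProps {
  formData: CreateInstanceOptions;
  showAdvanced: boolean;
  onChange: (updates: Partial<CreateInstanceOptions>) => void; // assumed signature
}

const InstanceDialogBodySketch: React.FC<DialogBodySketchProps> = ({
  formData,
  showAdvanced,
  onChange,
}) => (
  <div className="space-y-6">
    <AutoRestartConfiguration formData={formData} onChange={onChange} />
    <BasicInstanceFields formData={formData} onChange={onChange} />
    <BackendConfiguration formData={formData} onChange={onChange} />
    {showAdvanced && <AdvancedInstanceFields formData={formData} onChange={onChange} />}
  </div>
);

export default InstanceDialogBodySketch;
```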
diff --git a/webui/src/components/ParseCommandDialog.tsx b/webui/src/components/ParseCommandDialog.tsx index 6b14eaa..593c664 100644 --- a/webui/src/components/ParseCommandDialog.tsx +++ b/webui/src/components/ParseCommandDialog.tsx @@ -9,7 +9,7 @@ import { DialogHeader, DialogTitle, } from "@/components/ui/dialog"; -import { type CreateInstanceOptions } from "@/types/instance"; +import { BackendType, type BackendTypeValue, type CreateInstanceOptions } from "@/types/instance"; import { backendsApi } from "@/lib/api"; import { toast } from "sonner"; @@ -25,6 +25,7 @@ const ParseCommandDialog: React.FC = ({ onParsed, }) => { const [command, setCommand] = useState(''); + const [backendType, setBackendType] = useState(BackendType.LLAMA_CPP); const [loading, setLoading] = useState(false); const [error, setError] = useState(null); @@ -38,18 +39,31 @@ const ParseCommandDialog: React.FC = ({ setError(null); try { - const options = await backendsApi.llamaCpp.parseCommand(command); + let options: CreateInstanceOptions; + + // Parse based on selected backend type + switch (backendType) { + case BackendType.LLAMA_CPP: + options = await backendsApi.llamaCpp.parseCommand(command); + break; + case BackendType.MLX_LM: + options = await backendsApi.mlx.parseCommand(command); + break; + case BackendType.VLLM: + options = await backendsApi.vllm.parseCommand(command); + break; + default: + throw new Error(`Unsupported backend type: ${backendType}`); + } + onParsed(options); onOpenChange(false); - // Reset form setCommand(''); setError(null); - // Show success toast toast.success('Command parsed successfully'); } catch (err) { const errorMessage = err instanceof Error ? err.message : 'Failed to parse command'; setError(errorMessage); - // Show error toast toast.error('Failed to parse command', { description: errorMessage }); @@ -60,35 +74,59 @@ const ParseCommandDialog: React.FC = ({ const handleOpenChange = (open: boolean) => { if (!open) { - // Reset form when closing setCommand(''); + setBackendType(BackendType.LLAMA_CPP); setError(null); } onOpenChange(open); }; + const backendPlaceholders: Record = { + [BackendType.LLAMA_CPP]: "llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096", + [BackendType.MLX_LM]: "mlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --host 0.0.0.0 --port 8080", + [BackendType.VLLM]: "vllm serve microsoft/DialoGPT-medium --tensor-parallel-size 2 --gpu-memory-utilization 0.9", + }; + + const getPlaceholderForBackend = (backendType: BackendTypeValue): string => { + return backendPlaceholders[backendType] || "Enter your command here..."; + }; + return ( - Parse Llama Server Command + Parse Backend Command - Paste your llama-server command to automatically populate the form fields + Select your backend type and paste the command to automatically populate the form fields - +
[The JSX added here was lost in extraction; judging from the surviving code it introduces a backend-type selector wired to backendType/setBackendType and switches the command textarea's placeholder to getPlaceholderForBackend(backendType).]
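To tie the pieces together, here is a sketch of the parse round trip a client (or the dashboard) performs for the vLLM backend. backendsApi.vllm.parseCommand and CreateInstanceOptions come from the diff; the raw REST path, the /api/v1 prefix, and the "command" body field name are assumptions inferred from the router setup and the API docs.

```typescript
// Sketch of the round trip performed when the vLLM backend is selected.
// backendsApi.vllm.parseCommand is taken from the diff above; the raw path and the
// request body field name are assumptions and may differ from server.ParseCommandRequest.
import { backendsApi } from "@/lib/api";
import { type CreateInstanceOptions } from "@/types/instance";

export async function parseVllmServeCommand(command: string): Promise<CreateInstanceOptions> {
  // Typed client used by ParseCommandDialog
  const options = await backendsApi.vllm.parseCommand(command);
  // Expected shape (per the Go handler): backend_type === "vllm" with backend_options
  // holding the flattened VllmServerOptions (model, tensor_parallel_size, ...).
  return options;
}

// Roughly equivalent raw request (path, prefix, and field name assumed)
export async function parseVllmServeCommandRaw(
  command: string,
  apiKey: string
): Promise<CreateInstanceOptions> {
  const res = await fetch("/api/v1/backends/vllm/parse-command", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({ command }),
  });
  if (!res.ok) throw new Error(`Parse failed with status ${res.status}`);
  return (await res.json()) as CreateInstanceOptions;
}
```

Either way, the parsed CreateInstanceOptions is handed back (via onParsed in the dialog) and pre-fills the instance creation form.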