From 4df02a6519dfffbd36364c044032d4db4ccb9fea Mon Sep 17 00:00:00 2001 From: LordMathis Date: Fri, 19 Sep 2025 18:05:12 +0200 Subject: [PATCH] Initial vLLM backend support --- pkg/backends/backend.go | 1 + pkg/backends/vllm/parser.go | 302 +++++++++++++++++++++ pkg/backends/vllm/parser_test.go | 83 ++++++ pkg/backends/vllm/vllm.go | 439 ++++++++++++++++++++++++++++++ pkg/backends/vllm/vllm_test.go | 106 ++++++++ pkg/config/config.go | 7 + pkg/instance/lifecycle.go | 2 + pkg/instance/options.go | 37 ++- pkg/server/handlers.go | 56 +++- pkg/server/routes.go | 3 + pkg/validation/validation.go | 21 ++ vllm_backend_spec.md | 440 +++++++++++++++++++++++++++++++ 12 files changed, 1495 insertions(+), 2 deletions(-) create mode 100644 pkg/backends/vllm/parser.go create mode 100644 pkg/backends/vllm/parser_test.go create mode 100644 pkg/backends/vllm/vllm.go create mode 100644 pkg/backends/vllm/vllm_test.go create mode 100644 vllm_backend_spec.md diff --git a/pkg/backends/backend.go b/pkg/backends/backend.go index 0270945..802fec2 100644 --- a/pkg/backends/backend.go +++ b/pkg/backends/backend.go @@ -5,5 +5,6 @@ type BackendType string const ( BackendTypeLlamaCpp BackendType = "llama_cpp" BackendTypeMlxLm BackendType = "mlx_lm" + BackendTypeVllm BackendType = "vllm" // BackendTypeMlxVlm BackendType = "mlx_vlm" // Future expansion ) diff --git a/pkg/backends/vllm/parser.go b/pkg/backends/vllm/parser.go new file mode 100644 index 0000000..cb9125c --- /dev/null +++ b/pkg/backends/vllm/parser.go @@ -0,0 +1,302 @@ +package vllm + +import ( + "encoding/json" + "errors" + "fmt" + "path/filepath" + "regexp" + "strconv" + "strings" +) + +// ParseVllmCommand parses a vLLM serve command string into VllmServerOptions +// Supports multiple formats: +// 1. Full command: "vllm serve --model MODEL_NAME --other-args" +// 2. Full path: "/usr/local/bin/vllm serve --model MODEL_NAME" +// 3. Serve only: "serve --model MODEL_NAME --other-args" +// 4. Args only: "--model MODEL_NAME --other-args" +// 5. Multiline commands with backslashes +func ParseVllmCommand(command string) (*VllmServerOptions, error) { + // 1. Normalize the command - handle multiline with backslashes + trimmed := normalizeMultilineCommand(command) + if trimmed == "" { + return nil, fmt.Errorf("command cannot be empty") + } + + // 2. Extract arguments from command + args, err := extractArgumentsFromCommand(trimmed) + if err != nil { + return nil, err + } + + // 3. Parse arguments into map + options := make(map[string]any) + + // Known multi-valued flags (snake_case form) + multiValued := map[string]struct{}{ + "middleware": {}, + "api_key": {}, + "allowed_origins": {}, + "allowed_methods": {}, + "allowed_headers": {}, + "lora_modules": {}, + "prompt_adapters": {}, + } + + i := 0 + for i < len(args) { + arg := args[i] + + if !strings.HasPrefix(arg, "-") { // skip positional / stray values + i++ + continue + } + + // Reject malformed flags with more than two leading dashes (e.g. 
---model) to surface user mistakes + if strings.HasPrefix(arg, "---") { + return nil, fmt.Errorf("malformed flag: %s", arg) + } + + // Unified parsing for --flag=value vs --flag value + var rawFlag, rawValue string + hasEquals := false + if strings.Contains(arg, "=") { + parts := strings.SplitN(arg, "=", 2) + rawFlag = parts[0] + rawValue = parts[1] // may be empty string + hasEquals = true + } else { + rawFlag = arg + } + + flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-") + flagName := strings.ReplaceAll(flagCore, "-", "_") + + // Detect value if not in equals form + valueProvided := hasEquals + if !hasEquals { + if i+1 < len(args) && !isFlag(args[i+1]) { // next token is value + rawValue = args[i+1] + valueProvided = true + } + } + + // Determine if multi-valued flag + _, isMulti := multiValued[flagName] + + // Normalization helper: ensure slice for multi-valued flags + appendValue := func(valStr string) { + if existing, ok := options[flagName]; ok { + // Existing value; ensure slice semantics for multi-valued flags or repeated occurrences + if slice, ok := existing.([]string); ok { + options[flagName] = append(slice, valStr) + return + } + // Convert scalar to slice + options[flagName] = []string{fmt.Sprintf("%v", existing), valStr} + return + } + // First value + if isMulti { + options[flagName] = []string{valStr} + } else { + // We'll parse type below for single-valued flags + options[flagName] = valStr + } + } + + if valueProvided { + // Use raw token for multi-valued flags; else allow typed parsing + appendValue(rawValue) + if !isMulti { // convert to typed value if scalar + if strVal, ok := options[flagName].(string); ok { // still scalar + options[flagName] = parseValue(strVal) + } + } + // Advance index: if we consumed a following token as value (non equals form), skip it + if !hasEquals && i+1 < len(args) && rawValue == args[i+1] { + i += 2 + } else { + i++ + } + continue + } + + // Boolean flag (no value) + options[flagName] = true + i++ + } + + // 4. Convert to VllmServerOptions using existing UnmarshalJSON + jsonData, err := json.Marshal(options) + if err != nil { + return nil, fmt.Errorf("failed to marshal parsed options: %w", err) + } + + var vllmOptions VllmServerOptions + if err := json.Unmarshal(jsonData, &vllmOptions); err != nil { + return nil, fmt.Errorf("failed to parse command options: %w", err) + } + + // 5. 
Return VllmServerOptions + return &vllmOptions, nil +} + +// parseValue attempts to parse a string value into the most appropriate type +func parseValue(value string) any { + // Surrounding matching quotes (single or double) + if l := len(value); l >= 2 { + if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') { + value = value[1 : l-1] + } + } + + lower := strings.ToLower(value) + if lower == "true" { + return true + } + if lower == "false" { + return false + } + + if intVal, err := strconv.Atoi(value); err == nil { + return intVal + } + if floatVal, err := strconv.ParseFloat(value, 64); err == nil { + return floatVal + } + return value +} + +// normalizeMultilineCommand handles multiline commands with backslashes +func normalizeMultilineCommand(command string) string { + // Handle escaped newlines (backslash followed by newline) + re := regexp.MustCompile(`\\\s*\n\s*`) + normalized := re.ReplaceAllString(command, " ") + + // Clean up extra whitespace + re = regexp.MustCompile(`\s+`) + normalized = re.ReplaceAllString(normalized, " ") + + return strings.TrimSpace(normalized) +} + +// extractArgumentsFromCommand extracts arguments from various command formats +func extractArgumentsFromCommand(command string) ([]string, error) { + // Split command into tokens respecting quotes + tokens, err := splitCommandTokens(command) + if err != nil { + return nil, err + } + + if len(tokens) == 0 { + return nil, fmt.Errorf("no command tokens found") + } + + // Check if first token looks like an executable + firstToken := tokens[0] + + // Case 1: Full path to executable (contains path separator or ends with vllm) + if strings.Contains(firstToken, string(filepath.Separator)) || + strings.HasSuffix(filepath.Base(firstToken), "vllm") { + // Check if second token is "serve" + if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" { + return tokens[2:], nil // Return everything except executable and serve + } + return tokens[1:], nil // Return everything except the executable + } + + // Case 2: Just "vllm" command + if strings.ToLower(firstToken) == "vllm" { + // Check if second token is "serve" + if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" { + return tokens[2:], nil // Return everything except vllm and serve + } + return tokens[1:], nil // Return everything except vllm + } + + // Case 3: Just "serve" command + if strings.ToLower(firstToken) == "serve" { + return tokens[1:], nil // Return everything except serve + } + + // Case 4: Arguments only (starts with a flag) + if strings.HasPrefix(firstToken, "-") { + return tokens, nil // Return all tokens as arguments + } + + // Case 5: Unknown format - might be a different executable name + // Be permissive and assume it's the executable + if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" { + return tokens[2:], nil // Return everything except executable and serve + } + return tokens[1:], nil +} + +// splitCommandTokens splits a command string into tokens, respecting quotes +func splitCommandTokens(command string) ([]string, error) { + var tokens []string + var current strings.Builder + inQuotes := false + quoteChar := byte(0) + escaped := false + + for i := 0; i < len(command); i++ { + c := command[i] + + if escaped { + current.WriteByte(c) + escaped = false + continue + } + + if c == '\\' { + escaped = true + current.WriteByte(c) + continue + } + + if !inQuotes && (c == '"' || c == '\'') { + inQuotes = true + quoteChar = c + current.WriteByte(c) + } else if inQuotes && c == quoteChar { + inQuotes = 
false + quoteChar = 0 + current.WriteByte(c) + } else if !inQuotes && (c == ' ' || c == '\t') { + if current.Len() > 0 { + tokens = append(tokens, current.String()) + current.Reset() + } + } else { + current.WriteByte(c) + } + } + + if inQuotes { + return nil, errors.New("unterminated quoted string") + } + + if current.Len() > 0 { + tokens = append(tokens, current.String()) + } + + return tokens, nil +} + +// isFlag determines if a string is a command line flag or a value +// Handles the special case where negative numbers should be treated as values, not flags +func isFlag(arg string) bool { + if !strings.HasPrefix(arg, "-") { + return false + } + + // Special case: if it's a negative number, treat it as a value + if _, err := strconv.ParseFloat(arg, 64); err == nil { + return false + } + + return true +} \ No newline at end of file diff --git a/pkg/backends/vllm/parser_test.go b/pkg/backends/vllm/parser_test.go new file mode 100644 index 0000000..91921b2 --- /dev/null +++ b/pkg/backends/vllm/parser_test.go @@ -0,0 +1,83 @@ +package vllm + +import ( + "testing" +) + +func TestParseVllmCommand(t *testing.T) { + tests := []struct { + name string + command string + expectErr bool + }{ + { + name: "basic vllm serve command", + command: "vllm serve --model microsoft/DialoGPT-medium", + expectErr: false, + }, + { + name: "serve only command", + command: "serve --model microsoft/DialoGPT-medium", + expectErr: false, + }, + { + name: "args only", + command: "--model microsoft/DialoGPT-medium --tensor-parallel-size 2", + expectErr: false, + }, + { + name: "empty command", + command: "", + expectErr: true, + }, + { + name: "unterminated quote", + command: `vllm serve --model "unterminated`, + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := ParseVllmCommand(tt.command) + + if tt.expectErr { + if err == nil { + t.Errorf("expected error but got none") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result == nil { + t.Errorf("expected result but got nil") + } + }) + } +} + +func TestParseVllmCommandValues(t *testing.T) { + command := "vllm serve --model test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs" + result, err := ParseVllmCommand(command) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if result.Model != "test-model" { + t.Errorf("expected model 'test-model', got '%s'", result.Model) + } + if result.TensorParallelSize != 4 { + t.Errorf("expected tensor_parallel_size 4, got %d", result.TensorParallelSize) + } + if result.GPUMemoryUtilization != 0.8 { + t.Errorf("expected gpu_memory_utilization 0.8, got %f", result.GPUMemoryUtilization) + } + if !result.EnableLogOutputs { + t.Errorf("expected enable_log_outputs true, got %v", result.EnableLogOutputs) + } +} \ No newline at end of file diff --git a/pkg/backends/vllm/vllm.go b/pkg/backends/vllm/vllm.go new file mode 100644 index 0000000..6378b5e --- /dev/null +++ b/pkg/backends/vllm/vllm.go @@ -0,0 +1,439 @@ +package vllm + +import ( + "encoding/json" + "reflect" + "strconv" + "strings" +) + +type VllmServerOptions struct { + // Basic connection options (auto-assigned by llamactl) + Host string `json:"host,omitempty"` + Port int `json:"port,omitempty"` + + // Model and engine configuration + Model string `json:"model,omitempty"` + Tokenizer string `json:"tokenizer,omitempty"` + SkipTokenizerInit bool `json:"skip_tokenizer_init,omitempty"` + Revision string 
`json:"revision,omitempty"` + CodeRevision string `json:"code_revision,omitempty"` + TokenizerRevision string `json:"tokenizer_revision,omitempty"` + TokenizerMode string `json:"tokenizer_mode,omitempty"` + TrustRemoteCode bool `json:"trust_remote_code,omitempty"` + DownloadDir string `json:"download_dir,omitempty"` + LoadFormat string `json:"load_format,omitempty"` + ConfigFormat string `json:"config_format,omitempty"` + Dtype string `json:"dtype,omitempty"` + KVCacheDtype string `json:"kv_cache_dtype,omitempty"` + QuantizationParamPath string `json:"quantization_param_path,omitempty"` + Seed int `json:"seed,omitempty"` + MaxModelLen int `json:"max_model_len,omitempty"` + GuidedDecodingBackend string `json:"guided_decoding_backend,omitempty"` + DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"` + WorkerUseRay bool `json:"worker_use_ray,omitempty"` + RayWorkersUseNSight bool `json:"ray_workers_use_nsight,omitempty"` + + // Performance and serving configuration + BlockSize int `json:"block_size,omitempty"` + EnablePrefixCaching bool `json:"enable_prefix_caching,omitempty"` + DisableSlidingWindow bool `json:"disable_sliding_window,omitempty"` + UseV2BlockManager bool `json:"use_v2_block_manager,omitempty"` + NumLookaheadSlots int `json:"num_lookahead_slots,omitempty"` + SwapSpace int `json:"swap_space,omitempty"` + CPUOffloadGB int `json:"cpu_offload_gb,omitempty"` + GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"` + NumGPUBlocksOverride int `json:"num_gpu_blocks_override,omitempty"` + MaxNumBatchedTokens int `json:"max_num_batched_tokens,omitempty"` + MaxNumSeqs int `json:"max_num_seqs,omitempty"` + MaxLogprobs int `json:"max_logprobs,omitempty"` + DisableLogStats bool `json:"disable_log_stats,omitempty"` + Quantization string `json:"quantization,omitempty"` + RopeScaling string `json:"rope_scaling,omitempty"` + RopeTheta float64 `json:"rope_theta,omitempty"` + EnforceEager bool `json:"enforce_eager,omitempty"` + MaxContextLenToCapture int `json:"max_context_len_to_capture,omitempty"` + MaxSeqLenToCapture int `json:"max_seq_len_to_capture,omitempty"` + DisableCustomAllReduce bool `json:"disable_custom_all_reduce,omitempty"` + TokenizerPoolSize int `json:"tokenizer_pool_size,omitempty"` + TokenizerPoolType string `json:"tokenizer_pool_type,omitempty"` + TokenizerPoolExtraConfig string `json:"tokenizer_pool_extra_config,omitempty"` + EnableLoraBias bool `json:"enable_lora_bias,omitempty"` + LoraExtraVocabSize int `json:"lora_extra_vocab_size,omitempty"` + LoraRank int `json:"lora_rank,omitempty"` + PromptLookbackDistance int `json:"prompt_lookback_distance,omitempty"` + PreemptionMode string `json:"preemption_mode,omitempty"` + + // Distributed and parallel processing + TensorParallelSize int `json:"tensor_parallel_size,omitempty"` + PipelineParallelSize int `json:"pipeline_parallel_size,omitempty"` + MaxParallelLoadingWorkers int `json:"max_parallel_loading_workers,omitempty"` + DisableAsyncOutputProc bool `json:"disable_async_output_proc,omitempty"` + WorkerClass string `json:"worker_class,omitempty"` + EnabledLoraModules string `json:"enabled_lora_modules,omitempty"` + MaxLoraRank int `json:"max_lora_rank,omitempty"` + FullyShardedLoras bool `json:"fully_sharded_loras,omitempty"` + LoraModules string `json:"lora_modules,omitempty"` + PromptAdapters string `json:"prompt_adapters,omitempty"` + MaxPromptAdapterToken int `json:"max_prompt_adapter_token,omitempty"` + Device string `json:"device,omitempty"` + SchedulerDelay float64 
`json:"scheduler_delay,omitempty"` + EnableChunkedPrefill bool `json:"enable_chunked_prefill,omitempty"` + SpeculativeModel string `json:"speculative_model,omitempty"` + SpeculativeModelQuantization string `json:"speculative_model_quantization,omitempty"` + SpeculativeRevision string `json:"speculative_revision,omitempty"` + SpeculativeMaxModelLen int `json:"speculative_max_model_len,omitempty"` + SpeculativeDisableByBatchSize int `json:"speculative_disable_by_batch_size,omitempty"` + NgptSpeculativeLength int `json:"ngpt_speculative_length,omitempty"` + SpeculativeDisableMqa bool `json:"speculative_disable_mqa,omitempty"` + ModelLoaderExtraConfig string `json:"model_loader_extra_config,omitempty"` + IgnorePatterns string `json:"ignore_patterns,omitempty"` + PreloadedLoraModules string `json:"preloaded_lora_modules,omitempty"` + + // OpenAI server specific options + UDS string `json:"uds,omitempty"` + UvicornLogLevel string `json:"uvicorn_log_level,omitempty"` + ResponseRole string `json:"response_role,omitempty"` + SSLKeyfile string `json:"ssl_keyfile,omitempty"` + SSLCertfile string `json:"ssl_certfile,omitempty"` + SSLCACerts string `json:"ssl_ca_certs,omitempty"` + SSLCertReqs int `json:"ssl_cert_reqs,omitempty"` + RootPath string `json:"root_path,omitempty"` + Middleware []string `json:"middleware,omitempty"` + ReturnTokensAsTokenIDS bool `json:"return_tokens_as_token_ids,omitempty"` + DisableFrontendMultiprocessing bool `json:"disable_frontend_multiprocessing,omitempty"` + EnableAutoToolChoice bool `json:"enable_auto_tool_choice,omitempty"` + ToolCallParser string `json:"tool_call_parser,omitempty"` + ToolServer string `json:"tool_server,omitempty"` + ChatTemplate string `json:"chat_template,omitempty"` + ChatTemplateContentFormat string `json:"chat_template_content_format,omitempty"` + AllowCredentials bool `json:"allow_credentials,omitempty"` + AllowedOrigins []string `json:"allowed_origins,omitempty"` + AllowedMethods []string `json:"allowed_methods,omitempty"` + AllowedHeaders []string `json:"allowed_headers,omitempty"` + APIKey []string `json:"api_key,omitempty"` + EnableLogOutputs bool `json:"enable_log_outputs,omitempty"` + EnableTokenUsage bool `json:"enable_token_usage,omitempty"` + EnableAsyncEngineDebug bool `json:"enable_async_engine_debug,omitempty"` + EngineUseRay bool `json:"engine_use_ray,omitempty"` + DisableLogRequests bool `json:"disable_log_requests,omitempty"` + MaxLogLen int `json:"max_log_len,omitempty"` + + // Additional engine configuration + Task string `json:"task,omitempty"` + MultiModalConfig string `json:"multi_modal_config,omitempty"` + LimitMmPerPrompt string `json:"limit_mm_per_prompt,omitempty"` + EnableSleepMode bool `json:"enable_sleep_mode,omitempty"` + EnableChunkingRequest bool `json:"enable_chunking_request,omitempty"` + CompilationConfig string `json:"compilation_config,omitempty"` + DisableSlidingWindowMask bool `json:"disable_sliding_window_mask,omitempty"` + EnableTRTLLMEngineLatency bool `json:"enable_trtllm_engine_latency,omitempty"` + OverridePoolingConfig string `json:"override_pooling_config,omitempty"` + OverrideNeuronConfig string `json:"override_neuron_config,omitempty"` + OverrideKVCacheALIGNSize int `json:"override_kv_cache_align_size,omitempty"` +} + +// NewVllmServerOptions creates a new VllmServerOptions with defaults +func NewVllmServerOptions() *VllmServerOptions { + return &VllmServerOptions{ + Host: "127.0.0.1", + Port: 8000, + TensorParallelSize: 1, + PipelineParallelSize: 1, + GPUMemoryUtilization: 0.9, + BlockSize: 16, + 
SwapSpace: 4, + UvicornLogLevel: "info", + ResponseRole: "assistant", + TokenizerMode: "auto", + TrustRemoteCode: false, + EnablePrefixCaching: false, + EnforceEager: false, + DisableLogStats: false, + DisableLogRequests: false, + MaxLogprobs: 20, + EnableLogOutputs: false, + EnableTokenUsage: false, + AllowCredentials: false, + AllowedOrigins: []string{"*"}, + AllowedMethods: []string{"*"}, + AllowedHeaders: []string{"*"}, + } +} + +// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names +func (o *VllmServerOptions) UnmarshalJSON(data []byte) error { + // First unmarshal into a map to handle multiple field names + var raw map[string]any + if err := json.Unmarshal(data, &raw); err != nil { + return err + } + + // Create a temporary struct for standard unmarshaling + type tempOptions VllmServerOptions + temp := tempOptions{} + + // Standard unmarshal first + if err := json.Unmarshal(data, &temp); err != nil { + return err + } + + // Copy to our struct + *o = VllmServerOptions(temp) + + // Handle alternative field names (CLI format with dashes) + fieldMappings := map[string]string{ + // Basic options + "tensor-parallel-size": "tensor_parallel_size", + "pipeline-parallel-size": "pipeline_parallel_size", + "max-parallel-loading-workers": "max_parallel_loading_workers", + "disable-async-output-proc": "disable_async_output_proc", + "worker-class": "worker_class", + "enabled-lora-modules": "enabled_lora_modules", + "max-lora-rank": "max_lora_rank", + "fully-sharded-loras": "fully_sharded_loras", + "lora-modules": "lora_modules", + "prompt-adapters": "prompt_adapters", + "max-prompt-adapter-token": "max_prompt_adapter_token", + "scheduler-delay": "scheduler_delay", + "enable-chunked-prefill": "enable_chunked_prefill", + "speculative-model": "speculative_model", + "speculative-model-quantization": "speculative_model_quantization", + "speculative-revision": "speculative_revision", + "speculative-max-model-len": "speculative_max_model_len", + "speculative-disable-by-batch-size": "speculative_disable_by_batch_size", + "ngpt-speculative-length": "ngpt_speculative_length", + "speculative-disable-mqa": "speculative_disable_mqa", + "model-loader-extra-config": "model_loader_extra_config", + "ignore-patterns": "ignore_patterns", + "preloaded-lora-modules": "preloaded_lora_modules", + + // Model configuration + "skip-tokenizer-init": "skip_tokenizer_init", + "code-revision": "code_revision", + "tokenizer-revision": "tokenizer_revision", + "tokenizer-mode": "tokenizer_mode", + "trust-remote-code": "trust_remote_code", + "download-dir": "download_dir", + "load-format": "load_format", + "config-format": "config_format", + "kv-cache-dtype": "kv_cache_dtype", + "quantization-param-path": "quantization_param_path", + "max-model-len": "max_model_len", + "guided-decoding-backend": "guided_decoding_backend", + "distributed-executor-backend": "distributed_executor_backend", + "worker-use-ray": "worker_use_ray", + "ray-workers-use-nsight": "ray_workers_use_nsight", + + // Performance configuration + "block-size": "block_size", + "enable-prefix-caching": "enable_prefix_caching", + "disable-sliding-window": "disable_sliding_window", + "use-v2-block-manager": "use_v2_block_manager", + "num-lookahead-slots": "num_lookahead_slots", + "swap-space": "swap_space", + "cpu-offload-gb": "cpu_offload_gb", + "gpu-memory-utilization": "gpu_memory_utilization", + "num-gpu-blocks-override": "num_gpu_blocks_override", + "max-num-batched-tokens": "max_num_batched_tokens", + "max-num-seqs": "max_num_seqs", + 
"max-logprobs": "max_logprobs", + "disable-log-stats": "disable_log_stats", + "rope-scaling": "rope_scaling", + "rope-theta": "rope_theta", + "enforce-eager": "enforce_eager", + "max-context-len-to-capture": "max_context_len_to_capture", + "max-seq-len-to-capture": "max_seq_len_to_capture", + "disable-custom-all-reduce": "disable_custom_all_reduce", + "tokenizer-pool-size": "tokenizer_pool_size", + "tokenizer-pool-type": "tokenizer_pool_type", + "tokenizer-pool-extra-config": "tokenizer_pool_extra_config", + "enable-lora-bias": "enable_lora_bias", + "lora-extra-vocab-size": "lora_extra_vocab_size", + "lora-rank": "lora_rank", + "prompt-lookback-distance": "prompt_lookback_distance", + "preemption-mode": "preemption_mode", + + // Server configuration + "uvicorn-log-level": "uvicorn_log_level", + "response-role": "response_role", + "ssl-keyfile": "ssl_keyfile", + "ssl-certfile": "ssl_certfile", + "ssl-ca-certs": "ssl_ca_certs", + "ssl-cert-reqs": "ssl_cert_reqs", + "root-path": "root_path", + "return-tokens-as-token-ids": "return_tokens_as_token_ids", + "disable-frontend-multiprocessing": "disable_frontend_multiprocessing", + "enable-auto-tool-choice": "enable_auto_tool_choice", + "tool-call-parser": "tool_call_parser", + "tool-server": "tool_server", + "chat-template": "chat_template", + "chat-template-content-format": "chat_template_content_format", + "allow-credentials": "allow_credentials", + "allowed-origins": "allowed_origins", + "allowed-methods": "allowed_methods", + "allowed-headers": "allowed_headers", + "api-key": "api_key", + "enable-log-outputs": "enable_log_outputs", + "enable-token-usage": "enable_token_usage", + "enable-async-engine-debug": "enable_async_engine_debug", + "engine-use-ray": "engine_use_ray", + "disable-log-requests": "disable_log_requests", + "max-log-len": "max_log_len", + + // Additional options + "multi-modal-config": "multi_modal_config", + "limit-mm-per-prompt": "limit_mm_per_prompt", + "enable-sleep-mode": "enable_sleep_mode", + "enable-chunking-request": "enable_chunking_request", + "compilation-config": "compilation_config", + "disable-sliding-window-mask": "disable_sliding_window_mask", + "enable-trtllm-engine-latency": "enable_trtllm_engine_latency", + "override-pooling-config": "override_pooling_config", + "override-neuron-config": "override_neuron_config", + "override-kv-cache-align-size": "override_kv_cache_align_size", + } + + // Process alternative field names + for altName, canonicalName := range fieldMappings { + if value, exists := raw[altName]; exists { + // Use reflection to set the field value + v := reflect.ValueOf(o).Elem() + field := v.FieldByNameFunc(func(fieldName string) bool { + field, _ := v.Type().FieldByName(fieldName) + jsonTag := field.Tag.Get("json") + return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName + }) + + if field.IsValid() && field.CanSet() { + switch field.Kind() { + case reflect.Int: + if intVal, ok := value.(float64); ok { + field.SetInt(int64(intVal)) + } else if strVal, ok := value.(string); ok { + if intVal, err := strconv.Atoi(strVal); err == nil { + field.SetInt(int64(intVal)) + } + } + case reflect.Float64: + if floatVal, ok := value.(float64); ok { + field.SetFloat(floatVal) + } else if strVal, ok := value.(string); ok { + if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil { + field.SetFloat(floatVal) + } + } + case reflect.String: + if strVal, ok := value.(string); ok { + field.SetString(strVal) + } + case reflect.Bool: + if boolVal, ok := value.(bool); ok { + 
field.SetBool(boolVal) + } + case reflect.Slice: + if field.Type().Elem().Kind() == reflect.String { + if strVal, ok := value.(string); ok { + // Split comma-separated values + values := strings.Split(strVal, ",") + for i, v := range values { + values[i] = strings.TrimSpace(v) + } + field.Set(reflect.ValueOf(values)) + } else if slice, ok := value.([]interface{}); ok { + var strSlice []string + for _, item := range slice { + if str, ok := item.(string); ok { + strSlice = append(strSlice, str) + } + } + field.Set(reflect.ValueOf(strSlice)) + } + } + } + } + } + } + + return nil +} + +// BuildCommandArgs converts VllmServerOptions to command line arguments +// Note: This does NOT include the "serve" subcommand, that's handled at the instance level +func (o *VllmServerOptions) BuildCommandArgs() []string { + var args []string + + v := reflect.ValueOf(o).Elem() + t := v.Type() + + for i := 0; i < v.NumField(); i++ { + field := v.Field(i) + fieldType := t.Field(i) + + // Skip unexported fields + if !field.CanInterface() { + continue + } + + // Get the JSON tag to determine the flag name + jsonTag := fieldType.Tag.Get("json") + if jsonTag == "" || jsonTag == "-" { + continue + } + + // Remove ",omitempty" from the tag + flagName := jsonTag + if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 { + flagName = jsonTag[:commaIndex] + } + + // Skip host and port as they are handled by llamactl + if flagName == "host" || flagName == "port" { + continue + } + + // Convert snake_case to kebab-case for CLI flags + flagName = strings.ReplaceAll(flagName, "_", "-") + + // Add the appropriate arguments based on field type and value + switch field.Kind() { + case reflect.Bool: + if field.Bool() { + args = append(args, "--"+flagName) + } + case reflect.Int: + if field.Int() != 0 { + args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10)) + } + case reflect.Float64: + if field.Float() != 0 { + args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64)) + } + case reflect.String: + if field.String() != "" { + args = append(args, "--"+flagName, field.String()) + } + case reflect.Slice: + if field.Type().Elem().Kind() == reflect.String { + // Handle []string fields - some are comma-separated, some use multiple flags + if flagName == "api-key" || flagName == "allowed-origins" || flagName == "allowed-methods" || flagName == "allowed-headers" || flagName == "middleware" { + // Multiple flags for these + for j := 0; j < field.Len(); j++ { + args = append(args, "--"+flagName, field.Index(j).String()) + } + } else { + // Comma-separated for others + if field.Len() > 0 { + var values []string + for j := 0; j < field.Len(); j++ { + values = append(values, field.Index(j).String()) + } + args = append(args, "--"+flagName, strings.Join(values, ",")) + } + } + } + } + } + + return args +} \ No newline at end of file diff --git a/pkg/backends/vllm/vllm_test.go b/pkg/backends/vllm/vllm_test.go new file mode 100644 index 0000000..e05320a --- /dev/null +++ b/pkg/backends/vllm/vllm_test.go @@ -0,0 +1,106 @@ +package vllm_test + +import ( + "encoding/json" + "llamactl/pkg/backends/vllm" + "slices" + "testing" +) + +func TestBuildCommandArgs(t *testing.T) { + options := vllm.VllmServerOptions{ + Model: "microsoft/DialoGPT-medium", + Port: 8080, // should be excluded + Host: "localhost", // should be excluded + TensorParallelSize: 2, + GPUMemoryUtilization: 0.8, + EnableLogOutputs: true, + APIKey: []string{"key1", "key2"}, + } + + args := options.BuildCommandArgs() + + // Check core 
functionality + if !containsFlagWithValue(args, "--model", "microsoft/DialoGPT-medium") { + t.Errorf("Expected --model microsoft/DialoGPT-medium not found in %v", args) + } + if !containsFlagWithValue(args, "--tensor-parallel-size", "2") { + t.Errorf("Expected --tensor-parallel-size 2 not found in %v", args) + } + if !contains(args, "--enable-log-outputs") { + t.Errorf("Expected --enable-log-outputs not found in %v", args) + } + + // Host and port should NOT be in the arguments (handled by llamactl) + if contains(args, "--host") || contains(args, "--port") { + t.Errorf("Host and port should not be in command args, found in %v", args) + } + + // Check array handling (multiple flags) + apiKeyCount := 0 + for i := range args { + if args[i] == "--api-key" { + apiKeyCount++ + } + } + if apiKeyCount != 2 { + t.Errorf("Expected 2 --api-key flags, got %d", apiKeyCount) + } +} + +func TestUnmarshalJSON(t *testing.T) { + // Test both underscore and dash formats + jsonData := `{ + "model": "test-model", + "tensor_parallel_size": 4, + "gpu-memory-utilization": 0.9, + "enable-log-outputs": true + }` + + var options vllm.VllmServerOptions + err := json.Unmarshal([]byte(jsonData), &options) + if err != nil { + t.Fatalf("Unmarshal failed: %v", err) + } + + if options.Model != "test-model" { + t.Errorf("Expected model 'test-model', got %q", options.Model) + } + if options.TensorParallelSize != 4 { + t.Errorf("Expected tensor_parallel_size 4, got %d", options.TensorParallelSize) + } + if options.GPUMemoryUtilization != 0.9 { + t.Errorf("Expected gpu_memory_utilization 0.9, got %f", options.GPUMemoryUtilization) + } + if !options.EnableLogOutputs { + t.Errorf("Expected enable_log_outputs true, got %v", options.EnableLogOutputs) + } +} + +func TestNewVllmServerOptions(t *testing.T) { + options := vllm.NewVllmServerOptions() + + if options == nil { + t.Fatal("NewVllmServerOptions returned nil") + } + if options.Host != "127.0.0.1" { + t.Errorf("Expected default host '127.0.0.1', got %q", options.Host) + } + if options.Port != 8000 { + t.Errorf("Expected default port 8000, got %d", options.Port) + } +} + +// Helper functions +func contains(slice []string, item string) bool { + return slices.Contains(slice, item) +} + +func containsFlagWithValue(args []string, flag, value string) bool { + for i, arg := range args { + if arg == flag && i+1 < len(args) && args[i+1] == value { + return true + } + } + return false +} \ No newline at end of file diff --git a/pkg/config/config.go b/pkg/config/config.go index 28087db..504ecc3 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -17,6 +17,9 @@ type BackendConfig struct { // Path to mlx_lm executable (MLX-LM backend) MLXLMExecutable string `yaml:"mlx_lm_executable"` + + // Path to vllm executable (vLLM backend) + VllmExecutable string `yaml:"vllm_executable"` } // AppConfig represents the configuration for llamactl @@ -122,6 +125,7 @@ func LoadConfig(configPath string) (AppConfig, error) { Backends: BackendConfig{ LlamaExecutable: "llama-server", MLXLMExecutable: "mlx_lm.server", + VllmExecutable: "vllm", }, Instances: InstancesConfig{ PortRange: [2]int{8000, 9000}, @@ -246,6 +250,9 @@ func loadEnvVars(cfg *AppConfig) { if mlxLMExec := os.Getenv("LLAMACTL_MLX_LM_EXECUTABLE"); mlxLMExec != "" { cfg.Backends.MLXLMExecutable = mlxLMExec } + if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" { + cfg.Backends.VllmExecutable = vllmExec + } if autoRestart := os.Getenv("LLAMACTL_DEFAULT_AUTO_RESTART"); autoRestart != "" { if b, err := 
strconv.ParseBool(autoRestart); err == nil { cfg.Instances.DefaultAutoRestart = b diff --git a/pkg/instance/lifecycle.go b/pkg/instance/lifecycle.go index 04c5fba..c4e23a7 100644 --- a/pkg/instance/lifecycle.go +++ b/pkg/instance/lifecycle.go @@ -52,6 +52,8 @@ func (i *Process) Start() error { executable = i.globalBackendSettings.LlamaExecutable case backends.BackendTypeMlxLm: executable = i.globalBackendSettings.MLXLMExecutable + case backends.BackendTypeVllm: + executable = i.globalBackendSettings.VllmExecutable default: return fmt.Errorf("unsupported backend type: %s", i.options.BackendType) } diff --git a/pkg/instance/options.go b/pkg/instance/options.go index 2b1437f..2e0b2fd 100644 --- a/pkg/instance/options.go +++ b/pkg/instance/options.go @@ -6,6 +6,7 @@ import ( "llamactl/pkg/backends" "llamactl/pkg/backends/llamacpp" "llamactl/pkg/backends/mlx" + "llamactl/pkg/backends/vllm" "llamactl/pkg/config" "log" ) @@ -26,6 +27,7 @@ type CreateInstanceOptions struct { // Backend-specific options LlamaServerOptions *llamacpp.LlamaServerOptions `json:"-"` MlxServerOptions *mlx.MlxServerOptions `json:"-"` + VllmServerOptions *vllm.VllmServerOptions `json:"-"` } // UnmarshalJSON implements custom JSON unmarshaling for CreateInstanceOptions @@ -63,12 +65,24 @@ func (c *CreateInstanceOptions) UnmarshalJSON(data []byte) error { if err != nil { return fmt.Errorf("failed to marshal backend options: %w", err) } - + c.MlxServerOptions = &mlx.MlxServerOptions{} if err := json.Unmarshal(optionsData, c.MlxServerOptions); err != nil { return fmt.Errorf("failed to unmarshal MLX options: %w", err) } } + case backends.BackendTypeVllm: + if c.BackendOptions != nil { + optionsData, err := json.Marshal(c.BackendOptions) + if err != nil { + return fmt.Errorf("failed to marshal backend options: %w", err) + } + + c.VllmServerOptions = &vllm.VllmServerOptions{} + if err := json.Unmarshal(optionsData, c.VllmServerOptions); err != nil { + return fmt.Errorf("failed to unmarshal vLLM options: %w", err) + } + } default: return fmt.Errorf("unknown backend type: %s", c.BackendType) } @@ -114,6 +128,20 @@ func (c *CreateInstanceOptions) MarshalJSON() ([]byte, error) { return nil, fmt.Errorf("failed to unmarshal to map: %w", err) } + aux.BackendOptions = backendOpts + } + case backends.BackendTypeVllm: + if c.VllmServerOptions != nil { + data, err := json.Marshal(c.VllmServerOptions) + if err != nil { + return nil, fmt.Errorf("failed to marshal vLLM server options: %w", err) + } + + var backendOpts map[string]any + if err := json.Unmarshal(data, &backendOpts); err != nil { + return nil, fmt.Errorf("failed to unmarshal to map: %w", err) + } + aux.BackendOptions = backendOpts } } @@ -171,6 +199,13 @@ func (c *CreateInstanceOptions) BuildCommandArgs() []string { if c.MlxServerOptions != nil { return c.MlxServerOptions.BuildCommandArgs() } + case backends.BackendTypeVllm: + if c.VllmServerOptions != nil { + // Prepend "serve" as first argument + args := []string{"serve"} + args = append(args, c.VllmServerOptions.BuildCommandArgs()...) 
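+ // Illustrative example (hypothetical model name): VllmServerOptions{Model: "Qwen/Qwen2-7B-Instruct", TensorParallelSize: 2}
+ // yields ["serve", "--model", "Qwen/Qwen2-7B-Instruct", "--tensor-parallel-size", "2"]; host/port are skipped here and the
+ // vllm executable itself is resolved later in lifecycle.Start().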
+ return args + } } return []string{} } diff --git a/pkg/server/handlers.go b/pkg/server/handlers.go index c4932b2..0d74851 100644 --- a/pkg/server/handlers.go +++ b/pkg/server/handlers.go @@ -8,6 +8,7 @@ import ( "llamactl/pkg/backends" "llamactl/pkg/backends/llamacpp" "llamactl/pkg/backends/mlx" + "llamactl/pkg/backends/vllm" "llamactl/pkg/config" "llamactl/pkg/instance" "llamactl/pkg/manager" @@ -732,7 +733,60 @@ func (h *Handler) ParseMlxCommand() http.HandlerFunc { BackendType: backendType, MlxServerOptions: mlxOptions, } - + + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(options); err != nil { + writeError(w, http.StatusInternalServerError, "encode_error", err.Error()) + } + } +} + +// ParseVllmCommand godoc +// @Summary Parse vllm serve command +// @Description Parses a vLLM serve command string into instance options +// @Tags backends +// @Security ApiKeyAuth +// @Accept json +// @Produce json +// @Param request body ParseCommandRequest true "Command to parse" +// @Success 200 {object} instance.CreateInstanceOptions "Parsed options" +// @Failure 400 {object} map[string]string "Invalid request or command" +// @Router /backends/vllm/parse-command [post] +func (h *Handler) ParseVllmCommand() http.HandlerFunc { + type errorResponse struct { + Error string `json:"error"` + Details string `json:"details,omitempty"` + } + writeError := func(w http.ResponseWriter, status int, code, details string) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(errorResponse{Error: code, Details: details}) + } + return func(w http.ResponseWriter, r *http.Request) { + var req ParseCommandRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + writeError(w, http.StatusBadRequest, "invalid_request", "Invalid JSON body") + return + } + + if strings.TrimSpace(req.Command) == "" { + writeError(w, http.StatusBadRequest, "invalid_command", "Command cannot be empty") + return + } + + vllmOptions, err := vllm.ParseVllmCommand(req.Command) + if err != nil { + writeError(w, http.StatusBadRequest, "parse_error", err.Error()) + return + } + + backendType := backends.BackendTypeVllm + + options := &instance.CreateInstanceOptions{ + BackendType: backendType, + VllmServerOptions: vllmOptions, + } + w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(options); err != nil { writeError(w, http.StatusInternalServerError, "encode_error", err.Error()) diff --git a/pkg/server/routes.go b/pkg/server/routes.go index aa31e1f..898b574 100644 --- a/pkg/server/routes.go +++ b/pkg/server/routes.go @@ -58,6 +58,9 @@ func SetupRouter(handler *Handler) *chi.Mux { r.Route("/mlx", func(r chi.Router) { r.Post("/parse-command", handler.ParseMlxCommand()) }) + r.Route("/vllm", func(r chi.Router) { + r.Post("/parse-command", handler.ParseVllmCommand()) + }) }) // Instance management endpoints diff --git a/pkg/validation/validation.go b/pkg/validation/validation.go index eff1dd3..638e5d2 100644 --- a/pkg/validation/validation.go +++ b/pkg/validation/validation.go @@ -46,6 +46,8 @@ func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error { return validateLlamaCppOptions(options) case backends.BackendTypeMlxLm: return validateMlxOptions(options) + case backends.BackendTypeVllm: + return validateVllmOptions(options) default: return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType)) } @@ -88,6 +90,25 @@ func validateMlxOptions(options 
*instance.CreateInstanceOptions) error { return nil } +// validateVllmOptions validates vLLM backend specific options +func validateVllmOptions(options *instance.CreateInstanceOptions) error { + if options.VllmServerOptions == nil { + return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend")) + } + + // Use reflection to check all string fields for injection patterns + if err := validateStructStrings(options.VllmServerOptions, ""); err != nil { + return err + } + + // Basic network validation for port + if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 { + return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port)) + } + + return nil +} + // validateStructStrings recursively validates all string fields in a struct func validateStructStrings(v any, fieldPath string) error { val := reflect.ValueOf(v) diff --git a/vllm_backend_spec.md b/vllm_backend_spec.md new file mode 100644 index 0000000..9fede10 --- /dev/null +++ b/vllm_backend_spec.md @@ -0,0 +1,440 @@ +# vLLM Backend Implementation Specification + +## Overview +This specification outlines the implementation of vLLM backend support for llamactl, following the existing patterns established by the llama.cpp and MLX backends. + +## 1. Backend Configuration + +### Basic Details +- **Backend Type**: `vllm` +- **Executable**: `vllm` (configured via `VllmExecutable`) +- **Subcommand**: `serve` (automatically prepended to arguments) +- **Default Host/Port**: Auto-assigned by llamactl +- **Health Check**: Uses `/health` endpoint (returns HTTP 200 with no content) +- **API Compatibility**: OpenAI-compatible endpoints + +### Example Command +```bash +vllm serve --enable-log-outputs --tensor-parallel-size 2 --gpu-memory-utilization 0.5 --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g +``` + +## 2. File Structure +Following the existing backend pattern: +``` +pkg/backends/vllm/ +├── vllm.go # VllmServerOptions struct and methods +├── vllm_test.go # Unit tests for VllmServerOptions +├── parser.go # Command parsing logic +└── parser_test.go # Parser tests +``` + +## 3. Core Implementation Files + +### 3.1 `pkg/backends/vllm/vllm.go` + +#### VllmServerOptions Struct +```go +type VllmServerOptions struct { + // Basic connection options (auto-assigned by llamactl) + Host string `json:"host,omitempty"` + Port int `json:"port,omitempty"` + + // Core model options + Model string `json:"model,omitempty"` + + // Common serving options + EnableLogOutputs bool `json:"enable_log_outputs,omitempty"` + TensorParallelSize int `json:"tensor_parallel_size,omitempty"` + GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"` + + // Additional parameters to be added based on vLLM CLI documentation + // Following the same comprehensive approach as llamacpp.LlamaServerOptions +} +``` + +#### Required Methods +- `UnmarshalJSON()` - Custom unmarshaling with alternative field name support (dash-to-underscore conversion) +- `BuildCommandArgs()` - Convert struct to command line arguments (excluding "serve" subcommand) +- `NewVllmServerOptions()` - Constructor with vLLM defaults + +#### Field Name Mapping +Support both CLI argument names (with dashes) and programmatic names (with underscores), similar to the llama.cpp implementation: +```go +fieldMappings := map[string]string{ + "enable-log-outputs": "enable_log_outputs", + "tensor-parallel-size": "tensor_parallel_size", + "gpu-memory-utilization": "gpu_memory_utilization", + // ... 
other mappings +} +``` + +### 3.2 `pkg/backends/vllm/parser.go` + +#### ParseVllmCommand Function +Following the same pattern as `llamacpp/parser.go` and `mlx/parser.go`: + +```go +func ParseVllmCommand(command string) (*VllmServerOptions, error) +``` + +**Supported Input Formats:** +1. `vllm serve --model MODEL_NAME --other-args` +2. `/path/to/vllm serve --model MODEL_NAME` +3. `serve --model MODEL_NAME --other-args` +4. `--model MODEL_NAME --other-args` (args only) +5. Multiline commands with backslashes + +**Implementation Details:** +- Handle "serve" subcommand detection and removal +- Support quoted strings and escaped characters +- Validate command structure +- Convert parsed arguments to `VllmServerOptions` + +## 4. Backend Integration + +### 4.1 Backend Type Definition +**File**: `pkg/backends/backend.go` +```go +const ( + BackendTypeLlamaCpp BackendType = "llama_cpp" + BackendTypeMlxLm BackendType = "mlx_lm" + BackendTypeVllm BackendType = "vllm" // ADD THIS +) +``` + +### 4.2 Configuration Integration +**File**: `pkg/config/config.go` + +#### BackendConfig Update +```go +type BackendConfig struct { + LlamaExecutable string `yaml:"llama_executable"` + MLXLMExecutable string `yaml:"mlx_lm_executable"` + VllmExecutable string `yaml:"vllm_executable"` // ADD THIS +} +``` + +#### Default Configuration +- **Default Value**: `"vllm"` +- **Environment Variable**: `LLAMACTL_VLLM_EXECUTABLE` + +#### Environment Variable Loading +Add to `loadEnvVars()` function: +```go +if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" { + cfg.Backends.VllmExecutable = vllmExec +} +``` + +### 4.3 Instance Options Integration +**File**: `pkg/instance/options.go` + +#### CreateInstanceOptions Update +```go +type CreateInstanceOptions struct { + // existing fields... + VllmServerOptions *vllm.VllmServerOptions `json:"-"` +} +``` + +#### JSON Marshaling/Unmarshaling +Update `UnmarshalJSON()` and `MarshalJSON()` methods to handle vLLM backend similar to existing backends. + +#### BuildCommandArgs Implementation +```go +case backends.BackendTypeVllm: + if c.VllmServerOptions != nil { + // Prepend "serve" as first argument + args := []string{"serve"} + args = append(args, c.VllmServerOptions.BuildCommandArgs()...) + return args + } +``` + +**Key Point**: The "serve" subcommand is handled at the instance options level, keeping the `VllmServerOptions.BuildCommandArgs()` method focused only on vLLM-specific parameters. + +## 5. Health Check Integration + +### 5.1 Standard Health Check for vLLM +**File**: `pkg/instance/lifecycle.go` + +vLLM provides a standard `/health` endpoint that returns HTTP 200 with no content, so no modifications are needed to the existing health check logic. The current `WaitForHealthy()` method will work as-is: + +```go +healthURL := fmt.Sprintf("http://%s:%d/health", host, port) +``` + +### 5.2 Startup Time Considerations +- vLLM typically has longer startup times compared to llama.cpp +- The existing configurable timeout system should handle this adequately +- Users may need to adjust `on_demand_start_timeout` for larger models + +## 6. 
Lifecycle Integration + +### 6.1 Executable Selection +**File**: `pkg/instance/lifecycle.go` + +Update the `Start()` method to handle vLLM executable: + +```go +switch i.options.BackendType { +case backends.BackendTypeLlamaCpp: + executable = i.globalBackendSettings.LlamaExecutable +case backends.BackendTypeMlxLm: + executable = i.globalBackendSettings.MLXLMExecutable +case backends.BackendTypeVllm: // ADD THIS + executable = i.globalBackendSettings.VllmExecutable +default: + return fmt.Errorf("unsupported backend type: %s", i.options.BackendType) +} + +args := i.options.BuildCommandArgs() +i.cmd = exec.CommandContext(i.ctx, executable, args...) +``` + +### 6.2 Command Execution +The final executed command will be: +```bash +vllm serve --model MODEL_NAME --other-vllm-args +``` + +Where: +- `vllm` comes from `VllmExecutable` configuration +- `serve` is prepended by `BuildCommandArgs()` +- Remaining args come from `VllmServerOptions.BuildCommandArgs()` + +## 7. Server Handler Integration + +### 7.1 New Handler Method +**File**: `pkg/server/handlers.go` + +```go +// ParseVllmCommand godoc +// @Summary Parse vllm serve command +// @Description Parses a vLLM serve command string into instance options +// @Tags backends +// @Security ApiKeyAuth +// @Accept json +// @Produce json +// @Param request body ParseCommandRequest true "Command to parse" +// @Success 200 {object} instance.CreateInstanceOptions "Parsed options" +// @Failure 400 {object} map[string]string "Invalid request or command" +// @Router /backends/vllm/parse-command [post] +func (h *Handler) ParseVllmCommand() http.HandlerFunc { + // Implementation similar to ParseMlxCommand() + // Uses vllm.ParseVllmCommand() internally +} +``` + +### 7.2 Router Integration +**File**: `pkg/server/routes.go` + +Add vLLM route: +```go +r.Route("/backends", func(r chi.Router) { + r.Route("/llama-cpp", func(r chi.Router) { + r.Post("/parse-command", handler.ParseLlamaCommand()) + }) + r.Route("/mlx", func(r chi.Router) { + r.Post("/parse-command", handler.ParseMlxCommand()) + }) + r.Route("/vllm", func(r chi.Router) { // ADD THIS + r.Post("/parse-command", handler.ParseVllmCommand()) + }) +}) +``` + +## 8. Validation Integration + +### 8.1 Instance Options Validation +**File**: `pkg/validation/validation.go` + +Add vLLM validation case: +```go +func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error { + // existing validation... + + switch options.BackendType { + case backends.BackendTypeLlamaCpp: + return validateLlamaCppOptions(options) + case backends.BackendTypeMlxLm: + return validateMlxOptions(options) + case backends.BackendTypeVllm: // ADD THIS + return validateVllmOptions(options) + default: + return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType)) + } +} + +func validateVllmOptions(options *instance.CreateInstanceOptions) error { + if options.VllmServerOptions == nil { + return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend")) + } + + // Basic validation following the same pattern as other backends + if err := validateStructStrings(options.VllmServerOptions, ""); err != nil { + return err + } + + // Port validation + if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 { + return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port)) + } + + return nil +} +``` + +## 9. 
Testing Strategy + +### 9.1 Unit Tests +- **`vllm_test.go`**: Test `VllmServerOptions` marshaling/unmarshaling, BuildCommandArgs() +- **`parser_test.go`**: Test command parsing for various formats +- **Integration tests**: Mock vLLM commands and validate parsing + +### 9.2 Test Cases +```go +func TestBuildCommandArgs_VllmBasic(t *testing.T) { + options := VllmServerOptions{ + Model: "microsoft/DialoGPT-medium", + Port: 8080, + Host: "localhost", + EnableLogOutputs: true, + TensorParallelSize: 2, + } + + args := options.BuildCommandArgs() + // Validate expected arguments (excluding "serve") +} + +func TestParseVllmCommand_FullCommand(t *testing.T) { + command := "vllm serve --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g --tensor-parallel-size 2" + result, err := ParseVllmCommand(command) + // Validate parsing results +} +``` + +## 10. Example Usage + +### 10.1 Parse Existing vLLM Command +```bash +curl -X POST http://localhost:8080/api/v1/backends/vllm/parse-command \ + -H "Authorization: Bearer your-management-key" \ + -H "Content-Type: application/json" \ + -d '{ + "command": "vllm serve --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g --tensor-parallel-size 2 --gpu-memory-utilization 0.5" + }' +``` + +### 10.2 Create vLLM Instance +```bash +curl -X POST http://localhost:8080/api/v1/instances/my-vllm-model \ + -H "Authorization: Bearer your-management-key" \ + -H "Content-Type: application/json" \ + -d '{ + "backend_type": "vllm", + "backend_options": { + "model": "ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g", + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.5, + "enable_log_outputs": true + } + }' +``` + +### 10.3 Use via OpenAI-Compatible API +```bash +curl -X POST http://localhost:8080/v1/chat/completions \ + -H "Authorization: Bearer your-inference-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + +## 11. Implementation Checklist + +### Phase 1: Core Backend +- [ ] Create `pkg/backends/vllm/vllm.go` +- [ ] Implement `VllmServerOptions` struct with basic fields +- [ ] Implement `BuildCommandArgs()`, `UnmarshalJSON()`, `MarshalJSON()` +- [ ] Add comprehensive field mappings for CLI args +- [ ] Create unit tests for `VllmServerOptions` + +### Phase 2: Command Parsing +- [ ] Create `pkg/backends/vllm/parser.go` +- [ ] Implement `ParseVllmCommand()` function +- [ ] Handle various command input formats +- [ ] Create comprehensive parser tests +- [ ] Test edge cases and error conditions + +### Phase 3: Integration +- [ ] Add `BackendTypeVllm` to `pkg/backends/backend.go` +- [ ] Update `BackendConfig` in `pkg/config/config.go` +- [ ] Add environment variable support +- [ ] Update `CreateInstanceOptions` in `pkg/instance/options.go` +- [ ] Implement `BuildCommandArgs()` with "serve" prepending + +### Phase 4: Lifecycle & Health Checks +- [ ] Update executable selection in `pkg/instance/lifecycle.go` +- [ ] Test instance startup and health checking (uses existing `/health` endpoint) +- [ ] Validate command execution flow + +### Phase 5: API Integration +- [ ] Add `ParseVllmCommand()` handler in `pkg/server/handlers.go` +- [ ] Add vLLM route in `pkg/server/routes.go` +- [ ] Update validation in `pkg/validation/validation.go` +- [ ] Test API endpoints + +### Phase 6: Testing & Documentation +- [ ] Create comprehensive integration tests +- [ ] Test with actual vLLM installation (if available) +- [ ] Update documentation +- [ ] Test OpenAI-compatible proxy functionality + +## 12. 
Configuration Examples + +### 12.1 YAML Configuration +```yaml +backends: + llama_executable: "llama-server" + mlx_lm_executable: "mlx_lm.server" + vllm_executable: "vllm" + +instances: + # ... other instance settings +``` + +### 12.2 Environment Variables +```bash +export LLAMACTL_VLLM_EXECUTABLE="vllm" +# OR for custom installation +export LLAMACTL_VLLM_EXECUTABLE="python -m vllm" +# OR for containerized deployment +export LLAMACTL_VLLM_EXECUTABLE="docker run --rm --gpus all vllm/vllm-openai" +``` + +## 13. Notes and Considerations + +### 13.1 Startup Time +- vLLM instances may take significantly longer to start than llama.cpp +- Consider documenting recommended timeout values +- The configurable `on_demand_start_timeout` should accommodate this + +### 13.2 Resource Usage +- vLLM typically requires substantial GPU memory +- No special handling needed in llamactl (follows existing pattern) +- Resource management is left to the user/administrator + +### 13.3 Model Compatibility +- Primarily designed for HuggingFace models +- Supports various quantization formats (GPTQ, AWQ, etc.) +- Model path validation can be basic (similar to other backends) + +### 13.4 Future Enhancements +- Consider adding vLLM-specific parameter validation +- Could add model download/caching features +- May want to add vLLM version detection capabilities + +This specification provides a comprehensive roadmap for implementing vLLM backend support while maintaining consistency with the existing llamactl architecture. \ No newline at end of file