Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-05 16:44:22 +00:00)
Initial vLLM backend support
pkg/backends/backend.go
@@ -5,5 +5,6 @@ type BackendType string
 const (
     BackendTypeLlamaCpp BackendType = "llama_cpp"
     BackendTypeMlxLm    BackendType = "mlx_lm"
+    BackendTypeVllm     BackendType = "vllm"
     // BackendTypeMlxVlm BackendType = "mlx_vlm" // Future expansion
 )
pkg/backends/vllm/parser.go (new file, 302 lines)
@@ -0,0 +1,302 @@
package vllm

import (
    "encoding/json"
    "errors"
    "fmt"
    "path/filepath"
    "regexp"
    "strconv"
    "strings"
)

// ParseVllmCommand parses a vLLM serve command string into VllmServerOptions
// Supports multiple formats:
// 1. Full command: "vllm serve --model MODEL_NAME --other-args"
// 2. Full path: "/usr/local/bin/vllm serve --model MODEL_NAME"
// 3. Serve only: "serve --model MODEL_NAME --other-args"
// 4. Args only: "--model MODEL_NAME --other-args"
// 5. Multiline commands with backslashes
func ParseVllmCommand(command string) (*VllmServerOptions, error) {
    // 1. Normalize the command - handle multiline with backslashes
    trimmed := normalizeMultilineCommand(command)
    if trimmed == "" {
        return nil, fmt.Errorf("command cannot be empty")
    }

    // 2. Extract arguments from command
    args, err := extractArgumentsFromCommand(trimmed)
    if err != nil {
        return nil, err
    }

    // 3. Parse arguments into map
    options := make(map[string]any)

    // Known multi-valued flags (snake_case form)
    multiValued := map[string]struct{}{
        "middleware":      {},
        "api_key":         {},
        "allowed_origins": {},
        "allowed_methods": {},
        "allowed_headers": {},
        "lora_modules":    {},
        "prompt_adapters": {},
    }

    i := 0
    for i < len(args) {
        arg := args[i]

        if !strings.HasPrefix(arg, "-") { // skip positional / stray values
            i++
            continue
        }

        // Reject malformed flags with more than two leading dashes (e.g. ---model) to surface user mistakes
        if strings.HasPrefix(arg, "---") {
            return nil, fmt.Errorf("malformed flag: %s", arg)
        }

        // Unified parsing for --flag=value vs --flag value
        var rawFlag, rawValue string
        hasEquals := false
        if strings.Contains(arg, "=") {
            parts := strings.SplitN(arg, "=", 2)
            rawFlag = parts[0]
            rawValue = parts[1] // may be empty string
            hasEquals = true
        } else {
            rawFlag = arg
        }

        flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-")
        flagName := strings.ReplaceAll(flagCore, "-", "_")

        // Detect value if not in equals form
        valueProvided := hasEquals
        if !hasEquals {
            if i+1 < len(args) && !isFlag(args[i+1]) { // next token is value
                rawValue = args[i+1]
                valueProvided = true
            }
        }

        // Determine if multi-valued flag
        _, isMulti := multiValued[flagName]

        // Normalization helper: ensure slice for multi-valued flags
        appendValue := func(valStr string) {
            if existing, ok := options[flagName]; ok {
                // Existing value; ensure slice semantics for multi-valued flags or repeated occurrences
                if slice, ok := existing.([]string); ok {
                    options[flagName] = append(slice, valStr)
                    return
                }
                // Convert scalar to slice
                options[flagName] = []string{fmt.Sprintf("%v", existing), valStr}
                return
            }
            // First value
            if isMulti {
                options[flagName] = []string{valStr}
            } else {
                // We'll parse type below for single-valued flags
                options[flagName] = valStr
            }
        }

        if valueProvided {
            // Use raw token for multi-valued flags; else allow typed parsing
            appendValue(rawValue)
            if !isMulti { // convert to typed value if scalar
                if strVal, ok := options[flagName].(string); ok { // still scalar
                    options[flagName] = parseValue(strVal)
                }
            }
            // Advance index: if we consumed a following token as value (non equals form), skip it
            if !hasEquals && i+1 < len(args) && rawValue == args[i+1] {
                i += 2
            } else {
                i++
            }
            continue
        }

        // Boolean flag (no value)
        options[flagName] = true
        i++
    }

    // 4. Convert to VllmServerOptions using existing UnmarshalJSON
    jsonData, err := json.Marshal(options)
    if err != nil {
        return nil, fmt.Errorf("failed to marshal parsed options: %w", err)
    }

    var vllmOptions VllmServerOptions
    if err := json.Unmarshal(jsonData, &vllmOptions); err != nil {
        return nil, fmt.Errorf("failed to parse command options: %w", err)
    }

    // 5. Return VllmServerOptions
    return &vllmOptions, nil
}

// parseValue attempts to parse a string value into the most appropriate type
func parseValue(value string) any {
    // Strip surrounding matching quotes (single or double)
    if l := len(value); l >= 2 {
        if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') {
            value = value[1 : l-1]
        }
    }

    lower := strings.ToLower(value)
    if lower == "true" {
        return true
    }
    if lower == "false" {
        return false
    }

    if intVal, err := strconv.Atoi(value); err == nil {
        return intVal
    }
    if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
        return floatVal
    }
    return value
}

// normalizeMultilineCommand handles multiline commands with backslashes
func normalizeMultilineCommand(command string) string {
    // Handle escaped newlines (backslash followed by newline)
    re := regexp.MustCompile(`\\\s*\n\s*`)
    normalized := re.ReplaceAllString(command, " ")

    // Clean up extra whitespace
    re = regexp.MustCompile(`\s+`)
    normalized = re.ReplaceAllString(normalized, " ")

    return strings.TrimSpace(normalized)
}

// extractArgumentsFromCommand extracts arguments from various command formats
func extractArgumentsFromCommand(command string) ([]string, error) {
    // Split command into tokens respecting quotes
    tokens, err := splitCommandTokens(command)
    if err != nil {
        return nil, err
    }

    if len(tokens) == 0 {
        return nil, fmt.Errorf("no command tokens found")
    }

    // Check if first token looks like an executable
    firstToken := tokens[0]

    // Case 1: Full path to executable (contains path separator or ends with vllm)
    if strings.Contains(firstToken, string(filepath.Separator)) ||
        strings.HasSuffix(filepath.Base(firstToken), "vllm") {
        // Check if second token is "serve"
        if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
            return tokens[2:], nil // Return everything except executable and serve
        }
        return tokens[1:], nil // Return everything except the executable
    }

    // Case 2: Just "vllm" command
    if strings.ToLower(firstToken) == "vllm" {
        // Check if second token is "serve"
        if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
            return tokens[2:], nil // Return everything except vllm and serve
        }
        return tokens[1:], nil // Return everything except vllm
    }

    // Case 3: Just "serve" command
    if strings.ToLower(firstToken) == "serve" {
        return tokens[1:], nil // Return everything except serve
    }

    // Case 4: Arguments only (starts with a flag)
    if strings.HasPrefix(firstToken, "-") {
        return tokens, nil // Return all tokens as arguments
    }

    // Case 5: Unknown format - might be a different executable name
    // Be permissive and assume it's the executable
    if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
        return tokens[2:], nil // Return everything except executable and serve
    }
    return tokens[1:], nil
}

// splitCommandTokens splits a command string into tokens, respecting quotes
func splitCommandTokens(command string) ([]string, error) {
    var tokens []string
    var current strings.Builder
    inQuotes := false
    quoteChar := byte(0)
    escaped := false

    for i := 0; i < len(command); i++ {
        c := command[i]

        if escaped {
            current.WriteByte(c)
            escaped = false
            continue
        }

        if c == '\\' {
            escaped = true
            current.WriteByte(c)
            continue
        }

        if !inQuotes && (c == '"' || c == '\'') {
            inQuotes = true
            quoteChar = c
            current.WriteByte(c)
        } else if inQuotes && c == quoteChar {
            inQuotes = false
            quoteChar = 0
            current.WriteByte(c)
        } else if !inQuotes && (c == ' ' || c == '\t') {
            if current.Len() > 0 {
                tokens = append(tokens, current.String())
                current.Reset()
            }
        } else {
            current.WriteByte(c)
        }
    }

    if inQuotes {
        return nil, errors.New("unterminated quoted string")
    }

    if current.Len() > 0 {
        tokens = append(tokens, current.String())
    }

    return tokens, nil
}

// isFlag determines if a string is a command line flag or a value
// Handles the special case where negative numbers should be treated as values, not flags
func isFlag(arg string) bool {
    if !strings.HasPrefix(arg, "-") {
        return false
    }

    // Special case: if it's a negative number, treat it as a value
    if _, err := strconv.ParseFloat(arg, 64); err == nil {
        return false
    }

    return true
}
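As a rough usage sketch (not part of this commit), the parser above can be exercised as follows; the import path comes from this repository, while the model name and flag values are arbitrary examples:

```go
package main

import (
    "fmt"

    "llamactl/pkg/backends/vllm"
)

func main() {
    // A multiline command with a backslash continuation, one of the formats
    // ParseVllmCommand accepts.
    cmd := `vllm serve --model test-model \
        --tensor-parallel-size 2 --gpu-memory-utilization 0.5 --enable-log-outputs`

    opts, err := vllm.ParseVllmCommand(cmd)
    if err != nil {
        fmt.Println("parse error:", err)
        return
    }

    // Flags are converted into typed fields on VllmServerOptions.
    fmt.Println(opts.Model)                // test-model
    fmt.Println(opts.TensorParallelSize)   // 2
    fmt.Println(opts.GPUMemoryUtilization) // 0.5
    fmt.Println(opts.EnableLogOutputs)     // true
}
```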
pkg/backends/vllm/parser_test.go (new file, 83 lines)
@@ -0,0 +1,83 @@
package vllm

import (
    "testing"
)

func TestParseVllmCommand(t *testing.T) {
    tests := []struct {
        name      string
        command   string
        expectErr bool
    }{
        {
            name:      "basic vllm serve command",
            command:   "vllm serve --model microsoft/DialoGPT-medium",
            expectErr: false,
        },
        {
            name:      "serve only command",
            command:   "serve --model microsoft/DialoGPT-medium",
            expectErr: false,
        },
        {
            name:      "args only",
            command:   "--model microsoft/DialoGPT-medium --tensor-parallel-size 2",
            expectErr: false,
        },
        {
            name:      "empty command",
            command:   "",
            expectErr: true,
        },
        {
            name:      "unterminated quote",
            command:   `vllm serve --model "unterminated`,
            expectErr: true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            result, err := ParseVllmCommand(tt.command)

            if tt.expectErr {
                if err == nil {
                    t.Errorf("expected error but got none")
                }
                return
            }

            if err != nil {
                t.Errorf("unexpected error: %v", err)
                return
            }

            if result == nil {
                t.Errorf("expected result but got nil")
            }
        })
    }
}

func TestParseVllmCommandValues(t *testing.T) {
    command := "vllm serve --model test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs"
    result, err := ParseVllmCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "test-model" {
        t.Errorf("expected model 'test-model', got '%s'", result.Model)
    }
    if result.TensorParallelSize != 4 {
        t.Errorf("expected tensor_parallel_size 4, got %d", result.TensorParallelSize)
    }
    if result.GPUMemoryUtilization != 0.8 {
        t.Errorf("expected gpu_memory_utilization 0.8, got %f", result.GPUMemoryUtilization)
    }
    if !result.EnableLogOutputs {
        t.Errorf("expected enable_log_outputs true, got %v", result.EnableLogOutputs)
    }
}
pkg/backends/vllm/vllm.go (new file, 439 lines)
@@ -0,0 +1,439 @@
package vllm

import (
    "encoding/json"
    "reflect"
    "strconv"
    "strings"
)

type VllmServerOptions struct {
    // Basic connection options (auto-assigned by llamactl)
    Host string `json:"host,omitempty"`
    Port int    `json:"port,omitempty"`

    // Model and engine configuration
    Model                      string  `json:"model,omitempty"`
    Tokenizer                  string  `json:"tokenizer,omitempty"`
    SkipTokenizerInit          bool    `json:"skip_tokenizer_init,omitempty"`
    Revision                   string  `json:"revision,omitempty"`
    CodeRevision               string  `json:"code_revision,omitempty"`
    TokenizerRevision          string  `json:"tokenizer_revision,omitempty"`
    TokenizerMode              string  `json:"tokenizer_mode,omitempty"`
    TrustRemoteCode            bool    `json:"trust_remote_code,omitempty"`
    DownloadDir                string  `json:"download_dir,omitempty"`
    LoadFormat                 string  `json:"load_format,omitempty"`
    ConfigFormat               string  `json:"config_format,omitempty"`
    Dtype                      string  `json:"dtype,omitempty"`
    KVCacheDtype               string  `json:"kv_cache_dtype,omitempty"`
    QuantizationParamPath      string  `json:"quantization_param_path,omitempty"`
    Seed                       int     `json:"seed,omitempty"`
    MaxModelLen                int     `json:"max_model_len,omitempty"`
    GuidedDecodingBackend      string  `json:"guided_decoding_backend,omitempty"`
    DistributedExecutorBackend string  `json:"distributed_executor_backend,omitempty"`
    WorkerUseRay               bool    `json:"worker_use_ray,omitempty"`
    RayWorkersUseNSight        bool    `json:"ray_workers_use_nsight,omitempty"`

    // Performance and serving configuration
    BlockSize                int     `json:"block_size,omitempty"`
    EnablePrefixCaching      bool    `json:"enable_prefix_caching,omitempty"`
    DisableSlidingWindow     bool    `json:"disable_sliding_window,omitempty"`
    UseV2BlockManager        bool    `json:"use_v2_block_manager,omitempty"`
    NumLookaheadSlots        int     `json:"num_lookahead_slots,omitempty"`
    SwapSpace                int     `json:"swap_space,omitempty"`
    CPUOffloadGB             int     `json:"cpu_offload_gb,omitempty"`
    GPUMemoryUtilization     float64 `json:"gpu_memory_utilization,omitempty"`
    NumGPUBlocksOverride     int     `json:"num_gpu_blocks_override,omitempty"`
    MaxNumBatchedTokens      int     `json:"max_num_batched_tokens,omitempty"`
    MaxNumSeqs               int     `json:"max_num_seqs,omitempty"`
    MaxLogprobs              int     `json:"max_logprobs,omitempty"`
    DisableLogStats          bool    `json:"disable_log_stats,omitempty"`
    Quantization             string  `json:"quantization,omitempty"`
    RopeScaling              string  `json:"rope_scaling,omitempty"`
    RopeTheta                float64 `json:"rope_theta,omitempty"`
    EnforceEager             bool    `json:"enforce_eager,omitempty"`
    MaxContextLenToCapture   int     `json:"max_context_len_to_capture,omitempty"`
    MaxSeqLenToCapture       int     `json:"max_seq_len_to_capture,omitempty"`
    DisableCustomAllReduce   bool    `json:"disable_custom_all_reduce,omitempty"`
    TokenizerPoolSize        int     `json:"tokenizer_pool_size,omitempty"`
    TokenizerPoolType        string  `json:"tokenizer_pool_type,omitempty"`
    TokenizerPoolExtraConfig string  `json:"tokenizer_pool_extra_config,omitempty"`
    EnableLoraBias           bool    `json:"enable_lora_bias,omitempty"`
    LoraExtraVocabSize       int     `json:"lora_extra_vocab_size,omitempty"`
    LoraRank                 int     `json:"lora_rank,omitempty"`
    PromptLookbackDistance   int     `json:"prompt_lookback_distance,omitempty"`
    PreemptionMode           string  `json:"preemption_mode,omitempty"`

    // Distributed and parallel processing
    TensorParallelSize            int     `json:"tensor_parallel_size,omitempty"`
    PipelineParallelSize          int     `json:"pipeline_parallel_size,omitempty"`
    MaxParallelLoadingWorkers     int     `json:"max_parallel_loading_workers,omitempty"`
    DisableAsyncOutputProc        bool    `json:"disable_async_output_proc,omitempty"`
    WorkerClass                   string  `json:"worker_class,omitempty"`
    EnabledLoraModules            string  `json:"enabled_lora_modules,omitempty"`
    MaxLoraRank                   int     `json:"max_lora_rank,omitempty"`
    FullyShardedLoras             bool    `json:"fully_sharded_loras,omitempty"`
    LoraModules                   string  `json:"lora_modules,omitempty"`
    PromptAdapters                string  `json:"prompt_adapters,omitempty"`
    MaxPromptAdapterToken         int     `json:"max_prompt_adapter_token,omitempty"`
    Device                        string  `json:"device,omitempty"`
    SchedulerDelay                float64 `json:"scheduler_delay,omitempty"`
    EnableChunkedPrefill          bool    `json:"enable_chunked_prefill,omitempty"`
    SpeculativeModel              string  `json:"speculative_model,omitempty"`
    SpeculativeModelQuantization  string  `json:"speculative_model_quantization,omitempty"`
    SpeculativeRevision           string  `json:"speculative_revision,omitempty"`
    SpeculativeMaxModelLen        int     `json:"speculative_max_model_len,omitempty"`
    SpeculativeDisableByBatchSize int     `json:"speculative_disable_by_batch_size,omitempty"`
    NgptSpeculativeLength         int     `json:"ngpt_speculative_length,omitempty"`
    SpeculativeDisableMqa         bool    `json:"speculative_disable_mqa,omitempty"`
    ModelLoaderExtraConfig        string  `json:"model_loader_extra_config,omitempty"`
    IgnorePatterns                string  `json:"ignore_patterns,omitempty"`
    PreloadedLoraModules          string  `json:"preloaded_lora_modules,omitempty"`

    // OpenAI server specific options
    UDS                            string   `json:"uds,omitempty"`
    UvicornLogLevel                string   `json:"uvicorn_log_level,omitempty"`
    ResponseRole                   string   `json:"response_role,omitempty"`
    SSLKeyfile                     string   `json:"ssl_keyfile,omitempty"`
    SSLCertfile                    string   `json:"ssl_certfile,omitempty"`
    SSLCACerts                     string   `json:"ssl_ca_certs,omitempty"`
    SSLCertReqs                    int      `json:"ssl_cert_reqs,omitempty"`
    RootPath                       string   `json:"root_path,omitempty"`
    Middleware                     []string `json:"middleware,omitempty"`
    ReturnTokensAsTokenIDS         bool     `json:"return_tokens_as_token_ids,omitempty"`
    DisableFrontendMultiprocessing bool     `json:"disable_frontend_multiprocessing,omitempty"`
    EnableAutoToolChoice           bool     `json:"enable_auto_tool_choice,omitempty"`
    ToolCallParser                 string   `json:"tool_call_parser,omitempty"`
    ToolServer                     string   `json:"tool_server,omitempty"`
    ChatTemplate                   string   `json:"chat_template,omitempty"`
    ChatTemplateContentFormat      string   `json:"chat_template_content_format,omitempty"`
    AllowCredentials               bool     `json:"allow_credentials,omitempty"`
    AllowedOrigins                 []string `json:"allowed_origins,omitempty"`
    AllowedMethods                 []string `json:"allowed_methods,omitempty"`
    AllowedHeaders                 []string `json:"allowed_headers,omitempty"`
    APIKey                         []string `json:"api_key,omitempty"`
    EnableLogOutputs               bool     `json:"enable_log_outputs,omitempty"`
    EnableTokenUsage               bool     `json:"enable_token_usage,omitempty"`
    EnableAsyncEngineDebug         bool     `json:"enable_async_engine_debug,omitempty"`
    EngineUseRay                   bool     `json:"engine_use_ray,omitempty"`
    DisableLogRequests             bool     `json:"disable_log_requests,omitempty"`
    MaxLogLen                      int      `json:"max_log_len,omitempty"`

    // Additional engine configuration
    Task                      string `json:"task,omitempty"`
    MultiModalConfig          string `json:"multi_modal_config,omitempty"`
    LimitMmPerPrompt          string `json:"limit_mm_per_prompt,omitempty"`
    EnableSleepMode           bool   `json:"enable_sleep_mode,omitempty"`
    EnableChunkingRequest     bool   `json:"enable_chunking_request,omitempty"`
    CompilationConfig         string `json:"compilation_config,omitempty"`
    DisableSlidingWindowMask  bool   `json:"disable_sliding_window_mask,omitempty"`
    EnableTRTLLMEngineLatency bool   `json:"enable_trtllm_engine_latency,omitempty"`
    OverridePoolingConfig     string `json:"override_pooling_config,omitempty"`
    OverrideNeuronConfig      string `json:"override_neuron_config,omitempty"`
    OverrideKVCacheALIGNSize  int    `json:"override_kv_cache_align_size,omitempty"`
}

// NewVllmServerOptions creates a new VllmServerOptions with defaults
func NewVllmServerOptions() *VllmServerOptions {
    return &VllmServerOptions{
        Host:                 "127.0.0.1",
        Port:                 8000,
        TensorParallelSize:   1,
        PipelineParallelSize: 1,
        GPUMemoryUtilization: 0.9,
        BlockSize:            16,
        SwapSpace:            4,
        UvicornLogLevel:      "info",
        ResponseRole:         "assistant",
        TokenizerMode:        "auto",
        TrustRemoteCode:      false,
        EnablePrefixCaching:  false,
        EnforceEager:         false,
        DisableLogStats:      false,
        DisableLogRequests:   false,
        MaxLogprobs:          20,
        EnableLogOutputs:     false,
        EnableTokenUsage:     false,
        AllowCredentials:     false,
        AllowedOrigins:       []string{"*"},
        AllowedMethods:       []string{"*"},
        AllowedHeaders:       []string{"*"},
    }
}

// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names
func (o *VllmServerOptions) UnmarshalJSON(data []byte) error {
    // First unmarshal into a map to handle multiple field names
    var raw map[string]any
    if err := json.Unmarshal(data, &raw); err != nil {
        return err
    }

    // Create a temporary struct for standard unmarshaling
    type tempOptions VllmServerOptions
    temp := tempOptions{}

    // Standard unmarshal first
    if err := json.Unmarshal(data, &temp); err != nil {
        return err
    }

    // Copy to our struct
    *o = VllmServerOptions(temp)

    // Handle alternative field names (CLI format with dashes)
    fieldMappings := map[string]string{
        // Basic options
        "tensor-parallel-size":              "tensor_parallel_size",
        "pipeline-parallel-size":            "pipeline_parallel_size",
        "max-parallel-loading-workers":      "max_parallel_loading_workers",
        "disable-async-output-proc":         "disable_async_output_proc",
        "worker-class":                      "worker_class",
        "enabled-lora-modules":              "enabled_lora_modules",
        "max-lora-rank":                     "max_lora_rank",
        "fully-sharded-loras":               "fully_sharded_loras",
        "lora-modules":                      "lora_modules",
        "prompt-adapters":                   "prompt_adapters",
        "max-prompt-adapter-token":          "max_prompt_adapter_token",
        "scheduler-delay":                   "scheduler_delay",
        "enable-chunked-prefill":            "enable_chunked_prefill",
        "speculative-model":                 "speculative_model",
        "speculative-model-quantization":    "speculative_model_quantization",
        "speculative-revision":              "speculative_revision",
        "speculative-max-model-len":         "speculative_max_model_len",
        "speculative-disable-by-batch-size": "speculative_disable_by_batch_size",
        "ngpt-speculative-length":           "ngpt_speculative_length",
        "speculative-disable-mqa":           "speculative_disable_mqa",
        "model-loader-extra-config":         "model_loader_extra_config",
        "ignore-patterns":                   "ignore_patterns",
        "preloaded-lora-modules":            "preloaded_lora_modules",

        // Model configuration
        "skip-tokenizer-init":          "skip_tokenizer_init",
        "code-revision":                "code_revision",
        "tokenizer-revision":           "tokenizer_revision",
        "tokenizer-mode":               "tokenizer_mode",
        "trust-remote-code":            "trust_remote_code",
        "download-dir":                 "download_dir",
        "load-format":                  "load_format",
        "config-format":                "config_format",
        "kv-cache-dtype":               "kv_cache_dtype",
        "quantization-param-path":      "quantization_param_path",
        "max-model-len":                "max_model_len",
        "guided-decoding-backend":      "guided_decoding_backend",
        "distributed-executor-backend": "distributed_executor_backend",
        "worker-use-ray":               "worker_use_ray",
        "ray-workers-use-nsight":       "ray_workers_use_nsight",

        // Performance configuration
        "block-size":                  "block_size",
        "enable-prefix-caching":       "enable_prefix_caching",
        "disable-sliding-window":      "disable_sliding_window",
        "use-v2-block-manager":        "use_v2_block_manager",
        "num-lookahead-slots":         "num_lookahead_slots",
        "swap-space":                  "swap_space",
        "cpu-offload-gb":              "cpu_offload_gb",
        "gpu-memory-utilization":      "gpu_memory_utilization",
        "num-gpu-blocks-override":     "num_gpu_blocks_override",
        "max-num-batched-tokens":      "max_num_batched_tokens",
        "max-num-seqs":                "max_num_seqs",
        "max-logprobs":                "max_logprobs",
        "disable-log-stats":           "disable_log_stats",
        "rope-scaling":                "rope_scaling",
        "rope-theta":                  "rope_theta",
        "enforce-eager":               "enforce_eager",
        "max-context-len-to-capture":  "max_context_len_to_capture",
        "max-seq-len-to-capture":      "max_seq_len_to_capture",
        "disable-custom-all-reduce":   "disable_custom_all_reduce",
        "tokenizer-pool-size":         "tokenizer_pool_size",
        "tokenizer-pool-type":         "tokenizer_pool_type",
        "tokenizer-pool-extra-config": "tokenizer_pool_extra_config",
        "enable-lora-bias":            "enable_lora_bias",
        "lora-extra-vocab-size":       "lora_extra_vocab_size",
        "lora-rank":                   "lora_rank",
        "prompt-lookback-distance":    "prompt_lookback_distance",
        "preemption-mode":             "preemption_mode",

        // Server configuration
        "uvicorn-log-level":                "uvicorn_log_level",
        "response-role":                    "response_role",
        "ssl-keyfile":                      "ssl_keyfile",
        "ssl-certfile":                     "ssl_certfile",
        "ssl-ca-certs":                     "ssl_ca_certs",
        "ssl-cert-reqs":                    "ssl_cert_reqs",
        "root-path":                        "root_path",
        "return-tokens-as-token-ids":       "return_tokens_as_token_ids",
        "disable-frontend-multiprocessing": "disable_frontend_multiprocessing",
        "enable-auto-tool-choice":          "enable_auto_tool_choice",
        "tool-call-parser":                 "tool_call_parser",
        "tool-server":                      "tool_server",
        "chat-template":                    "chat_template",
        "chat-template-content-format":     "chat_template_content_format",
        "allow-credentials":                "allow_credentials",
        "allowed-origins":                  "allowed_origins",
        "allowed-methods":                  "allowed_methods",
        "allowed-headers":                  "allowed_headers",
        "api-key":                          "api_key",
        "enable-log-outputs":               "enable_log_outputs",
        "enable-token-usage":               "enable_token_usage",
        "enable-async-engine-debug":        "enable_async_engine_debug",
        "engine-use-ray":                   "engine_use_ray",
        "disable-log-requests":             "disable_log_requests",
        "max-log-len":                      "max_log_len",

        // Additional options
        "multi-modal-config":           "multi_modal_config",
        "limit-mm-per-prompt":          "limit_mm_per_prompt",
        "enable-sleep-mode":            "enable_sleep_mode",
        "enable-chunking-request":      "enable_chunking_request",
        "compilation-config":           "compilation_config",
        "disable-sliding-window-mask":  "disable_sliding_window_mask",
        "enable-trtllm-engine-latency": "enable_trtllm_engine_latency",
        "override-pooling-config":      "override_pooling_config",
        "override-neuron-config":       "override_neuron_config",
        "override-kv-cache-align-size": "override_kv_cache_align_size",
    }

    // Process alternative field names
    for altName, canonicalName := range fieldMappings {
        if value, exists := raw[altName]; exists {
            // Use reflection to set the field value
            v := reflect.ValueOf(o).Elem()
            field := v.FieldByNameFunc(func(fieldName string) bool {
                field, _ := v.Type().FieldByName(fieldName)
                jsonTag := field.Tag.Get("json")
                return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName
            })

            if field.IsValid() && field.CanSet() {
                switch field.Kind() {
                case reflect.Int:
                    if intVal, ok := value.(float64); ok {
                        field.SetInt(int64(intVal))
                    } else if strVal, ok := value.(string); ok {
                        if intVal, err := strconv.Atoi(strVal); err == nil {
                            field.SetInt(int64(intVal))
                        }
                    }
                case reflect.Float64:
                    if floatVal, ok := value.(float64); ok {
                        field.SetFloat(floatVal)
                    } else if strVal, ok := value.(string); ok {
                        if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil {
                            field.SetFloat(floatVal)
                        }
                    }
                case reflect.String:
                    if strVal, ok := value.(string); ok {
                        field.SetString(strVal)
                    }
                case reflect.Bool:
                    if boolVal, ok := value.(bool); ok {
                        field.SetBool(boolVal)
                    }
                case reflect.Slice:
                    if field.Type().Elem().Kind() == reflect.String {
                        if strVal, ok := value.(string); ok {
                            // Split comma-separated values
                            values := strings.Split(strVal, ",")
                            for i, v := range values {
                                values[i] = strings.TrimSpace(v)
                            }
                            field.Set(reflect.ValueOf(values))
                        } else if slice, ok := value.([]interface{}); ok {
                            var strSlice []string
                            for _, item := range slice {
                                if str, ok := item.(string); ok {
                                    strSlice = append(strSlice, str)
                                }
                            }
                            field.Set(reflect.ValueOf(strSlice))
                        }
                    }
                }
            }
        }
    }

    return nil
}

// BuildCommandArgs converts VllmServerOptions to command line arguments
// Note: This does NOT include the "serve" subcommand, that's handled at the instance level
func (o *VllmServerOptions) BuildCommandArgs() []string {
    var args []string

    v := reflect.ValueOf(o).Elem()
    t := v.Type()

    for i := 0; i < v.NumField(); i++ {
        field := v.Field(i)
        fieldType := t.Field(i)

        // Skip unexported fields
        if !field.CanInterface() {
            continue
        }

        // Get the JSON tag to determine the flag name
        jsonTag := fieldType.Tag.Get("json")
        if jsonTag == "" || jsonTag == "-" {
            continue
        }

        // Remove ",omitempty" from the tag
        flagName := jsonTag
        if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 {
            flagName = jsonTag[:commaIndex]
        }

        // Skip host and port as they are handled by llamactl
        if flagName == "host" || flagName == "port" {
            continue
        }

        // Convert snake_case to kebab-case for CLI flags
        flagName = strings.ReplaceAll(flagName, "_", "-")

        // Add the appropriate arguments based on field type and value
        switch field.Kind() {
        case reflect.Bool:
            if field.Bool() {
                args = append(args, "--"+flagName)
            }
        case reflect.Int:
            if field.Int() != 0 {
                args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10))
            }
        case reflect.Float64:
            if field.Float() != 0 {
                args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64))
            }
        case reflect.String:
            if field.String() != "" {
                args = append(args, "--"+flagName, field.String())
            }
        case reflect.Slice:
            if field.Type().Elem().Kind() == reflect.String {
                // Handle []string fields - some are comma-separated, some use multiple flags
                if flagName == "api-key" || flagName == "allowed-origins" || flagName == "allowed-methods" || flagName == "allowed-headers" || flagName == "middleware" {
                    // Multiple flags for these
                    for j := 0; j < field.Len(); j++ {
                        args = append(args, "--"+flagName, field.Index(j).String())
                    }
                } else {
                    // Comma-separated for others
                    if field.Len() > 0 {
                        var values []string
                        for j := 0; j < field.Len(); j++ {
                            values = append(values, field.Index(j).String())
                        }
                        args = append(args, "--"+flagName, strings.Join(values, ","))
                    }
                }
            }
        }
    }

    return args
}
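For illustration only, a small sketch of how the constructor defaults interact with `BuildCommandArgs`; the model name is a placeholder, and the expectations in the comments follow the field handling shown above:

```go
package main

import (
    "fmt"

    "llamactl/pkg/backends/vllm"
)

func main() {
    // Start from the constructor defaults and set a model.
    opts := vllm.NewVllmServerOptions()
    opts.Model = "test-model" // placeholder model name

    args := opts.BuildCommandArgs()
    fmt.Println(args)

    // Based on BuildCommandArgs:
    //  - "host" and "port" are skipped (llamactl assigns them),
    //  - zero values are omitted, while non-zero defaults such as
    //    --gpu-memory-utilization 0.9 and --block-size 16 are emitted,
    //  - []string defaults such as AllowedOrigins ["*"] are emitted as one
    //    flag per element (--allowed-origins "*").
}
```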
pkg/backends/vllm/vllm_test.go (new file, 106 lines)
@@ -0,0 +1,106 @@
package vllm_test

import (
    "encoding/json"
    "llamactl/pkg/backends/vllm"
    "slices"
    "testing"
)

func TestBuildCommandArgs(t *testing.T) {
    options := vllm.VllmServerOptions{
        Model:                "microsoft/DialoGPT-medium",
        Port:                 8080,        // should be excluded
        Host:                 "localhost", // should be excluded
        TensorParallelSize:   2,
        GPUMemoryUtilization: 0.8,
        EnableLogOutputs:     true,
        APIKey:               []string{"key1", "key2"},
    }

    args := options.BuildCommandArgs()

    // Check core functionality
    if !containsFlagWithValue(args, "--model", "microsoft/DialoGPT-medium") {
        t.Errorf("Expected --model microsoft/DialoGPT-medium not found in %v", args)
    }
    if !containsFlagWithValue(args, "--tensor-parallel-size", "2") {
        t.Errorf("Expected --tensor-parallel-size 2 not found in %v", args)
    }
    if !contains(args, "--enable-log-outputs") {
        t.Errorf("Expected --enable-log-outputs not found in %v", args)
    }

    // Host and port should NOT be in the arguments (handled by llamactl)
    if contains(args, "--host") || contains(args, "--port") {
        t.Errorf("Host and port should not be in command args, found in %v", args)
    }

    // Check array handling (multiple flags)
    apiKeyCount := 0
    for i := range args {
        if args[i] == "--api-key" {
            apiKeyCount++
        }
    }
    if apiKeyCount != 2 {
        t.Errorf("Expected 2 --api-key flags, got %d", apiKeyCount)
    }
}

func TestUnmarshalJSON(t *testing.T) {
    // Test both underscore and dash formats
    jsonData := `{
        "model": "test-model",
        "tensor_parallel_size": 4,
        "gpu-memory-utilization": 0.9,
        "enable-log-outputs": true
    }`

    var options vllm.VllmServerOptions
    err := json.Unmarshal([]byte(jsonData), &options)
    if err != nil {
        t.Fatalf("Unmarshal failed: %v", err)
    }

    if options.Model != "test-model" {
        t.Errorf("Expected model 'test-model', got %q", options.Model)
    }
    if options.TensorParallelSize != 4 {
        t.Errorf("Expected tensor_parallel_size 4, got %d", options.TensorParallelSize)
    }
    if options.GPUMemoryUtilization != 0.9 {
        t.Errorf("Expected gpu_memory_utilization 0.9, got %f", options.GPUMemoryUtilization)
    }
    if !options.EnableLogOutputs {
        t.Errorf("Expected enable_log_outputs true, got %v", options.EnableLogOutputs)
    }
}

func TestNewVllmServerOptions(t *testing.T) {
    options := vllm.NewVllmServerOptions()

    if options == nil {
        t.Fatal("NewVllmServerOptions returned nil")
    }
    if options.Host != "127.0.0.1" {
        t.Errorf("Expected default host '127.0.0.1', got %q", options.Host)
    }
    if options.Port != 8000 {
        t.Errorf("Expected default port 8000, got %d", options.Port)
    }
}

// Helper functions
func contains(slice []string, item string) bool {
    return slices.Contains(slice, item)
}

func containsFlagWithValue(args []string, flag, value string) bool {
    for i, arg := range args {
        if arg == flag && i+1 < len(args) && args[i+1] == value {
            return true
        }
    }
    return false
}
pkg/config/config.go
@@ -17,6 +17,9 @@ type BackendConfig struct {

 	// Path to mlx_lm executable (MLX-LM backend)
 	MLXLMExecutable string `yaml:"mlx_lm_executable"`
+
+	// Path to vllm executable (vLLM backend)
+	VllmExecutable string `yaml:"vllm_executable"`
 }

 // AppConfig represents the configuration for llamactl
@@ -122,6 +125,7 @@ func LoadConfig(configPath string) (AppConfig, error) {
 		Backends: BackendConfig{
 			LlamaExecutable: "llama-server",
 			MLXLMExecutable: "mlx_lm.server",
+			VllmExecutable:  "vllm",
 		},
 		Instances: InstancesConfig{
 			PortRange: [2]int{8000, 9000},
@@ -246,6 +250,9 @@ func loadEnvVars(cfg *AppConfig) {
 	if mlxLMExec := os.Getenv("LLAMACTL_MLX_LM_EXECUTABLE"); mlxLMExec != "" {
 		cfg.Backends.MLXLMExecutable = mlxLMExec
 	}
+	if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" {
+		cfg.Backends.VllmExecutable = vllmExec
+	}
 	if autoRestart := os.Getenv("LLAMACTL_DEFAULT_AUTO_RESTART"); autoRestart != "" {
 		if b, err := strconv.ParseBool(autoRestart); err == nil {
 			cfg.Instances.DefaultAutoRestart = b
pkg/instance/lifecycle.go
@@ -52,6 +52,8 @@ func (i *Process) Start() error {
 		executable = i.globalBackendSettings.LlamaExecutable
 	case backends.BackendTypeMlxLm:
 		executable = i.globalBackendSettings.MLXLMExecutable
+	case backends.BackendTypeVllm:
+		executable = i.globalBackendSettings.VllmExecutable
 	default:
 		return fmt.Errorf("unsupported backend type: %s", i.options.BackendType)
 	}
pkg/instance/options.go
@@ -6,6 +6,7 @@ import (
 	"llamactl/pkg/backends"
 	"llamactl/pkg/backends/llamacpp"
 	"llamactl/pkg/backends/mlx"
+	"llamactl/pkg/backends/vllm"
 	"llamactl/pkg/config"
 	"log"
 )
@@ -26,6 +27,7 @@ type CreateInstanceOptions struct {
 	// Backend-specific options
 	LlamaServerOptions *llamacpp.LlamaServerOptions `json:"-"`
 	MlxServerOptions   *mlx.MlxServerOptions        `json:"-"`
+	VllmServerOptions  *vllm.VllmServerOptions      `json:"-"`
 }

 // UnmarshalJSON implements custom JSON unmarshaling for CreateInstanceOptions
@@ -63,12 +65,24 @@ func (c *CreateInstanceOptions) UnmarshalJSON(data []byte) error {
 			if err != nil {
 				return fmt.Errorf("failed to marshal backend options: %w", err)
 			}

 			c.MlxServerOptions = &mlx.MlxServerOptions{}
 			if err := json.Unmarshal(optionsData, c.MlxServerOptions); err != nil {
 				return fmt.Errorf("failed to unmarshal MLX options: %w", err)
 			}
 		}
+	case backends.BackendTypeVllm:
+		if c.BackendOptions != nil {
+			optionsData, err := json.Marshal(c.BackendOptions)
+			if err != nil {
+				return fmt.Errorf("failed to marshal backend options: %w", err)
+			}
+
+			c.VllmServerOptions = &vllm.VllmServerOptions{}
+			if err := json.Unmarshal(optionsData, c.VllmServerOptions); err != nil {
+				return fmt.Errorf("failed to unmarshal vLLM options: %w", err)
+			}
+		}
 	default:
 		return fmt.Errorf("unknown backend type: %s", c.BackendType)
 	}
@@ -114,6 +128,20 @@ func (c *CreateInstanceOptions) MarshalJSON() ([]byte, error) {
 				return nil, fmt.Errorf("failed to unmarshal to map: %w", err)
 			}

+			aux.BackendOptions = backendOpts
+		}
+	case backends.BackendTypeVllm:
+		if c.VllmServerOptions != nil {
+			data, err := json.Marshal(c.VllmServerOptions)
+			if err != nil {
+				return nil, fmt.Errorf("failed to marshal vLLM server options: %w", err)
+			}
+
+			var backendOpts map[string]any
+			if err := json.Unmarshal(data, &backendOpts); err != nil {
+				return nil, fmt.Errorf("failed to unmarshal to map: %w", err)
+			}
+
 			aux.BackendOptions = backendOpts
 		}
 	}
@@ -171,6 +199,13 @@ func (c *CreateInstanceOptions) BuildCommandArgs() []string {
 		if c.MlxServerOptions != nil {
 			return c.MlxServerOptions.BuildCommandArgs()
 		}
+	case backends.BackendTypeVllm:
+		if c.VllmServerOptions != nil {
+			// Prepend "serve" as first argument
+			args := []string{"serve"}
+			args = append(args, c.VllmServerOptions.BuildCommandArgs()...)
+			return args
+		}
 	}
 	return []string{}
 }
@@ -8,6 +8,7 @@ import (
 	"llamactl/pkg/backends"
 	"llamactl/pkg/backends/llamacpp"
 	"llamactl/pkg/backends/mlx"
+	"llamactl/pkg/backends/vllm"
 	"llamactl/pkg/config"
 	"llamactl/pkg/instance"
 	"llamactl/pkg/manager"
@@ -732,7 +733,60 @@ func (h *Handler) ParseMlxCommand() http.HandlerFunc {
 			BackendType:      backendType,
 			MlxServerOptions: mlxOptions,
 		}

+		w.Header().Set("Content-Type", "application/json")
+		if err := json.NewEncoder(w).Encode(options); err != nil {
+			writeError(w, http.StatusInternalServerError, "encode_error", err.Error())
+		}
+	}
+}
+
+// ParseVllmCommand godoc
+// @Summary Parse vllm serve command
+// @Description Parses a vLLM serve command string into instance options
+// @Tags backends
+// @Security ApiKeyAuth
+// @Accept json
+// @Produce json
+// @Param request body ParseCommandRequest true "Command to parse"
+// @Success 200 {object} instance.CreateInstanceOptions "Parsed options"
+// @Failure 400 {object} map[string]string "Invalid request or command"
+// @Router /backends/vllm/parse-command [post]
+func (h *Handler) ParseVllmCommand() http.HandlerFunc {
+	type errorResponse struct {
+		Error   string `json:"error"`
+		Details string `json:"details,omitempty"`
+	}
+	writeError := func(w http.ResponseWriter, status int, code, details string) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(status)
+		_ = json.NewEncoder(w).Encode(errorResponse{Error: code, Details: details})
+	}
+	return func(w http.ResponseWriter, r *http.Request) {
+		var req ParseCommandRequest
+		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+			writeError(w, http.StatusBadRequest, "invalid_request", "Invalid JSON body")
+			return
+		}
+
+		if strings.TrimSpace(req.Command) == "" {
+			writeError(w, http.StatusBadRequest, "invalid_command", "Command cannot be empty")
+			return
+		}
+
+		vllmOptions, err := vllm.ParseVllmCommand(req.Command)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, "parse_error", err.Error())
+			return
+		}
+
+		backendType := backends.BackendTypeVllm
+
+		options := &instance.CreateInstanceOptions{
+			BackendType:       backendType,
+			VllmServerOptions: vllmOptions,
+		}
+
 		w.Header().Set("Content-Type", "application/json")
 		if err := json.NewEncoder(w).Encode(options); err != nil {
 			writeError(w, http.StatusInternalServerError, "encode_error", err.Error())
@@ -58,6 +58,9 @@ func SetupRouter(handler *Handler) *chi.Mux {
 		r.Route("/mlx", func(r chi.Router) {
 			r.Post("/parse-command", handler.ParseMlxCommand())
 		})
+		r.Route("/vllm", func(r chi.Router) {
+			r.Post("/parse-command", handler.ParseVllmCommand())
+		})
 	})

 	// Instance management endpoints
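A hedged sketch of calling the new endpoint from a client. Only the `/backends/vllm/parse-command` suffix is taken from the route and Swagger annotation above; the `command` JSON field name, the `/api/v1` prefix, and the listen address are assumptions that should be checked against the actual `ParseCommandRequest` type and router mount point:

```go
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

func main() {
    // "command" field name is an assumption based on ParseCommandRequest usage.
    body, _ := json.Marshal(map[string]string{
        "command": "vllm serve --model test-model --tensor-parallel-size 2",
    })

    // Base URL and /api/v1 prefix are assumed example values.
    resp, err := http.Post(
        "http://localhost:8080/api/v1/backends/vllm/parse-command",
        "application/json",
        bytes.NewReader(body),
    )
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var parsed map[string]any
    _ = json.NewDecoder(resp.Body).Decode(&parsed)
    fmt.Println(resp.StatusCode, parsed)
}
```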
@@ -46,6 +46,8 @@ func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error {
 		return validateLlamaCppOptions(options)
 	case backends.BackendTypeMlxLm:
 		return validateMlxOptions(options)
+	case backends.BackendTypeVllm:
+		return validateVllmOptions(options)
 	default:
 		return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType))
 	}
@@ -88,6 +90,25 @@ func validateMlxOptions(options *instance.CreateInstanceOptions) error {
 	return nil
 }

+// validateVllmOptions validates vLLM backend specific options
+func validateVllmOptions(options *instance.CreateInstanceOptions) error {
+	if options.VllmServerOptions == nil {
+		return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend"))
+	}
+
+	// Use reflection to check all string fields for injection patterns
+	if err := validateStructStrings(options.VllmServerOptions, ""); err != nil {
+		return err
+	}
+
+	// Basic network validation for port
+	if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 {
+		return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port))
+	}
+
+	return nil
+}
+
 // validateStructStrings recursively validates all string fields in a struct
 func validateStructStrings(v any, fieldPath string) error {
 	val := reflect.ValueOf(v)
vllm_backend_spec.md (new file, 440 lines)
@@ -0,0 +1,440 @@
# vLLM Backend Implementation Specification

## Overview

This specification outlines the implementation of vLLM backend support for llamactl, following the existing patterns established by the llama.cpp and MLX backends.

## 1. Backend Configuration

### Basic Details

- **Backend Type**: `vllm`
- **Executable**: `vllm` (configured via `VllmExecutable`)
- **Subcommand**: `serve` (automatically prepended to arguments)
- **Default Host/Port**: Auto-assigned by llamactl
- **Health Check**: Uses `/health` endpoint (returns HTTP 200 with no content)
- **API Compatibility**: OpenAI-compatible endpoints

### Example Command

```bash
vllm serve --enable-log-outputs --tensor-parallel-size 2 --gpu-memory-utilization 0.5 --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g
```

## 2. File Structure

Following the existing backend pattern:

```
pkg/backends/vllm/
├── vllm.go          # VllmServerOptions struct and methods
├── vllm_test.go     # Unit tests for VllmServerOptions
├── parser.go        # Command parsing logic
└── parser_test.go   # Parser tests
```

## 3. Core Implementation Files

### 3.1 `pkg/backends/vllm/vllm.go`

#### VllmServerOptions Struct

```go
type VllmServerOptions struct {
	// Basic connection options (auto-assigned by llamactl)
	Host string `json:"host,omitempty"`
	Port int    `json:"port,omitempty"`

	// Core model options
	Model string `json:"model,omitempty"`

	// Common serving options
	EnableLogOutputs     bool    `json:"enable_log_outputs,omitempty"`
	TensorParallelSize   int     `json:"tensor_parallel_size,omitempty"`
	GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"`

	// Additional parameters to be added based on vLLM CLI documentation
	// Following the same comprehensive approach as llamacpp.LlamaServerOptions
}
```

#### Required Methods

- `UnmarshalJSON()` - Custom unmarshaling with alternative field name support (dash-to-underscore conversion)
- `BuildCommandArgs()` - Convert struct to command line arguments (excluding "serve" subcommand)
- `NewVllmServerOptions()` - Constructor with vLLM defaults

#### Field Name Mapping

Support both CLI argument names (with dashes) and programmatic names (with underscores), similar to the llama.cpp implementation:

```go
fieldMappings := map[string]string{
	"enable-log-outputs":     "enable_log_outputs",
	"tensor-parallel-size":   "tensor_parallel_size",
	"gpu-memory-utilization": "gpu_memory_utilization",
	// ... other mappings
}
```

### 3.2 `pkg/backends/vllm/parser.go`

#### ParseVllmCommand Function

Following the same pattern as `llamacpp/parser.go` and `mlx/parser.go`:

```go
func ParseVllmCommand(command string) (*VllmServerOptions, error)
```

**Supported Input Formats:**

1. `vllm serve --model MODEL_NAME --other-args`
2. `/path/to/vllm serve --model MODEL_NAME`
3. `serve --model MODEL_NAME --other-args`
4. `--model MODEL_NAME --other-args` (args only)
5. Multiline commands with backslashes

**Implementation Details:**

- Handle "serve" subcommand detection and removal
- Support quoted strings and escaped characters
- Validate command structure
- Convert parsed arguments to `VllmServerOptions`
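As a quick illustration of the field name mapping described in 3.1 (a sketch assuming the `llamactl/pkg/backends/vllm` import path from this commit), both dashed and underscored JSON keys populate the same typed field:

```go
package main

import (
	"encoding/json"
	"fmt"

	"llamactl/pkg/backends/vllm"
)

func main() {
	// Dashed (CLI-style) and underscored keys are both accepted by the
	// custom UnmarshalJSON; both payloads fill TensorParallelSize.
	payloads := []string{
		`{"model": "test-model", "tensor_parallel_size": 4}`,
		`{"model": "test-model", "tensor-parallel-size": 4}`,
	}

	for _, p := range payloads {
		var opts vllm.VllmServerOptions
		if err := json.Unmarshal([]byte(p), &opts); err != nil {
			panic(err)
		}
		fmt.Println(opts.Model, opts.TensorParallelSize) // test-model 4
	}
}
```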
## 4. Backend Integration

### 4.1 Backend Type Definition

**File**: `pkg/backends/backend.go`

```go
const (
	BackendTypeLlamaCpp BackendType = "llama_cpp"
	BackendTypeMlxLm    BackendType = "mlx_lm"
	BackendTypeVllm     BackendType = "vllm" // ADD THIS
)
```

### 4.2 Configuration Integration

**File**: `pkg/config/config.go`

#### BackendConfig Update

```go
type BackendConfig struct {
	LlamaExecutable string `yaml:"llama_executable"`
	MLXLMExecutable string `yaml:"mlx_lm_executable"`
	VllmExecutable  string `yaml:"vllm_executable"` // ADD THIS
}
```
#### Default Configuration

- **Default Value**: `"vllm"`
- **Environment Variable**: `LLAMACTL_VLLM_EXECUTABLE`

A sketch of where this default could be wired in follows below.
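The default would be set wherever the default `BackendConfig` is constructed; the helper name below is hypothetical, and the llama.cpp/MLX defaults are taken from the YAML example in section 12.1:

```go
func defaultBackendConfig() BackendConfig {
	return BackendConfig{
		LlamaExecutable: "llama-server",
		MLXLMExecutable: "mlx_lm.server",
		VllmExecutable:  "vllm", // new default for the vLLM backend
	}
}
```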
#### Environment Variable Loading

Add to the `loadEnvVars()` function:

```go
if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" {
	cfg.Backends.VllmExecutable = vllmExec
}
```

### 4.3 Instance Options Integration

**File**: `pkg/instance/options.go`

#### CreateInstanceOptions Update

```go
type CreateInstanceOptions struct {
	// existing fields...
	VllmServerOptions *vllm.VllmServerOptions `json:"-"`
}
```
#### JSON Marshaling/Unmarshaling

Update the `UnmarshalJSON()` and `MarshalJSON()` methods to handle the vLLM backend in the same way as the existing backends; a hedged sketch of the unmarshaling side is shown below.
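A sketch of the vLLM case inside the custom `UnmarshalJSON()`; the surrounding plumbing (the `rawBackendOptions` map and alias decoding) is assumed to mirror what the llama.cpp and MLX backends already do:

```go
// Inside CreateInstanceOptions.UnmarshalJSON, after backend_type and the raw
// backend_options map have been decoded (names here are assumptions):
case backends.BackendTypeVllm:
	optsData, err := json.Marshal(rawBackendOptions)
	if err != nil {
		return err
	}
	var vllmOpts vllm.VllmServerOptions
	if err := json.Unmarshal(optsData, &vllmOpts); err != nil {
		return err
	}
	c.VllmServerOptions = &vllmOpts
```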
#### BuildCommandArgs Implementation

```go
case backends.BackendTypeVllm:
	if c.VllmServerOptions != nil {
		// Prepend "serve" as the first argument
		args := []string{"serve"}
		args = append(args, c.VllmServerOptions.BuildCommandArgs()...)
		return args
	}
```

**Key Point**: The "serve" subcommand is handled at the instance options level, keeping the `VllmServerOptions.BuildCommandArgs()` method focused only on vLLM-specific parameters.

## 5. Health Check Integration

### 5.1 Standard Health Check for vLLM

**File**: `pkg/instance/lifecycle.go`

vLLM provides a standard `/health` endpoint that returns HTTP 200 with no content, so no modifications are needed to the existing health check logic. The current `WaitForHealthy()` method will work as-is:

```go
healthURL := fmt.Sprintf("http://%s:%d/health", host, port)
```
### 5.2 Startup Time Considerations

- vLLM typically has longer startup times compared to llama.cpp
- The existing configurable timeout system should handle this adequately
- Users may need to adjust `on_demand_start_timeout` for larger models (see the example below)
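For instance, a configuration bump might look like the following; the exact key placement and unit are assumptions based on the configuration examples in section 12:

```yaml
instances:
  # Give large vLLM models more time to load before on-demand start gives up
  on_demand_start_timeout: 300  # value and unit (seconds) are illustrative
```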
## 6. Lifecycle Integration

### 6.1 Executable Selection

**File**: `pkg/instance/lifecycle.go`

Update the `Start()` method to handle the vLLM executable:

```go
switch i.options.BackendType {
case backends.BackendTypeLlamaCpp:
	executable = i.globalBackendSettings.LlamaExecutable
case backends.BackendTypeMlxLm:
	executable = i.globalBackendSettings.MLXLMExecutable
case backends.BackendTypeVllm: // ADD THIS
	executable = i.globalBackendSettings.VllmExecutable
default:
	return fmt.Errorf("unsupported backend type: %s", i.options.BackendType)
}

args := i.options.BuildCommandArgs()
i.cmd = exec.CommandContext(i.ctx, executable, args...)
```

### 6.2 Command Execution

The final executed command will be:

```bash
vllm serve --model MODEL_NAME --other-vllm-args
```

Where:

- `vllm` comes from the `VllmExecutable` configuration
- `serve` is prepended by `BuildCommandArgs()`
- The remaining args come from `VllmServerOptions.BuildCommandArgs()`
## 7. Server Handler Integration

### 7.1 New Handler Method

**File**: `pkg/server/handlers.go`

```go
// ParseVllmCommand godoc
// @Summary Parse vllm serve command
// @Description Parses a vLLM serve command string into instance options
// @Tags backends
// @Security ApiKeyAuth
// @Accept json
// @Produce json
// @Param request body ParseCommandRequest true "Command to parse"
// @Success 200 {object} instance.CreateInstanceOptions "Parsed options"
// @Failure 400 {object} map[string]string "Invalid request or command"
// @Router /backends/vllm/parse-command [post]
func (h *Handler) ParseVllmCommand() http.HandlerFunc {
	// Implementation similar to ParseMlxCommand()
	// Uses vllm.ParseVllmCommand() internally
}
```
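A fuller sketch of the handler body using only standard-library helpers; `ParseCommandRequest` is assumed to expose a single `Command` string field, and the project's own error/JSON response helpers may differ:

```go
func (h *Handler) ParseVllmCommand() http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req ParseCommandRequest
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			http.Error(w, "invalid request body", http.StatusBadRequest)
			return
		}

		// Delegate the actual parsing to the backend package.
		vllmOptions, err := vllm.ParseVllmCommand(req.Command)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		options := &instance.CreateInstanceOptions{
			BackendType:       backends.BackendTypeVllm,
			VllmServerOptions: vllmOptions,
		}

		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(options); err != nil {
			http.Error(w, "failed to encode response", http.StatusInternalServerError)
		}
	}
}
```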
### 7.2 Router Integration

**File**: `pkg/server/routes.go`

Add the vLLM route:

```go
r.Route("/backends", func(r chi.Router) {
	r.Route("/llama-cpp", func(r chi.Router) {
		r.Post("/parse-command", handler.ParseLlamaCommand())
	})
	r.Route("/mlx", func(r chi.Router) {
		r.Post("/parse-command", handler.ParseMlxCommand())
	})
	r.Route("/vllm", func(r chi.Router) { // ADD THIS
		r.Post("/parse-command", handler.ParseVllmCommand())
	})
})
```
## 8. Validation Integration

### 8.1 Instance Options Validation

**File**: `pkg/validation/validation.go`

Add the vLLM validation case:

```go
func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error {
	// existing validation...

	switch options.BackendType {
	case backends.BackendTypeLlamaCpp:
		return validateLlamaCppOptions(options)
	case backends.BackendTypeMlxLm:
		return validateMlxOptions(options)
	case backends.BackendTypeVllm: // ADD THIS
		return validateVllmOptions(options)
	default:
		return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType))
	}
}

func validateVllmOptions(options *instance.CreateInstanceOptions) error {
	if options.VllmServerOptions == nil {
		return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend"))
	}

	// Basic validation following the same pattern as other backends
	if err := validateStructStrings(options.VllmServerOptions, ""); err != nil {
		return err
	}

	// Port validation
	if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 {
		return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port))
	}

	return nil
}
```
## 9. Testing Strategy

### 9.1 Unit Tests

- **`vllm_test.go`**: Test `VllmServerOptions` marshaling/unmarshaling and `BuildCommandArgs()`
- **`parser_test.go`**: Test command parsing for various formats
- **Integration tests**: Mock vLLM commands and validate parsing
### 9.2 Test Cases

```go
func TestBuildCommandArgs_VllmBasic(t *testing.T) {
	options := VllmServerOptions{
		Model:              "microsoft/DialoGPT-medium",
		Port:               8080,
		Host:               "localhost",
		EnableLogOutputs:   true,
		TensorParallelSize: 2,
	}

	args := options.BuildCommandArgs()
	// Validate expected arguments (excluding "serve")
}

func TestParseVllmCommand_FullCommand(t *testing.T) {
	command := "vllm serve --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g --tensor-parallel-size 2"
	result, err := ParseVllmCommand(command)
	// Validate parsing results
}
```
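To make the placeholder assertions concrete, one possible shape is sketched below; whether `BuildCommandArgs()` emits dashed or underscored flag names is an open implementation choice, so the expected values are assumptions:

```go
// containsArg reports whether args includes the exact token want.
func containsArg(args []string, want string) bool {
	for _, a := range args {
		if a == want {
			return true
		}
	}
	return false
}

func TestBuildCommandArgs_NoServeSubcommand(t *testing.T) {
	options := VllmServerOptions{
		Model:              "microsoft/DialoGPT-medium",
		TensorParallelSize: 2,
	}
	args := options.BuildCommandArgs()

	// "serve" is prepended at the instance options level, never here.
	if containsArg(args, "serve") {
		t.Errorf("BuildCommandArgs() must not emit the serve subcommand, got %v", args)
	}
	if !containsArg(args, "--tensor-parallel-size") {
		t.Errorf("expected --tensor-parallel-size in %v", args)
	}
}
```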
## 10. Example Usage

### 10.1 Parse Existing vLLM Command

```bash
curl -X POST http://localhost:8080/api/v1/backends/vllm/parse-command \
  -H "Authorization: Bearer your-management-key" \
  -H "Content-Type: application/json" \
  -d '{
    "command": "vllm serve --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g --tensor-parallel-size 2 --gpu-memory-utilization 0.5"
  }'
```

### 10.2 Create vLLM Instance

```bash
curl -X POST http://localhost:8080/api/v1/instances/my-vllm-model \
  -H "Authorization: Bearer your-management-key" \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
      "model": "ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g",
      "tensor_parallel_size": 2,
      "gpu_memory_utilization": 0.5,
      "enable_log_outputs": true
    }
  }'
```

### 10.3 Use via OpenAI-Compatible API

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer your-inference-key" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-vllm-model",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```
## 11. Implementation Checklist

### Phase 1: Core Backend
- [ ] Create `pkg/backends/vllm/vllm.go`
- [ ] Implement `VllmServerOptions` struct with basic fields
- [ ] Implement `BuildCommandArgs()`, `UnmarshalJSON()`, `MarshalJSON()`
- [ ] Add comprehensive field mappings for CLI args
- [ ] Create unit tests for `VllmServerOptions`

### Phase 2: Command Parsing
- [ ] Create `pkg/backends/vllm/parser.go`
- [ ] Implement `ParseVllmCommand()` function
- [ ] Handle various command input formats
- [ ] Create comprehensive parser tests
- [ ] Test edge cases and error conditions

### Phase 3: Integration
- [ ] Add `BackendTypeVllm` to `pkg/backends/backend.go`
- [ ] Update `BackendConfig` in `pkg/config/config.go`
- [ ] Add environment variable support
- [ ] Update `CreateInstanceOptions` in `pkg/instance/options.go`
- [ ] Implement `BuildCommandArgs()` with "serve" prepending

### Phase 4: Lifecycle & Health Checks
- [ ] Update executable selection in `pkg/instance/lifecycle.go`
- [ ] Test instance startup and health checking (uses existing `/health` endpoint)
- [ ] Validate command execution flow

### Phase 5: API Integration
- [ ] Add `ParseVllmCommand()` handler in `pkg/server/handlers.go`
- [ ] Add vLLM route in `pkg/server/routes.go`
- [ ] Update validation in `pkg/validation/validation.go`
- [ ] Test API endpoints

### Phase 6: Testing & Documentation
- [ ] Create comprehensive integration tests
- [ ] Test with actual vLLM installation (if available)
- [ ] Update documentation
- [ ] Test OpenAI-compatible proxy functionality
## 12. Configuration Examples

### 12.1 YAML Configuration

```yaml
backends:
  llama_executable: "llama-server"
  mlx_lm_executable: "mlx_lm.server"
  vllm_executable: "vllm"

instances:
  # ... other instance settings
```

### 12.2 Environment Variables

```bash
export LLAMACTL_VLLM_EXECUTABLE="vllm"
# OR for custom installation
export LLAMACTL_VLLM_EXECUTABLE="python -m vllm"
# OR for containerized deployment
export LLAMACTL_VLLM_EXECUTABLE="docker run --rm --gpus all vllm/vllm-openai"
```
## 13. Notes and Considerations

### 13.1 Startup Time
- vLLM instances may take significantly longer to start than llama.cpp
- Consider documenting recommended timeout values
- The configurable `on_demand_start_timeout` should accommodate this

### 13.2 Resource Usage
- vLLM typically requires substantial GPU memory
- No special handling needed in llamactl (follows existing pattern)
- Resource management is left to the user/administrator

### 13.3 Model Compatibility
- Primarily designed for HuggingFace models
- Supports various quantization formats (GPTQ, AWQ, etc.)
- Model path validation can be basic (similar to other backends)

### 13.4 Future Enhancements
- Consider adding vLLM-specific parameter validation
- Could add model download/caching features
- May want to add vLLM version detection capabilities

This specification provides a comprehensive roadmap for implementing vLLM backend support while maintaining consistency with the existing llamactl architecture.