Initial vLLM backend support

2025-12-23 09:34:23 +00:00 · 2025-09-19 18:05:12 +02:00
parent 02fdae24ee
commit 4df02a6519
12 changed files with 1495 additions and 2 deletions
--- a/pkg/backends/backend.go
+++ b/pkg/backends/backend.go
@@ -5,5 +5,6 @@ type BackendType string
 const (
 	BackendTypeLlamaCpp BackendType = "llama_cpp"
 	BackendTypeMlxLm    BackendType = "mlx_lm"
+	// BackendTypeVllm identifies the vLLM backend.
+	BackendTypeVllm     BackendType = "vllm"
 	// BackendTypeMlxVlm BackendType = "mlx_vlm"  // Future expansion
 )
--- a/pkg/backends/vllm/parser.go
+++ b/pkg/backends/vllm/parser.go
@@ -0,0 +1,302 @@
+package vllm
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"path/filepath"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// ParseVllmCommand parses a vLLM serve command string into VllmServerOptions.
+//
+// Supported input formats:
+//  1. Full command: "vllm serve --model MODEL_NAME --other-args"
+//  2. Full path: "/usr/local/bin/vllm serve --model MODEL_NAME"
+//  3. Serve only: "serve --model MODEL_NAME --other-args"
+//  4. Args only: "--model MODEL_NAME --other-args"
+//  5. Multiline commands with backslash line continuations
+//
+// Flags may be written as "--flag value" or "--flag=value"; a flag without a
+// value is stored as boolean true. Flag names are converted to snake_case so
+// they match the JSON tags of VllmServerOptions. Tokens that do not start with
+// "-" and are not consumed as a flag value are skipped.
+//
+// An error is returned for an empty command, an unterminated quote, a flag
+// with three or more leading dashes, or a value whose type does not fit the
+// corresponding VllmServerOptions field.
+func ParseVllmCommand(command string) (*VllmServerOptions, error) {
+	// 1. Normalize the command - handle multiline with backslashes
+	trimmed := normalizeMultilineCommand(command)
+	if trimmed == "" {
+		return nil, fmt.Errorf("command cannot be empty")
+	}
+
+	// 2. Extract arguments from command
+	args, err := extractArgumentsFromCommand(trimmed)
+	if err != nil {
+		return nil, err
+	}
+
+	// 3. Parse arguments into a map keyed by snake_case flag name
+	options := make(map[string]any)
+
+	// Flags that are always collected into a string slice (snake_case form).
+	// Only flags backed by a []string field belong here: a slice is marshaled
+	// as a JSON array, which cannot be decoded into a scalar field.
+	multiValued := map[string]struct{}{
+		"middleware":      {},
+		"api_key":         {},
+		"allowed_origins": {},
+		"allowed_methods": {},
+		"allowed_headers": {},
+	}
+
+	i := 0
+	for i < len(args) {
+		arg := args[i]
+
+		if !strings.HasPrefix(arg, "-") { // skip positional / stray values
+			i++
+			continue
+		}
+
+		// Reject malformed flags with more than two leading dashes (e.g. ---model) to surface user mistakes
+		if strings.HasPrefix(arg, "---") {
+			return nil, fmt.Errorf("malformed flag: %s", arg)
+		}
+
+		// Unified parsing for --flag=value vs --flag value.
+		// In the equals form the value may legitimately be the empty string.
+		rawFlag, rawValue, hasEquals := strings.Cut(arg, "=")
+
+		flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-")
+		flagName := strings.ReplaceAll(flagCore, "-", "_")
+
+		// In the space-separated form the next token is the value unless it is
+		// itself a flag (negative numbers count as values, see isFlag).
+		valueProvided := hasEquals
+		consumed := 1
+		if !hasEquals && i+1 < len(args) && !isFlag(args[i+1]) {
+			rawValue = args[i+1]
+			valueProvided = true
+			consumed = 2
+		}
+		i += consumed
+
+		// Boolean flag (no value)
+		if !valueProvided {
+			options[flagName] = true
+			continue
+		}
+
+		_, isMulti := multiValued[flagName]
+		existing, seen := options[flagName]
+
+		switch {
+		case seen:
+			// Repeated flag: accumulate every occurrence in a string slice,
+			// converting an earlier scalar value if necessary.
+			if slice, ok := existing.([]string); ok {
+				options[flagName] = append(slice, trimMatchingQuotes(rawValue))
+			} else {
+				options[flagName] = []string{fmt.Sprintf("%v", existing), trimMatchingQuotes(rawValue)}
+			}
+		case isMulti:
+			// Multi-valued flags stay strings (no bool/number conversion), but
+			// the quotes preserved by the tokenizer must not leak into the value.
+			options[flagName] = []string{trimMatchingQuotes(rawValue)}
+		default:
+			// Scalar flag: convert to the most specific type
+			options[flagName] = parseValue(rawValue)
+		}
+	}
+
+	// 4. Convert to VllmServerOptions using existing UnmarshalJSON
+	jsonData, err := json.Marshal(options)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal parsed options: %w", err)
+	}
+
+	var vllmOptions VllmServerOptions
+	if err := json.Unmarshal(jsonData, &vllmOptions); err != nil {
+		return nil, fmt.Errorf("failed to parse command options: %w", err)
+	}
+
+	// 5. Return VllmServerOptions
+	return &vllmOptions, nil
+}
+
+// trimMatchingQuotes removes one pair of matching surrounding single or double
+// quotes from s, if present, and returns s unchanged otherwise.
+func trimMatchingQuotes(s string) string {
+	if n := len(s); n >= 2 && (s[0] == '"' || s[0] == '\'') && s[n-1] == s[0] {
+		return s[1 : n-1]
+	}
+	return s
+}
+
+// parseValue converts a raw flag value into the most specific JSON-compatible
+// Go type that can represent it.
+//
+// One pair of matching surrounding quotes (single or double) is stripped
+// first. The remainder is then interpreted, in order, as a bool
+// (case-insensitive "true"/"false"), an int, and a float64; if none of these
+// apply the unquoted string is returned.
+//
+// Numeric parsing is only attempted when the value contains a decimal digit.
+// strconv.ParseFloat also accepts the words "inf", "infinity" and "nan", which
+// would yield non-finite floats that encoding/json refuses to marshal, so such
+// values are deliberately kept as strings.
+func parseValue(value string) any {
+	// Surrounding matching quotes (single or double)
+	if l := len(value); l >= 2 {
+		first, last := value[0], value[l-1]
+		if first == last && (first == '"' || first == '\'') {
+			value = value[1 : l-1]
+		}
+	}
+
+	switch strings.ToLower(value) {
+	case "true":
+		return true
+	case "false":
+		return false
+	}
+
+	// Every finite number literal contains at least one digit; the textual
+	// specials accepted by ParseFloat do not.
+	if !strings.ContainsAny(value, "0123456789") {
+		return value
+	}
+
+	if intVal, err := strconv.Atoi(value); err == nil {
+		return intVal
+	}
+	if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
+		return floatVal
+	}
+	return value
+}
+
+// Patterns used by normalizeMultilineCommand. They are compiled once at
+// package initialization instead of on every call.
+var (
+	// lineContinuationRe matches a backslash, optional whitespace, a newline
+	// and any whitespace that follows it.
+	lineContinuationRe = regexp.MustCompile(`\\\s*\n\s*`)
+	// whitespaceRunRe matches one or more consecutive whitespace characters.
+	whitespaceRunRe = regexp.MustCompile(`\s+`)
+)
+
+// normalizeMultilineCommand flattens a possibly multi-line command into a
+// single line: backslash line continuations are replaced by a space, every run
+// of whitespace (including bare newlines) is collapsed to one space, and
+// leading/trailing whitespace is removed. Whitespace-only input yields "".
+func normalizeMultilineCommand(command string) string {
+	joined := lineContinuationRe.ReplaceAllString(command, " ")
+	collapsed := whitespaceRunRe.ReplaceAllString(joined, " ")
+	return strings.TrimSpace(collapsed)
+}
+
+// extractArgumentsFromCommand tokenizes command and returns only the argument
+// tokens, dropping a leading executable and/or "serve" subcommand if present.
+//
+// The first token decides how the command is interpreted:
+//   - it starts with "-": the command is arguments only and is returned as is.
+//     This check runs first so that a leading flag whose value contains a path
+//     separator (e.g. "--model=org/name") is never mistaken for an executable.
+//   - it is a path or a name ending in "vllm": it is the executable.
+//   - it is "serve": only the subcommand is dropped.
+//   - anything else is assumed to be an executable with a different name.
+//
+// After an executable, a following "serve" token (case-insensitive) is dropped
+// as well. An error is returned if tokenizing fails or yields no tokens.
+func extractArgumentsFromCommand(command string) ([]string, error) {
+	// Split command into tokens respecting quotes
+	tokens, err := splitCommandTokens(command)
+	if err != nil {
+		return nil, err
+	}
+
+	if len(tokens) == 0 {
+		return nil, fmt.Errorf("no command tokens found")
+	}
+
+	firstToken := tokens[0]
+
+	// Case 1: Arguments only (starts with a flag)
+	if strings.HasPrefix(firstToken, "-") {
+		return tokens, nil // Return all tokens as arguments
+	}
+
+	// Case 2: Full path to executable (contains path separator or ends with vllm)
+	if strings.Contains(firstToken, string(filepath.Separator)) ||
+		strings.HasSuffix(filepath.Base(firstToken), "vllm") {
+		// Check if second token is "serve"
+		if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
+			return tokens[2:], nil // Return everything except executable and serve
+		}
+		return tokens[1:], nil // Return everything except the executable
+	}
+
+	// Case 3: Just "vllm" command
+	if strings.ToLower(firstToken) == "vllm" {
+		// Check if second token is "serve"
+		if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
+			return tokens[2:], nil // Return everything except vllm and serve
+		}
+		return tokens[1:], nil // Return everything except vllm
+	}
+
+	// Case 4: Just "serve" command
+	if strings.ToLower(firstToken) == "serve" {
+		return tokens[1:], nil // Return everything except serve
+	}
+
+	// Case 5: Unknown format - might be a different executable name
+	// Be permissive and assume it's the executable
+	if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
+		return tokens[2:], nil // Return everything except executable and serve
+	}
+	return tokens[1:], nil
+}
+
+// splitCommandTokens breaks command into whitespace-separated tokens.
+//
+// Spaces and tabs separate tokens only outside of quotes. Single and double
+// quotes group text into one token, and the quote characters themselves are
+// kept in the token. A backslash makes the following byte literal; the
+// backslash is kept in the token as well. An error is returned when a quote is
+// still open at the end of the input.
+func splitCommandTokens(command string) ([]string, error) {
+	var (
+		tokens         []string
+		buf            strings.Builder
+		openQuote      byte // quote character currently open, 0 when outside quotes
+		afterBackslash bool // true when the previous byte was an unescaped backslash
+	)
+
+	// flush finishes the token being built, if any.
+	flush := func() {
+		if buf.Len() > 0 {
+			tokens = append(tokens, buf.String())
+			buf.Reset()
+		}
+	}
+
+	for idx := 0; idx < len(command); idx++ {
+		ch := command[idx]
+
+		switch {
+		case afterBackslash:
+			// Escaped byte: always part of the current token.
+			afterBackslash = false
+			buf.WriteByte(ch)
+		case ch == '\\':
+			afterBackslash = true
+			buf.WriteByte(ch)
+		case openQuote == 0 && (ch == '"' || ch == '\''):
+			openQuote = ch
+			buf.WriteByte(ch)
+		case openQuote != 0 && ch == openQuote:
+			openQuote = 0
+			buf.WriteByte(ch)
+		case openQuote == 0 && (ch == ' ' || ch == '\t'):
+			flush()
+		default:
+			buf.WriteByte(ch)
+		}
+	}
+
+	if openQuote != 0 {
+		return nil, errors.New("unterminated quoted string")
+	}
+
+	flush()
+	return tokens, nil
+}
+
+// isFlag reports whether arg should be treated as a command line flag rather
+// than as a value. Anything starting with "-" is a flag, except tokens that
+// parse as a number (e.g. "-1" or "-0.5"), which are treated as values.
+func isFlag(arg string) bool {
+	if !strings.HasPrefix(arg, "-") {
+		return false
+	}
+
+	// A dash-prefixed token that parses as a float is a negative number.
+	_, parseErr := strconv.ParseFloat(arg, 64)
+	return parseErr != nil
+}
--- a/pkg/backends/vllm/parser_test.go
+++ b/pkg/backends/vllm/parser_test.go
@@ -0,0 +1,83 @@
+package vllm
+
+import (
+	"testing"
+)
+
+// TestParseVllmCommand checks which command formats ParseVllmCommand accepts
+// and which ones it rejects with an error.
+func TestParseVllmCommand(t *testing.T) {
+	cases := []struct {
+		name      string
+		command   string
+		expectErr bool
+	}{
+		{name: "basic vllm serve command", command: "vllm serve --model microsoft/DialoGPT-medium"},
+		{name: "serve only command", command: "serve --model microsoft/DialoGPT-medium"},
+		{name: "args only", command: "--model microsoft/DialoGPT-medium --tensor-parallel-size 2"},
+		{name: "empty command", command: "", expectErr: true},
+		{name: "unterminated quote", command: `vllm serve --model "unterminated`, expectErr: true},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			result, err := ParseVllmCommand(tc.command)
+
+			switch {
+			case tc.expectErr && err == nil:
+				t.Errorf("expected error but got none")
+			case tc.expectErr:
+				// Failed as expected; the result is not inspected.
+			case err != nil:
+				t.Errorf("unexpected error: %v", err)
+			case result == nil:
+				t.Errorf("expected result but got nil")
+			}
+		})
+	}
+}
+
+// TestParseVllmCommandValues verifies that string, int, float and boolean
+// flags end up in the matching VllmServerOptions fields.
+func TestParseVllmCommandValues(t *testing.T) {
+	const command = "vllm serve --model test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs"
+
+	opts, err := ParseVllmCommand(command)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if got := opts.Model; got != "test-model" {
+		t.Errorf("expected model 'test-model', got '%s'", got)
+	}
+	if got := opts.TensorParallelSize; got != 4 {
+		t.Errorf("expected tensor_parallel_size 4, got %d", got)
+	}
+	if got := opts.GPUMemoryUtilization; got != 0.8 {
+		t.Errorf("expected gpu_memory_utilization 0.8, got %f", got)
+	}
+	if got := opts.EnableLogOutputs; !got {
+		t.Errorf("expected enable_log_outputs true, got %v", got)
+	}
+}
--- a/pkg/backends/vllm/vllm.go
+++ b/pkg/backends/vllm/vllm.go
@@ -0,0 +1,439 @@
+package vllm
+
+import (
+	"encoding/json"
+	"reflect"
+	"strconv"
+	"strings"
+)
+
+// VllmServerOptions holds the configurable options of a vLLM server.
+//
+// The JSON tags use snake_case names. BuildCommandArgs derives the CLI flag
+// names from these tags (converting "_" to "-") and emits the fields in
+// declaration order, so reordering fields changes the generated command line.
+// UnmarshalJSON additionally accepts the dashed spelling of the keys.
+type VllmServerOptions struct {
+	// Basic connection options (auto-assigned by llamactl)
+	Host string `json:"host,omitempty"`
+	Port int    `json:"port,omitempty"`
+
+	// Model and engine configuration
+	Model                      string   `json:"model,omitempty"`
+	Tokenizer                  string   `json:"tokenizer,omitempty"`
+	SkipTokenizerInit          bool     `json:"skip_tokenizer_init,omitempty"`
+	Revision                   string   `json:"revision,omitempty"`
+	CodeRevision               string   `json:"code_revision,omitempty"`
+	TokenizerRevision          string   `json:"tokenizer_revision,omitempty"`
+	TokenizerMode              string   `json:"tokenizer_mode,omitempty"`
+	TrustRemoteCode            bool     `json:"trust_remote_code,omitempty"`
+	DownloadDir                string   `json:"download_dir,omitempty"`
+	LoadFormat                 string   `json:"load_format,omitempty"`
+	ConfigFormat               string   `json:"config_format,omitempty"`
+	Dtype                      string   `json:"dtype,omitempty"`
+	KVCacheDtype               string   `json:"kv_cache_dtype,omitempty"`
+	QuantizationParamPath      string   `json:"quantization_param_path,omitempty"`
+	Seed                       int      `json:"seed,omitempty"`
+	MaxModelLen                int      `json:"max_model_len,omitempty"`
+	GuidedDecodingBackend      string   `json:"guided_decoding_backend,omitempty"`
+	DistributedExecutorBackend string   `json:"distributed_executor_backend,omitempty"`
+	WorkerUseRay               bool     `json:"worker_use_ray,omitempty"`
+	RayWorkersUseNSight        bool     `json:"ray_workers_use_nsight,omitempty"`
+
+	// Performance and serving configuration
+	BlockSize                    int     `json:"block_size,omitempty"`
+	EnablePrefixCaching          bool    `json:"enable_prefix_caching,omitempty"`
+	DisableSlidingWindow         bool    `json:"disable_sliding_window,omitempty"`
+	UseV2BlockManager            bool    `json:"use_v2_block_manager,omitempty"`
+	NumLookaheadSlots            int     `json:"num_lookahead_slots,omitempty"`
+	SwapSpace                    int     `json:"swap_space,omitempty"`
+	CPUOffloadGB                 int     `json:"cpu_offload_gb,omitempty"`
+	GPUMemoryUtilization         float64 `json:"gpu_memory_utilization,omitempty"`
+	NumGPUBlocksOverride         int     `json:"num_gpu_blocks_override,omitempty"`
+	MaxNumBatchedTokens          int     `json:"max_num_batched_tokens,omitempty"`
+	MaxNumSeqs                   int     `json:"max_num_seqs,omitempty"`
+	MaxLogprobs                  int     `json:"max_logprobs,omitempty"`
+	DisableLogStats              bool    `json:"disable_log_stats,omitempty"`
+	Quantization                 string  `json:"quantization,omitempty"`
+	RopeScaling                  string  `json:"rope_scaling,omitempty"`
+	RopeTheta                    float64 `json:"rope_theta,omitempty"`
+	EnforceEager                 bool    `json:"enforce_eager,omitempty"`
+	MaxContextLenToCapture       int     `json:"max_context_len_to_capture,omitempty"`
+	MaxSeqLenToCapture           int     `json:"max_seq_len_to_capture,omitempty"`
+	DisableCustomAllReduce       bool    `json:"disable_custom_all_reduce,omitempty"`
+	TokenizerPoolSize            int     `json:"tokenizer_pool_size,omitempty"`
+	TokenizerPoolType            string  `json:"tokenizer_pool_type,omitempty"`
+	TokenizerPoolExtraConfig     string  `json:"tokenizer_pool_extra_config,omitempty"`
+	EnableLoraBias               bool    `json:"enable_lora_bias,omitempty"`
+	LoraExtraVocabSize           int     `json:"lora_extra_vocab_size,omitempty"`
+	LoraRank                     int     `json:"lora_rank,omitempty"`
+	PromptLookbackDistance       int     `json:"prompt_lookback_distance,omitempty"`
+	PreemptionMode               string  `json:"preemption_mode,omitempty"`
+
+	// Distributed and parallel processing
+	TensorParallelSize             int    `json:"tensor_parallel_size,omitempty"`
+	PipelineParallelSize           int    `json:"pipeline_parallel_size,omitempty"`
+	MaxParallelLoadingWorkers      int    `json:"max_parallel_loading_workers,omitempty"`
+	DisableAsyncOutputProc         bool   `json:"disable_async_output_proc,omitempty"`
+	WorkerClass                    string `json:"worker_class,omitempty"`
+	EnabledLoraModules             string `json:"enabled_lora_modules,omitempty"`
+	MaxLoraRank                    int    `json:"max_lora_rank,omitempty"`
+	FullyShardedLoras              bool   `json:"fully_sharded_loras,omitempty"`
+	LoraModules                    string `json:"lora_modules,omitempty"`
+	PromptAdapters                 string `json:"prompt_adapters,omitempty"`
+	MaxPromptAdapterToken          int    `json:"max_prompt_adapter_token,omitempty"`
+	Device                         string `json:"device,omitempty"`
+	SchedulerDelay                 float64 `json:"scheduler_delay,omitempty"`
+	EnableChunkedPrefill           bool   `json:"enable_chunked_prefill,omitempty"`
+	SpeculativeModel               string `json:"speculative_model,omitempty"`
+	SpeculativeModelQuantization   string `json:"speculative_model_quantization,omitempty"`
+	SpeculativeRevision            string `json:"speculative_revision,omitempty"`
+	SpeculativeMaxModelLen         int    `json:"speculative_max_model_len,omitempty"`
+	SpeculativeDisableByBatchSize  int    `json:"speculative_disable_by_batch_size,omitempty"`
+	NgptSpeculativeLength          int    `json:"ngpt_speculative_length,omitempty"`
+	SpeculativeDisableMqa          bool   `json:"speculative_disable_mqa,omitempty"`
+	ModelLoaderExtraConfig         string `json:"model_loader_extra_config,omitempty"`
+	IgnorePatterns                 string `json:"ignore_patterns,omitempty"`
+	PreloadedLoraModules           string `json:"preloaded_lora_modules,omitempty"`
+
+	// OpenAI server specific options
+	UDS                           string   `json:"uds,omitempty"`
+	UvicornLogLevel               string   `json:"uvicorn_log_level,omitempty"`
+	ResponseRole                  string   `json:"response_role,omitempty"`
+	SSLKeyfile                    string   `json:"ssl_keyfile,omitempty"`
+	SSLCertfile                   string   `json:"ssl_certfile,omitempty"`
+	SSLCACerts                    string   `json:"ssl_ca_certs,omitempty"`
+	SSLCertReqs                   int      `json:"ssl_cert_reqs,omitempty"`
+	RootPath                      string   `json:"root_path,omitempty"`
+	Middleware                    []string `json:"middleware,omitempty"`
+	ReturnTokensAsTokenIDS        bool     `json:"return_tokens_as_token_ids,omitempty"`
+	DisableFrontendMultiprocessing bool    `json:"disable_frontend_multiprocessing,omitempty"`
+	EnableAutoToolChoice          bool     `json:"enable_auto_tool_choice,omitempty"`
+	ToolCallParser                string   `json:"tool_call_parser,omitempty"`
+	ToolServer                    string   `json:"tool_server,omitempty"`
+	ChatTemplate                  string   `json:"chat_template,omitempty"`
+	ChatTemplateContentFormat     string   `json:"chat_template_content_format,omitempty"`
+	AllowCredentials              bool     `json:"allow_credentials,omitempty"`
+	AllowedOrigins                []string `json:"allowed_origins,omitempty"`
+	AllowedMethods                []string `json:"allowed_methods,omitempty"`
+	AllowedHeaders                []string `json:"allowed_headers,omitempty"`
+	APIKey                        []string `json:"api_key,omitempty"`
+	EnableLogOutputs              bool     `json:"enable_log_outputs,omitempty"`
+	EnableTokenUsage              bool     `json:"enable_token_usage,omitempty"`
+	EnableAsyncEngineDebug        bool     `json:"enable_async_engine_debug,omitempty"`
+	EngineUseRay                  bool     `json:"engine_use_ray,omitempty"`
+	DisableLogRequests            bool     `json:"disable_log_requests,omitempty"`
+	MaxLogLen                     int      `json:"max_log_len,omitempty"`
+
+	// Additional engine configuration
+	Task                         string  `json:"task,omitempty"`
+	MultiModalConfig             string  `json:"multi_modal_config,omitempty"`
+	LimitMmPerPrompt             string  `json:"limit_mm_per_prompt,omitempty"`
+	EnableSleepMode              bool    `json:"enable_sleep_mode,omitempty"`
+	EnableChunkingRequest        bool    `json:"enable_chunking_request,omitempty"`
+	CompilationConfig            string  `json:"compilation_config,omitempty"`
+	DisableSlidingWindowMask     bool    `json:"disable_sliding_window_mask,omitempty"`
+	EnableTRTLLMEngineLatency    bool    `json:"enable_trtllm_engine_latency,omitempty"`
+	OverridePoolingConfig        string  `json:"override_pooling_config,omitempty"`
+	OverrideNeuronConfig         string  `json:"override_neuron_config,omitempty"`
+	OverrideKVCacheALIGNSize     int     `json:"override_kv_cache_align_size,omitempty"`
+}
+
+// NewVllmServerOptions returns a VllmServerOptions populated with default
+// values. Only non-zero defaults are assigned; every other field keeps its
+// zero value (false, 0, "" or nil).
+func NewVllmServerOptions() *VllmServerOptions {
+	opts := new(VllmServerOptions)
+
+	// Connection
+	opts.Host = "127.0.0.1"
+	opts.Port = 8000
+
+	// Engine and performance
+	opts.TokenizerMode = "auto"
+	opts.TensorParallelSize = 1
+	opts.PipelineParallelSize = 1
+	opts.GPUMemoryUtilization = 0.9
+	opts.BlockSize = 16
+	opts.SwapSpace = 4
+	opts.MaxLogprobs = 20
+
+	// OpenAI-compatible server
+	opts.UvicornLogLevel = "info"
+	opts.ResponseRole = "assistant"
+	opts.AllowedOrigins = []string{"*"}
+	opts.AllowedMethods = []string{"*"}
+	opts.AllowedHeaders = []string{"*"}
+
+	return opts
+}
+
+// UnmarshalJSON decodes VllmServerOptions from JSON. It accepts both the
+// canonical snake_case keys of the struct tags ("tensor_parallel_size") and
+// their CLI-style spelling with dashes ("tensor-parallel-size").
+//
+// The dashed spelling is derived from the struct's json tags instead of a
+// hand-maintained alias table, so every tagged field supports it without
+// further bookkeeping.
+//
+// Snake_case keys are decoded by encoding/json and report type errors. Keys
+// containing a dash are applied afterwards on a best-effort basis: a value
+// whose type does not fit the field is ignored, and a dashed key overrides a
+// snake_case key for the same field.
+func (o *VllmServerOptions) UnmarshalJSON(data []byte) error {
+	// Keep a raw key/value view to find the keys the standard decoder ignores.
+	var raw map[string]any
+	if err := json.Unmarshal(data, &raw); err != nil {
+		return err
+	}
+
+	// Decode through a method-less type with the same layout so that this
+	// method is not invoked recursively.
+	type plainOptions VllmServerOptions
+	var plain plainOptions
+	if err := json.Unmarshal(data, &plain); err != nil {
+		return err
+	}
+	*o = VllmServerOptions(plain)
+
+	target := reflect.ValueOf(o).Elem()
+	var fieldByTag map[string]int // json tag name -> field index, built on first use
+
+	for key, value := range raw {
+		if !strings.Contains(key, "-") {
+			continue // canonical keys were handled by the standard decode above
+		}
+
+		if fieldByTag == nil {
+			structType := target.Type()
+			fieldByTag = make(map[string]int, structType.NumField())
+			for i := 0; i < structType.NumField(); i++ {
+				name, _, _ := strings.Cut(structType.Field(i).Tag.Get("json"), ",")
+				if name != "" && name != "-" {
+					fieldByTag[name] = i
+				}
+			}
+		}
+
+		if idx, ok := fieldByTag[strings.ReplaceAll(key, "-", "_")]; ok {
+			assignJSONValue(target.Field(idx), value)
+		}
+	}
+
+	return nil
+}
+
+// assignJSONValue stores value, as produced by decoding JSON into any, in
+// field. Supported field kinds are int, float64, string, bool and []string.
+// Numbers may also be given as numeric strings, and a []string may be given as
+// one comma-separated string. Values that do not fit the field are ignored.
+func assignJSONValue(field reflect.Value, value any) {
+	if !field.IsValid() || !field.CanSet() {
+		return
+	}
+
+	switch field.Kind() {
+	case reflect.Int:
+		switch v := value.(type) {
+		case float64: // encoding/json decodes every JSON number as float64
+			field.SetInt(int64(v))
+		case string:
+			if n, err := strconv.Atoi(v); err == nil {
+				field.SetInt(int64(n))
+			}
+		}
+	case reflect.Float64:
+		switch v := value.(type) {
+		case float64:
+			field.SetFloat(v)
+		case string:
+			if f, err := strconv.ParseFloat(v, 64); err == nil {
+				field.SetFloat(f)
+			}
+		}
+	case reflect.String:
+		if s, ok := value.(string); ok {
+			field.SetString(s)
+		}
+	case reflect.Bool:
+		if b, ok := value.(bool); ok {
+			field.SetBool(b)
+		}
+	case reflect.Slice:
+		if field.Type().Elem().Kind() != reflect.String {
+			return
+		}
+		switch v := value.(type) {
+		case string:
+			// Split comma-separated values
+			parts := strings.Split(v, ",")
+			for i := range parts {
+				parts[i] = strings.TrimSpace(parts[i])
+			}
+			field.Set(reflect.ValueOf(parts))
+		case []any:
+			// Non-string elements are dropped.
+			var items []string
+			for _, item := range v {
+				if s, ok := item.(string); ok {
+					items = append(items, s)
+				}
+			}
+			field.Set(reflect.ValueOf(items))
+		}
+	}
+}
+
+// BuildCommandArgs renders the options as vLLM command line arguments.
+//
+// Fields are visited in declaration order. The flag name is the field's json
+// tag name with "_" replaced by "-". Zero values (false, 0, "", empty slice)
+// produce no argument, and host and port are never emitted because llamactl
+// handles them. The "serve" subcommand is NOT included; that is handled at the
+// instance level.
+func (o *VllmServerOptions) BuildCommandArgs() []string {
+	var args []string
+
+	structVal := reflect.ValueOf(o).Elem()
+	structType := structVal.Type()
+
+	for idx := 0; idx < structVal.NumField(); idx++ {
+		value := structVal.Field(idx)
+
+		// Unexported fields cannot be read through reflection.
+		if !value.CanInterface() {
+			continue
+		}
+
+		// The json tag names the flag; untagged or excluded fields are skipped.
+		tag := structType.Field(idx).Tag.Get("json")
+		if tag == "" || tag == "-" {
+			continue
+		}
+		name, _, _ := strings.Cut(tag, ",") // drop ",omitempty"
+
+		// Skip host and port as they are handled by llamactl
+		if name == "host" || name == "port" {
+			continue
+		}
+
+		// snake_case tag name -> kebab-case CLI flag
+		flag := "--" + strings.ReplaceAll(name, "_", "-")
+
+		switch value.Kind() {
+		case reflect.Bool:
+			if value.Bool() {
+				args = append(args, flag)
+			}
+		case reflect.Int:
+			if n := value.Int(); n != 0 {
+				args = append(args, flag, strconv.FormatInt(n, 10))
+			}
+		case reflect.Float64:
+			if f := value.Float(); f != 0 {
+				args = append(args, flag, strconv.FormatFloat(f, 'f', -1, 64))
+			}
+		case reflect.String:
+			if s := value.String(); s != "" {
+				args = append(args, flag, s)
+			}
+		case reflect.Slice:
+			if value.Type().Elem().Kind() != reflect.String {
+				continue
+			}
+
+			elems := make([]string, 0, value.Len())
+			for j := 0; j < value.Len(); j++ {
+				elems = append(elems, value.Index(j).String())
+			}
+
+			switch flag {
+			case "--api-key", "--allowed-origins", "--allowed-methods", "--allowed-headers", "--middleware":
+				// These flags are repeated once per value.
+				for _, elem := range elems {
+					args = append(args, flag, elem)
+				}
+			default:
+				// Any other string slice is passed as one comma-separated value.
+				if len(elems) > 0 {
+					args = append(args, flag, strings.Join(elems, ","))
+				}
+			}
+		}
+	}
+
+	return args
+}
--- a/pkg/backends/vllm/vllm_test.go
+++ b/pkg/backends/vllm/vllm_test.go
@@ -0,0 +1,106 @@
+package vllm_test
+
+import (
+	"encoding/json"
+	"llamactl/pkg/backends/vllm"
+	"slices"
+	"testing"
+)
+
+// TestBuildCommandArgs checks that BuildCommandArgs emits scalar, boolean and
+// repeated flags, and that it leaves out host and port.
+func TestBuildCommandArgs(t *testing.T) {
+	opts := vllm.VllmServerOptions{
+		Model:                "microsoft/DialoGPT-medium",
+		Host:                 "localhost", // should be excluded
+		Port:                 8080,        // should be excluded
+		TensorParallelSize:   2,
+		GPUMemoryUtilization: 0.8,
+		EnableLogOutputs:     true,
+		APIKey:               []string{"key1", "key2"},
+	}
+
+	got := opts.BuildCommandArgs()
+
+	// Scalar and boolean flags
+	if !containsFlagWithValue(got, "--model", "microsoft/DialoGPT-medium") {
+		t.Errorf("Expected --model microsoft/DialoGPT-medium not found in %v", got)
+	}
+	if !containsFlagWithValue(got, "--tensor-parallel-size", "2") {
+		t.Errorf("Expected --tensor-parallel-size 2 not found in %v", got)
+	}
+	if !contains(got, "--enable-log-outputs") {
+		t.Errorf("Expected --enable-log-outputs not found in %v", got)
+	}
+
+	// Host and port are handled by llamactl and must not appear.
+	if contains(got, "--host") || contains(got, "--port") {
+		t.Errorf("Host and port should not be in command args, found in %v", got)
+	}
+
+	// Each API key gets its own --api-key flag.
+	apiKeyFlags := 0
+	for _, arg := range got {
+		if arg == "--api-key" {
+			apiKeyFlags++
+		}
+	}
+	if apiKeyFlags != 2 {
+		t.Errorf("Expected 2 --api-key flags, got %d", apiKeyFlags)
+	}
+}
+
+// TestUnmarshalJSON verifies that underscore and dash key spellings can be
+// mixed in one JSON document.
+func TestUnmarshalJSON(t *testing.T) {
+	// Test both underscore and dash formats
+	const payload = `{
+		"model": "test-model",
+		"tensor_parallel_size": 4,
+		"gpu-memory-utilization": 0.9,
+		"enable-log-outputs": true
+	}`
+
+	var decoded vllm.VllmServerOptions
+	if err := json.Unmarshal([]byte(payload), &decoded); err != nil {
+		t.Fatalf("Unmarshal failed: %v", err)
+	}
+
+	if got := decoded.Model; got != "test-model" {
+		t.Errorf("Expected model 'test-model', got %q", got)
+	}
+	if got := decoded.TensorParallelSize; got != 4 {
+		t.Errorf("Expected tensor_parallel_size 4, got %d", got)
+	}
+	if got := decoded.GPUMemoryUtilization; got != 0.9 {
+		t.Errorf("Expected gpu_memory_utilization 0.9, got %f", got)
+	}
+	if got := decoded.EnableLogOutputs; !got {
+		t.Errorf("Expected enable_log_outputs true, got %v", got)
+	}
+}
+
+// TestNewVllmServerOptions checks the default host and port.
+func TestNewVllmServerOptions(t *testing.T) {
+	defaults := vllm.NewVllmServerOptions()
+	if defaults == nil {
+		t.Fatal("NewVllmServerOptions returned nil")
+	}
+
+	if got := defaults.Host; got != "127.0.0.1" {
+		t.Errorf("Expected default host '127.0.0.1', got %q", got)
+	}
+	if got := defaults.Port; got != 8000 {
+		t.Errorf("Expected default port 8000, got %d", got)
+	}
+}
+
+// Helper functions
+
+// contains reports whether item is an element of slice.
+func contains(slice []string, item string) bool {
+	return slices.Contains(slice, item)
+}
+
+// containsFlagWithValue reports whether flag appears in args immediately
+// followed by value.
+func containsFlagWithValue(args []string, flag, value string) bool {
+	// Stop one short of the end: the last element has no successor.
+	for i := 0; i+1 < len(args); i++ {
+		if args[i] == flag && args[i+1] == value {
+			return true
+		}
+	}
+	return false
+}