package vllm import ( "encoding/json" "reflect" "strconv" "strings" ) type VllmServerOptions struct { // Basic connection options (auto-assigned by llamactl) Host string `json:"host,omitempty"` Port int `json:"port,omitempty"` // Model and engine configuration Model string `json:"model,omitempty"` Tokenizer string `json:"tokenizer,omitempty"` SkipTokenizerInit bool `json:"skip_tokenizer_init,omitempty"` Revision string `json:"revision,omitempty"` CodeRevision string `json:"code_revision,omitempty"` TokenizerRevision string `json:"tokenizer_revision,omitempty"` TokenizerMode string `json:"tokenizer_mode,omitempty"` TrustRemoteCode bool `json:"trust_remote_code,omitempty"` DownloadDir string `json:"download_dir,omitempty"` LoadFormat string `json:"load_format,omitempty"` ConfigFormat string `json:"config_format,omitempty"` Dtype string `json:"dtype,omitempty"` KVCacheDtype string `json:"kv_cache_dtype,omitempty"` QuantizationParamPath string `json:"quantization_param_path,omitempty"` Seed int `json:"seed,omitempty"` MaxModelLen int `json:"max_model_len,omitempty"` GuidedDecodingBackend string `json:"guided_decoding_backend,omitempty"` DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"` WorkerUseRay bool `json:"worker_use_ray,omitempty"` RayWorkersUseNSight bool `json:"ray_workers_use_nsight,omitempty"` // Performance and serving configuration BlockSize int `json:"block_size,omitempty"` EnablePrefixCaching bool `json:"enable_prefix_caching,omitempty"` DisableSlidingWindow bool `json:"disable_sliding_window,omitempty"` UseV2BlockManager bool `json:"use_v2_block_manager,omitempty"` NumLookaheadSlots int `json:"num_lookahead_slots,omitempty"` SwapSpace int `json:"swap_space,omitempty"` CPUOffloadGB int `json:"cpu_offload_gb,omitempty"` GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"` NumGPUBlocksOverride int `json:"num_gpu_blocks_override,omitempty"` MaxNumBatchedTokens int `json:"max_num_batched_tokens,omitempty"` MaxNumSeqs int `json:"max_num_seqs,omitempty"` MaxLogprobs int `json:"max_logprobs,omitempty"` DisableLogStats bool `json:"disable_log_stats,omitempty"` Quantization string `json:"quantization,omitempty"` RopeScaling string `json:"rope_scaling,omitempty"` RopeTheta float64 `json:"rope_theta,omitempty"` EnforceEager bool `json:"enforce_eager,omitempty"` MaxContextLenToCapture int `json:"max_context_len_to_capture,omitempty"` MaxSeqLenToCapture int `json:"max_seq_len_to_capture,omitempty"` DisableCustomAllReduce bool `json:"disable_custom_all_reduce,omitempty"` TokenizerPoolSize int `json:"tokenizer_pool_size,omitempty"` TokenizerPoolType string `json:"tokenizer_pool_type,omitempty"` TokenizerPoolExtraConfig string `json:"tokenizer_pool_extra_config,omitempty"` EnableLoraBias bool `json:"enable_lora_bias,omitempty"` LoraExtraVocabSize int `json:"lora_extra_vocab_size,omitempty"` LoraRank int `json:"lora_rank,omitempty"` PromptLookbackDistance int `json:"prompt_lookback_distance,omitempty"` PreemptionMode string `json:"preemption_mode,omitempty"` // Distributed and parallel processing TensorParallelSize int `json:"tensor_parallel_size,omitempty"` PipelineParallelSize int `json:"pipeline_parallel_size,omitempty"` MaxParallelLoadingWorkers int `json:"max_parallel_loading_workers,omitempty"` DisableAsyncOutputProc bool `json:"disable_async_output_proc,omitempty"` WorkerClass string `json:"worker_class,omitempty"` EnabledLoraModules string `json:"enabled_lora_modules,omitempty"` MaxLoraRank int `json:"max_lora_rank,omitempty"` FullyShardedLoras bool `json:"fully_sharded_loras,omitempty"` LoraModules string `json:"lora_modules,omitempty"` PromptAdapters string `json:"prompt_adapters,omitempty"` MaxPromptAdapterToken int `json:"max_prompt_adapter_token,omitempty"` Device string `json:"device,omitempty"` SchedulerDelay float64 `json:"scheduler_delay,omitempty"` EnableChunkedPrefill bool `json:"enable_chunked_prefill,omitempty"` SpeculativeModel string `json:"speculative_model,omitempty"` SpeculativeModelQuantization string `json:"speculative_model_quantization,omitempty"` SpeculativeRevision string `json:"speculative_revision,omitempty"` SpeculativeMaxModelLen int `json:"speculative_max_model_len,omitempty"` SpeculativeDisableByBatchSize int `json:"speculative_disable_by_batch_size,omitempty"` NgptSpeculativeLength int `json:"ngpt_speculative_length,omitempty"` SpeculativeDisableMqa bool `json:"speculative_disable_mqa,omitempty"` ModelLoaderExtraConfig string `json:"model_loader_extra_config,omitempty"` IgnorePatterns string `json:"ignore_patterns,omitempty"` PreloadedLoraModules string `json:"preloaded_lora_modules,omitempty"` // OpenAI server specific options UDS string `json:"uds,omitempty"` UvicornLogLevel string `json:"uvicorn_log_level,omitempty"` ResponseRole string `json:"response_role,omitempty"` SSLKeyfile string `json:"ssl_keyfile,omitempty"` SSLCertfile string `json:"ssl_certfile,omitempty"` SSLCACerts string `json:"ssl_ca_certs,omitempty"` SSLCertReqs int `json:"ssl_cert_reqs,omitempty"` RootPath string `json:"root_path,omitempty"` Middleware []string `json:"middleware,omitempty"` ReturnTokensAsTokenIDS bool `json:"return_tokens_as_token_ids,omitempty"` DisableFrontendMultiprocessing bool `json:"disable_frontend_multiprocessing,omitempty"` EnableAutoToolChoice bool `json:"enable_auto_tool_choice,omitempty"` ToolCallParser string `json:"tool_call_parser,omitempty"` ToolServer string `json:"tool_server,omitempty"` ChatTemplate string `json:"chat_template,omitempty"` ChatTemplateContentFormat string `json:"chat_template_content_format,omitempty"` AllowCredentials bool `json:"allow_credentials,omitempty"` AllowedOrigins []string `json:"allowed_origins,omitempty"` AllowedMethods []string `json:"allowed_methods,omitempty"` AllowedHeaders []string `json:"allowed_headers,omitempty"` APIKey []string `json:"api_key,omitempty"` EnableLogOutputs bool `json:"enable_log_outputs,omitempty"` EnableTokenUsage bool `json:"enable_token_usage,omitempty"` EnableAsyncEngineDebug bool `json:"enable_async_engine_debug,omitempty"` EngineUseRay bool `json:"engine_use_ray,omitempty"` DisableLogRequests bool `json:"disable_log_requests,omitempty"` MaxLogLen int `json:"max_log_len,omitempty"` // Additional engine configuration Task string `json:"task,omitempty"` MultiModalConfig string `json:"multi_modal_config,omitempty"` LimitMmPerPrompt string `json:"limit_mm_per_prompt,omitempty"` EnableSleepMode bool `json:"enable_sleep_mode,omitempty"` EnableChunkingRequest bool `json:"enable_chunking_request,omitempty"` CompilationConfig string `json:"compilation_config,omitempty"` DisableSlidingWindowMask bool `json:"disable_sliding_window_mask,omitempty"` EnableTRTLLMEngineLatency bool `json:"enable_trtllm_engine_latency,omitempty"` OverridePoolingConfig string `json:"override_pooling_config,omitempty"` OverrideNeuronConfig string `json:"override_neuron_config,omitempty"` OverrideKVCacheALIGNSize int `json:"override_kv_cache_align_size,omitempty"` } // NewVllmServerOptions creates a new VllmServerOptions with defaults func NewVllmServerOptions() *VllmServerOptions { return &VllmServerOptions{ Host: "127.0.0.1", Port: 8000, TensorParallelSize: 1, PipelineParallelSize: 1, GPUMemoryUtilization: 0.9, BlockSize: 16, SwapSpace: 4, UvicornLogLevel: "info", ResponseRole: "assistant", TokenizerMode: "auto", TrustRemoteCode: false, EnablePrefixCaching: false, EnforceEager: false, DisableLogStats: false, DisableLogRequests: false, MaxLogprobs: 20, EnableLogOutputs: false, EnableTokenUsage: false, AllowCredentials: false, AllowedOrigins: []string{"*"}, AllowedMethods: []string{"*"}, AllowedHeaders: []string{"*"}, } } // UnmarshalJSON implements custom JSON unmarshaling to support multiple field names func (o *VllmServerOptions) UnmarshalJSON(data []byte) error { // First unmarshal into a map to handle multiple field names var raw map[string]any if err := json.Unmarshal(data, &raw); err != nil { return err } // Create a temporary struct for standard unmarshaling type tempOptions VllmServerOptions temp := tempOptions{} // Standard unmarshal first if err := json.Unmarshal(data, &temp); err != nil { return err } // Copy to our struct *o = VllmServerOptions(temp) // Handle alternative field names (CLI format with dashes) fieldMappings := map[string]string{ // Basic options "tensor-parallel-size": "tensor_parallel_size", "pipeline-parallel-size": "pipeline_parallel_size", "max-parallel-loading-workers": "max_parallel_loading_workers", "disable-async-output-proc": "disable_async_output_proc", "worker-class": "worker_class", "enabled-lora-modules": "enabled_lora_modules", "max-lora-rank": "max_lora_rank", "fully-sharded-loras": "fully_sharded_loras", "lora-modules": "lora_modules", "prompt-adapters": "prompt_adapters", "max-prompt-adapter-token": "max_prompt_adapter_token", "scheduler-delay": "scheduler_delay", "enable-chunked-prefill": "enable_chunked_prefill", "speculative-model": "speculative_model", "speculative-model-quantization": "speculative_model_quantization", "speculative-revision": "speculative_revision", "speculative-max-model-len": "speculative_max_model_len", "speculative-disable-by-batch-size": "speculative_disable_by_batch_size", "ngpt-speculative-length": "ngpt_speculative_length", "speculative-disable-mqa": "speculative_disable_mqa", "model-loader-extra-config": "model_loader_extra_config", "ignore-patterns": "ignore_patterns", "preloaded-lora-modules": "preloaded_lora_modules", // Model configuration "skip-tokenizer-init": "skip_tokenizer_init", "code-revision": "code_revision", "tokenizer-revision": "tokenizer_revision", "tokenizer-mode": "tokenizer_mode", "trust-remote-code": "trust_remote_code", "download-dir": "download_dir", "load-format": "load_format", "config-format": "config_format", "kv-cache-dtype": "kv_cache_dtype", "quantization-param-path": "quantization_param_path", "max-model-len": "max_model_len", "guided-decoding-backend": "guided_decoding_backend", "distributed-executor-backend": "distributed_executor_backend", "worker-use-ray": "worker_use_ray", "ray-workers-use-nsight": "ray_workers_use_nsight", // Performance configuration "block-size": "block_size", "enable-prefix-caching": "enable_prefix_caching", "disable-sliding-window": "disable_sliding_window", "use-v2-block-manager": "use_v2_block_manager", "num-lookahead-slots": "num_lookahead_slots", "swap-space": "swap_space", "cpu-offload-gb": "cpu_offload_gb", "gpu-memory-utilization": "gpu_memory_utilization", "num-gpu-blocks-override": "num_gpu_blocks_override", "max-num-batched-tokens": "max_num_batched_tokens", "max-num-seqs": "max_num_seqs", "max-logprobs": "max_logprobs", "disable-log-stats": "disable_log_stats", "rope-scaling": "rope_scaling", "rope-theta": "rope_theta", "enforce-eager": "enforce_eager", "max-context-len-to-capture": "max_context_len_to_capture", "max-seq-len-to-capture": "max_seq_len_to_capture", "disable-custom-all-reduce": "disable_custom_all_reduce", "tokenizer-pool-size": "tokenizer_pool_size", "tokenizer-pool-type": "tokenizer_pool_type", "tokenizer-pool-extra-config": "tokenizer_pool_extra_config", "enable-lora-bias": "enable_lora_bias", "lora-extra-vocab-size": "lora_extra_vocab_size", "lora-rank": "lora_rank", "prompt-lookback-distance": "prompt_lookback_distance", "preemption-mode": "preemption_mode", // Server configuration "uvicorn-log-level": "uvicorn_log_level", "response-role": "response_role", "ssl-keyfile": "ssl_keyfile", "ssl-certfile": "ssl_certfile", "ssl-ca-certs": "ssl_ca_certs", "ssl-cert-reqs": "ssl_cert_reqs", "root-path": "root_path", "return-tokens-as-token-ids": "return_tokens_as_token_ids", "disable-frontend-multiprocessing": "disable_frontend_multiprocessing", "enable-auto-tool-choice": "enable_auto_tool_choice", "tool-call-parser": "tool_call_parser", "tool-server": "tool_server", "chat-template": "chat_template", "chat-template-content-format": "chat_template_content_format", "allow-credentials": "allow_credentials", "allowed-origins": "allowed_origins", "allowed-methods": "allowed_methods", "allowed-headers": "allowed_headers", "api-key": "api_key", "enable-log-outputs": "enable_log_outputs", "enable-token-usage": "enable_token_usage", "enable-async-engine-debug": "enable_async_engine_debug", "engine-use-ray": "engine_use_ray", "disable-log-requests": "disable_log_requests", "max-log-len": "max_log_len", // Additional options "multi-modal-config": "multi_modal_config", "limit-mm-per-prompt": "limit_mm_per_prompt", "enable-sleep-mode": "enable_sleep_mode", "enable-chunking-request": "enable_chunking_request", "compilation-config": "compilation_config", "disable-sliding-window-mask": "disable_sliding_window_mask", "enable-trtllm-engine-latency": "enable_trtllm_engine_latency", "override-pooling-config": "override_pooling_config", "override-neuron-config": "override_neuron_config", "override-kv-cache-align-size": "override_kv_cache_align_size", } // Process alternative field names for altName, canonicalName := range fieldMappings { if value, exists := raw[altName]; exists { // Use reflection to set the field value v := reflect.ValueOf(o).Elem() field := v.FieldByNameFunc(func(fieldName string) bool { field, _ := v.Type().FieldByName(fieldName) jsonTag := field.Tag.Get("json") return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName }) if field.IsValid() && field.CanSet() { switch field.Kind() { case reflect.Int: if intVal, ok := value.(float64); ok { field.SetInt(int64(intVal)) } else if strVal, ok := value.(string); ok { if intVal, err := strconv.Atoi(strVal); err == nil { field.SetInt(int64(intVal)) } } case reflect.Float64: if floatVal, ok := value.(float64); ok { field.SetFloat(floatVal) } else if strVal, ok := value.(string); ok { if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil { field.SetFloat(floatVal) } } case reflect.String: if strVal, ok := value.(string); ok { field.SetString(strVal) } case reflect.Bool: if boolVal, ok := value.(bool); ok { field.SetBool(boolVal) } case reflect.Slice: if field.Type().Elem().Kind() == reflect.String { if strVal, ok := value.(string); ok { // Split comma-separated values values := strings.Split(strVal, ",") for i, v := range values { values[i] = strings.TrimSpace(v) } field.Set(reflect.ValueOf(values)) } else if slice, ok := value.([]interface{}); ok { var strSlice []string for _, item := range slice { if str, ok := item.(string); ok { strSlice = append(strSlice, str) } } field.Set(reflect.ValueOf(strSlice)) } } } } } } return nil } // BuildCommandArgs converts VllmServerOptions to command line arguments // Note: This does NOT include the "serve" subcommand, that's handled at the instance level func (o *VllmServerOptions) BuildCommandArgs() []string { var args []string v := reflect.ValueOf(o).Elem() t := v.Type() for i := 0; i < v.NumField(); i++ { field := v.Field(i) fieldType := t.Field(i) // Skip unexported fields if !field.CanInterface() { continue } // Get the JSON tag to determine the flag name jsonTag := fieldType.Tag.Get("json") if jsonTag == "" || jsonTag == "-" { continue } // Remove ",omitempty" from the tag flagName := jsonTag if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 { flagName = jsonTag[:commaIndex] } // Skip host and port as they are handled by llamactl if flagName == "host" || flagName == "port" { continue } // Convert snake_case to kebab-case for CLI flags flagName = strings.ReplaceAll(flagName, "_", "-") // Add the appropriate arguments based on field type and value switch field.Kind() { case reflect.Bool: if field.Bool() { args = append(args, "--"+flagName) } case reflect.Int: if field.Int() != 0 { args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10)) } case reflect.Float64: if field.Float() != 0 { args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64)) } case reflect.String: if field.String() != "" { args = append(args, "--"+flagName, field.String()) } case reflect.Slice: if field.Type().Elem().Kind() == reflect.String { // Handle []string fields - some are comma-separated, some use multiple flags if flagName == "api-key" || flagName == "allowed-origins" || flagName == "allowed-methods" || flagName == "allowed-headers" || flagName == "middleware" { // Multiple flags for these for j := 0; j < field.Len(); j++ { args = append(args, "--"+flagName, field.Index(j).String()) } } else { // Comma-separated for others if field.Len() > 0 { var values []string for j := 0; j < field.Len(); j++ { values = append(values, field.Index(j).String()) } args = append(args, "--"+flagName, strings.Join(values, ",")) } } } } } return args }