Initial vLLM backend support

commit 4df02a6519
parent 02fdae24ee
2025-09-19 18:05:12 +02:00

12 changed files with 1495 additions and 2 deletions

pkg/backends/vllm/parser.go (new file, 302 lines)

@@ -0,0 +1,302 @@
package vllm

import (
	"encoding/json"
	"errors"
	"fmt"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
)

// ParseVllmCommand parses a vLLM serve command string into VllmServerOptions.
// It supports multiple formats:
//  1. Full command: "vllm serve --model MODEL_NAME --other-args"
//  2. Full path: "/usr/local/bin/vllm serve --model MODEL_NAME"
//  3. Serve only: "serve --model MODEL_NAME --other-args"
//  4. Args only: "--model MODEL_NAME --other-args"
//  5. Multiline commands with backslash continuations
func ParseVllmCommand(command string) (*VllmServerOptions, error) {
	// 1. Normalize the command - join multiline commands with backslashes
	trimmed := normalizeMultilineCommand(command)
	if trimmed == "" {
		return nil, fmt.Errorf("command cannot be empty")
	}

	// 2. Extract arguments from the command
	args, err := extractArgumentsFromCommand(trimmed)
	if err != nil {
		return nil, err
	}

	// 3. Parse arguments into a map
	options := make(map[string]any)

	// Known multi-valued flags (snake_case form)
	multiValued := map[string]struct{}{
		"middleware":      {},
		"api_key":         {},
		"allowed_origins": {},
		"allowed_methods": {},
		"allowed_headers": {},
		"lora_modules":    {},
		"prompt_adapters": {},
	}

	i := 0
	for i < len(args) {
		arg := args[i]
		if !strings.HasPrefix(arg, "-") { // skip positional / stray values
			i++
			continue
		}

		// Reject malformed flags with more than two leading dashes
		// (e.g. ---model) to surface user mistakes
		if strings.HasPrefix(arg, "---") {
			return nil, fmt.Errorf("malformed flag: %s", arg)
		}

		// Unified parsing for --flag=value vs --flag value
		var rawFlag, rawValue string
		hasEquals := false
		if strings.Contains(arg, "=") {
			parts := strings.SplitN(arg, "=", 2)
			rawFlag = parts[0]
			rawValue = parts[1] // may be an empty string
			hasEquals = true
		} else {
			rawFlag = arg
		}

		flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-")
		flagName := strings.ReplaceAll(flagCore, "-", "_")

		// Detect a value if not in equals form
		valueProvided := hasEquals
		if !hasEquals {
			if i+1 < len(args) && !isFlag(args[i+1]) { // next token is the value
				rawValue = args[i+1]
				valueProvided = true
			}
		}

		// Determine whether this is a multi-valued flag
		_, isMulti := multiValued[flagName]

		// Normalization helper: ensure slice semantics for multi-valued
		// flags and for repeated occurrences of a single-valued flag
		appendValue := func(valStr string) {
			if existing, ok := options[flagName]; ok {
				if slice, ok := existing.([]string); ok {
					options[flagName] = append(slice, valStr)
					return
				}
				// Convert a scalar to a slice
				options[flagName] = []string{fmt.Sprintf("%v", existing), valStr}
				return
			}
			// First value
			if isMulti {
				options[flagName] = []string{valStr}
			} else {
				// Type parsing for single-valued flags happens below
				options[flagName] = valStr
			}
		}

		if valueProvided {
			// Use the raw token for multi-valued flags; otherwise allow typed parsing
			appendValue(rawValue)
			if !isMulti { // convert to a typed value if still scalar
				if strVal, ok := options[flagName].(string); ok {
					options[flagName] = parseValue(strVal)
				}
			}
			// Advance the index: if we consumed the following token as a
			// value (non-equals form), skip it as well
			if !hasEquals && i+1 < len(args) && rawValue == args[i+1] {
				i += 2
			} else {
				i++
			}
			continue
		}

		// Boolean flag (no value)
		options[flagName] = true
		i++
	}

	// 4. Convert to VllmServerOptions via the custom UnmarshalJSON
	jsonData, err := json.Marshal(options)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal parsed options: %w", err)
	}

	var vllmOptions VllmServerOptions
	if err := json.Unmarshal(jsonData, &vllmOptions); err != nil {
		return nil, fmt.Errorf("failed to parse command options: %w", err)
	}

	// 5. Return the parsed options
	return &vllmOptions, nil
}
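
// Illustrative usage of ParseVllmCommand (an author-added sketch, not part of
// this commit; the model name is hypothetical):
//
//	opts, err := ParseVllmCommand(`vllm serve --model my-org/my-model \
//	    --tensor-parallel-size 2 --gpu-memory-utilization 0.8`)
//	if err != nil {
//		return err
//	}
//	// opts.Model == "my-org/my-model", opts.TensorParallelSize == 2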
// parseValue attempts to parse a string value into the most appropriate type
func parseValue(value string) any {
	// Strip surrounding matching quotes (single or double)
	if l := len(value); l >= 2 {
		if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') {
			value = value[1 : l-1]
		}
	}

	lower := strings.ToLower(value)
	if lower == "true" {
		return true
	}
	if lower == "false" {
		return false
	}
	if intVal, err := strconv.Atoi(value); err == nil {
		return intVal
	}
	if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
		return floatVal
	}
	return value
}
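
// A few conversions parseValue yields (author-added illustration):
//
//	parseValue(`"0.8"`) // float64(0.8), after quote stripping
//	parseValue("TRUE")  // bool(true), case-insensitive
//	parseValue("42")    // int(42)
//	parseValue("auto")  // string("auto")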
// normalizeMultilineCommand handles multiline commands with backslash continuations
func normalizeMultilineCommand(command string) string {
	// Join escaped newlines (a backslash followed by a newline)
	re := regexp.MustCompile(`\\\s*\n\s*`)
	normalized := re.ReplaceAllString(command, " ")

	// Collapse runs of whitespace
	re = regexp.MustCompile(`\s+`)
	normalized = re.ReplaceAllString(normalized, " ")

	return strings.TrimSpace(normalized)
}
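
// Author-added illustration: a backslash continuation collapses to a single
// space before parsing.
//
//	normalizeMultilineCommand("vllm serve \\\n    --model m") // "vllm serve --model m"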
// extractArgumentsFromCommand extracts the argument list from the supported command formats
func extractArgumentsFromCommand(command string) ([]string, error) {
	// Split the command into tokens, respecting quotes
	tokens, err := splitCommandTokens(command)
	if err != nil {
		return nil, err
	}
	if len(tokens) == 0 {
		return nil, fmt.Errorf("no command tokens found")
	}

	// Check whether the first token looks like an executable
	firstToken := tokens[0]

	// Case 1: full path to the executable (contains a path separator or ends with "vllm")
	if strings.Contains(firstToken, string(filepath.Separator)) ||
		strings.HasSuffix(filepath.Base(firstToken), "vllm") {
		if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
			return tokens[2:], nil // drop the executable and "serve"
		}
		return tokens[1:], nil // drop the executable
	}

	// Case 2: bare "vllm" command
	if strings.ToLower(firstToken) == "vllm" {
		if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
			return tokens[2:], nil // drop "vllm" and "serve"
		}
		return tokens[1:], nil // drop "vllm"
	}

	// Case 3: bare "serve" command
	if strings.ToLower(firstToken) == "serve" {
		return tokens[1:], nil // drop "serve"
	}

	// Case 4: arguments only (starts with a flag)
	if strings.HasPrefix(firstToken, "-") {
		return tokens, nil // all tokens are arguments
	}

	// Case 5: unknown format - be permissive and assume the first token is a
	// differently named executable
	if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
		return tokens[2:], nil // drop the executable and "serve"
	}
	return tokens[1:], nil
}
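
// Author-added illustration of the cases above (inputs hypothetical):
//
//	"/usr/local/bin/vllm serve --model m" -> ["--model", "m"] (case 1)
//	"serve --model m"                     -> ["--model", "m"] (case 3)
//	"--model m"                           -> ["--model", "m"] (case 4)
//	"vllm-wrapper serve --model m"        -> ["--model", "m"] (case 5)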
// splitCommandTokens splits a command string into tokens, respecting quotes
func splitCommandTokens(command string) ([]string, error) {
	var tokens []string
	var current strings.Builder
	inQuotes := false
	quoteChar := byte(0)
	escaped := false

	for i := 0; i < len(command); i++ {
		c := command[i]

		if escaped {
			current.WriteByte(c)
			escaped = false
			continue
		}
		if c == '\\' {
			escaped = true
			current.WriteByte(c)
			continue
		}

		if !inQuotes && (c == '"' || c == '\'') {
			inQuotes = true
			quoteChar = c
			current.WriteByte(c)
		} else if inQuotes && c == quoteChar {
			inQuotes = false
			quoteChar = 0
			current.WriteByte(c)
		} else if !inQuotes && (c == ' ' || c == '\t') {
			if current.Len() > 0 {
				tokens = append(tokens, current.String())
				current.Reset()
			}
		} else {
			current.WriteByte(c)
		}
	}

	if inQuotes {
		return nil, errors.New("unterminated quoted string")
	}
	if current.Len() > 0 {
		tokens = append(tokens, current.String())
	}
	return tokens, nil
}
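
// Author-added note: quote characters are kept in the emitted tokens; they are
// stripped later by parseValue for single-valued flags.
//
//	splitCommandTokens(`--chat-template "a b"`) // ["--chat-template", `"a b"`]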
// isFlag reports whether a string is a command-line flag rather than a value.
// Negative numbers are the special case: they start with "-" but are values.
func isFlag(arg string) bool {
	if !strings.HasPrefix(arg, "-") {
		return false
	}
	// A parseable number such as "-1" or "-0.5" is a value, not a flag
	if _, err := strconv.ParseFloat(arg, 64); err == nil {
		return false
	}
	return true
}
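
// Author-added illustration:
//
//	isFlag("--model") // true
//	isFlag("-0.5")    // false (negative number, treated as a value)
//	isFlag("value")   // false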


@@ -0,0 +1,83 @@
package vllm

import (
	"testing"
)

func TestParseVllmCommand(t *testing.T) {
	tests := []struct {
		name      string
		command   string
		expectErr bool
	}{
		{
			name:      "basic vllm serve command",
			command:   "vllm serve --model microsoft/DialoGPT-medium",
			expectErr: false,
		},
		{
			name:      "serve only command",
			command:   "serve --model microsoft/DialoGPT-medium",
			expectErr: false,
		},
		{
			name:      "args only",
			command:   "--model microsoft/DialoGPT-medium --tensor-parallel-size 2",
			expectErr: false,
		},
		{
			name:      "empty command",
			command:   "",
			expectErr: true,
		},
		{
			name:      "unterminated quote",
			command:   `vllm serve --model "unterminated`,
			expectErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := ParseVllmCommand(tt.command)
			if tt.expectErr {
				if err == nil {
					t.Errorf("expected error but got none")
				}
				return
			}
			if err != nil {
				t.Errorf("unexpected error: %v", err)
				return
			}
			if result == nil {
				t.Errorf("expected result but got nil")
			}
		})
	}
}

func TestParseVllmCommandValues(t *testing.T) {
	command := "vllm serve --model test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs"

	result, err := ParseVllmCommand(command)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if result.Model != "test-model" {
		t.Errorf("expected model 'test-model', got '%s'", result.Model)
	}
	if result.TensorParallelSize != 4 {
		t.Errorf("expected tensor_parallel_size 4, got %d", result.TensorParallelSize)
	}
	if result.GPUMemoryUtilization != 0.8 {
		t.Errorf("expected gpu_memory_utilization 0.8, got %f", result.GPUMemoryUtilization)
	}
	if !result.EnableLogOutputs {
		t.Errorf("expected enable_log_outputs true, got %v", result.EnableLogOutputs)
	}
}
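
// Author-added sketch (not part of the original commit): a godoc example of
// the happy path. Runnable if "fmt" is added to the imports above.
func ExampleParseVllmCommand() {
	opts, _ := ParseVllmCommand("vllm serve --model test-model --tensor-parallel-size 2")
	fmt.Println(opts.Model, opts.TensorParallelSize)
	// Output: test-model 2
}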

pkg/backends/vllm/vllm.go (new file, 439 lines)

@@ -0,0 +1,439 @@
package vllm

import (
	"encoding/json"
	"reflect"
	"strconv"
	"strings"
)

// VllmServerOptions holds the configuration for a vLLM server instance
type VllmServerOptions struct {
	// Basic connection options (auto-assigned by llamactl)
	Host string `json:"host,omitempty"`
	Port int `json:"port,omitempty"`

	// Model and engine configuration
	Model string `json:"model,omitempty"`
	Tokenizer string `json:"tokenizer,omitempty"`
	SkipTokenizerInit bool `json:"skip_tokenizer_init,omitempty"`
	Revision string `json:"revision,omitempty"`
	CodeRevision string `json:"code_revision,omitempty"`
	TokenizerRevision string `json:"tokenizer_revision,omitempty"`
	TokenizerMode string `json:"tokenizer_mode,omitempty"`
	TrustRemoteCode bool `json:"trust_remote_code,omitempty"`
	DownloadDir string `json:"download_dir,omitempty"`
	LoadFormat string `json:"load_format,omitempty"`
	ConfigFormat string `json:"config_format,omitempty"`
	Dtype string `json:"dtype,omitempty"`
	KVCacheDtype string `json:"kv_cache_dtype,omitempty"`
	QuantizationParamPath string `json:"quantization_param_path,omitempty"`
	Seed int `json:"seed,omitempty"`
	MaxModelLen int `json:"max_model_len,omitempty"`
	GuidedDecodingBackend string `json:"guided_decoding_backend,omitempty"`
	DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"`
	WorkerUseRay bool `json:"worker_use_ray,omitempty"`
	RayWorkersUseNSight bool `json:"ray_workers_use_nsight,omitempty"`

	// Performance and serving configuration
	BlockSize int `json:"block_size,omitempty"`
	EnablePrefixCaching bool `json:"enable_prefix_caching,omitempty"`
	DisableSlidingWindow bool `json:"disable_sliding_window,omitempty"`
	UseV2BlockManager bool `json:"use_v2_block_manager,omitempty"`
	NumLookaheadSlots int `json:"num_lookahead_slots,omitempty"`
	SwapSpace int `json:"swap_space,omitempty"`
	CPUOffloadGB int `json:"cpu_offload_gb,omitempty"`
	GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"`
	NumGPUBlocksOverride int `json:"num_gpu_blocks_override,omitempty"`
	MaxNumBatchedTokens int `json:"max_num_batched_tokens,omitempty"`
	MaxNumSeqs int `json:"max_num_seqs,omitempty"`
	MaxLogprobs int `json:"max_logprobs,omitempty"`
	DisableLogStats bool `json:"disable_log_stats,omitempty"`
	Quantization string `json:"quantization,omitempty"`
	RopeScaling string `json:"rope_scaling,omitempty"`
	RopeTheta float64 `json:"rope_theta,omitempty"`
	EnforceEager bool `json:"enforce_eager,omitempty"`
	MaxContextLenToCapture int `json:"max_context_len_to_capture,omitempty"`
	MaxSeqLenToCapture int `json:"max_seq_len_to_capture,omitempty"`
	DisableCustomAllReduce bool `json:"disable_custom_all_reduce,omitempty"`
	TokenizerPoolSize int `json:"tokenizer_pool_size,omitempty"`
	TokenizerPoolType string `json:"tokenizer_pool_type,omitempty"`
	TokenizerPoolExtraConfig string `json:"tokenizer_pool_extra_config,omitempty"`
	EnableLoraBias bool `json:"enable_lora_bias,omitempty"`
	LoraExtraVocabSize int `json:"lora_extra_vocab_size,omitempty"`
	LoraRank int `json:"lora_rank,omitempty"`
	PromptLookbackDistance int `json:"prompt_lookback_distance,omitempty"`
	PreemptionMode string `json:"preemption_mode,omitempty"`

	// Distributed and parallel processing
	TensorParallelSize int `json:"tensor_parallel_size,omitempty"`
	PipelineParallelSize int `json:"pipeline_parallel_size,omitempty"`
	MaxParallelLoadingWorkers int `json:"max_parallel_loading_workers,omitempty"`
	DisableAsyncOutputProc bool `json:"disable_async_output_proc,omitempty"`
	WorkerClass string `json:"worker_class,omitempty"`
	EnabledLoraModules string `json:"enabled_lora_modules,omitempty"`
	MaxLoraRank int `json:"max_lora_rank,omitempty"`
	FullyShardedLoras bool `json:"fully_sharded_loras,omitempty"`
	// LoraModules and PromptAdapters are slices because the parser treats
	// them as multi-valued flags (see the multiValued set in parser.go)
	LoraModules []string `json:"lora_modules,omitempty"`
	PromptAdapters []string `json:"prompt_adapters,omitempty"`
	MaxPromptAdapterToken int `json:"max_prompt_adapter_token,omitempty"`
	Device string `json:"device,omitempty"`
	SchedulerDelay float64 `json:"scheduler_delay,omitempty"`
	EnableChunkedPrefill bool `json:"enable_chunked_prefill,omitempty"`
	SpeculativeModel string `json:"speculative_model,omitempty"`
	SpeculativeModelQuantization string `json:"speculative_model_quantization,omitempty"`
	SpeculativeRevision string `json:"speculative_revision,omitempty"`
	SpeculativeMaxModelLen int `json:"speculative_max_model_len,omitempty"`
	SpeculativeDisableByBatchSize int `json:"speculative_disable_by_batch_size,omitempty"`
	NgptSpeculativeLength int `json:"ngpt_speculative_length,omitempty"`
	SpeculativeDisableMqa bool `json:"speculative_disable_mqa,omitempty"`
	ModelLoaderExtraConfig string `json:"model_loader_extra_config,omitempty"`
	IgnorePatterns string `json:"ignore_patterns,omitempty"`
	PreloadedLoraModules string `json:"preloaded_lora_modules,omitempty"`

	// OpenAI server specific options
	UDS string `json:"uds,omitempty"`
	UvicornLogLevel string `json:"uvicorn_log_level,omitempty"`
	ResponseRole string `json:"response_role,omitempty"`
	SSLKeyfile string `json:"ssl_keyfile,omitempty"`
	SSLCertfile string `json:"ssl_certfile,omitempty"`
	SSLCACerts string `json:"ssl_ca_certs,omitempty"`
	SSLCertReqs int `json:"ssl_cert_reqs,omitempty"`
	RootPath string `json:"root_path,omitempty"`
	Middleware []string `json:"middleware,omitempty"`
	ReturnTokensAsTokenIDS bool `json:"return_tokens_as_token_ids,omitempty"`
	DisableFrontendMultiprocessing bool `json:"disable_frontend_multiprocessing,omitempty"`
	EnableAutoToolChoice bool `json:"enable_auto_tool_choice,omitempty"`
	ToolCallParser string `json:"tool_call_parser,omitempty"`
	ToolServer string `json:"tool_server,omitempty"`
	ChatTemplate string `json:"chat_template,omitempty"`
	ChatTemplateContentFormat string `json:"chat_template_content_format,omitempty"`
	AllowCredentials bool `json:"allow_credentials,omitempty"`
	AllowedOrigins []string `json:"allowed_origins,omitempty"`
	AllowedMethods []string `json:"allowed_methods,omitempty"`
	AllowedHeaders []string `json:"allowed_headers,omitempty"`
	APIKey []string `json:"api_key,omitempty"`
	EnableLogOutputs bool `json:"enable_log_outputs,omitempty"`
	EnableTokenUsage bool `json:"enable_token_usage,omitempty"`
	EnableAsyncEngineDebug bool `json:"enable_async_engine_debug,omitempty"`
	EngineUseRay bool `json:"engine_use_ray,omitempty"`
	DisableLogRequests bool `json:"disable_log_requests,omitempty"`
	MaxLogLen int `json:"max_log_len,omitempty"`

	// Additional engine configuration
	Task string `json:"task,omitempty"`
	MultiModalConfig string `json:"multi_modal_config,omitempty"`
	LimitMmPerPrompt string `json:"limit_mm_per_prompt,omitempty"`
	EnableSleepMode bool `json:"enable_sleep_mode,omitempty"`
	EnableChunkingRequest bool `json:"enable_chunking_request,omitempty"`
	CompilationConfig string `json:"compilation_config,omitempty"`
	DisableSlidingWindowMask bool `json:"disable_sliding_window_mask,omitempty"`
	EnableTRTLLMEngineLatency bool `json:"enable_trtllm_engine_latency,omitempty"`
	OverridePoolingConfig string `json:"override_pooling_config,omitempty"`
	OverrideNeuronConfig string `json:"override_neuron_config,omitempty"`
	OverrideKVCacheALIGNSize int `json:"override_kv_cache_align_size,omitempty"`
}
// NewVllmServerOptions creates a new VllmServerOptions with defaults
func NewVllmServerOptions() *VllmServerOptions {
	return &VllmServerOptions{
		Host: "127.0.0.1",
		Port: 8000,
		TensorParallelSize: 1,
		PipelineParallelSize: 1,
		GPUMemoryUtilization: 0.9,
		BlockSize: 16,
		SwapSpace: 4,
		UvicornLogLevel: "info",
		ResponseRole: "assistant",
		TokenizerMode: "auto",
		TrustRemoteCode: false,
		EnablePrefixCaching: false,
		EnforceEager: false,
		DisableLogStats: false,
		DisableLogRequests: false,
		MaxLogprobs: 20,
		EnableLogOutputs: false,
		EnableTokenUsage: false,
		AllowCredentials: false,
		AllowedOrigins: []string{"*"},
		AllowedMethods: []string{"*"},
		AllowedHeaders: []string{"*"},
	}
}
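
// Author-added sketch: typical construction, overriding defaults before
// building the CLI arguments (model name hypothetical).
//
//	opts := NewVllmServerOptions()
//	opts.Model = "my-org/my-model"
//	opts.GPUMemoryUtilization = 0.8
//	args := opts.BuildCommandArgs()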
// UnmarshalJSON implements custom JSON unmarshaling so that both snake_case
// and kebab-case (CLI-style) field names are accepted
func (o *VllmServerOptions) UnmarshalJSON(data []byte) error {
	// First unmarshal into a map so alternative field names can be inspected
	var raw map[string]any
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// Standard unmarshal into a temporary alias type (avoids recursing into
	// this method), then copy the result into o
	type tempOptions VllmServerOptions
	temp := tempOptions{}
	if err := json.Unmarshal(data, &temp); err != nil {
		return err
	}
	*o = VllmServerOptions(temp)

	// Alternative field names (CLI format with dashes) mapped to canonical names
	fieldMappings := map[string]string{
		// Basic options
		"tensor-parallel-size": "tensor_parallel_size",
		"pipeline-parallel-size": "pipeline_parallel_size",
		"max-parallel-loading-workers": "max_parallel_loading_workers",
		"disable-async-output-proc": "disable_async_output_proc",
		"worker-class": "worker_class",
		"enabled-lora-modules": "enabled_lora_modules",
		"max-lora-rank": "max_lora_rank",
		"fully-sharded-loras": "fully_sharded_loras",
		"lora-modules": "lora_modules",
		"prompt-adapters": "prompt_adapters",
		"max-prompt-adapter-token": "max_prompt_adapter_token",
		"scheduler-delay": "scheduler_delay",
		"enable-chunked-prefill": "enable_chunked_prefill",
		"speculative-model": "speculative_model",
		"speculative-model-quantization": "speculative_model_quantization",
		"speculative-revision": "speculative_revision",
		"speculative-max-model-len": "speculative_max_model_len",
		"speculative-disable-by-batch-size": "speculative_disable_by_batch_size",
		"ngpt-speculative-length": "ngpt_speculative_length",
		"speculative-disable-mqa": "speculative_disable_mqa",
		"model-loader-extra-config": "model_loader_extra_config",
		"ignore-patterns": "ignore_patterns",
		"preloaded-lora-modules": "preloaded_lora_modules",
		// Model configuration
		"skip-tokenizer-init": "skip_tokenizer_init",
		"code-revision": "code_revision",
		"tokenizer-revision": "tokenizer_revision",
		"tokenizer-mode": "tokenizer_mode",
		"trust-remote-code": "trust_remote_code",
		"download-dir": "download_dir",
		"load-format": "load_format",
		"config-format": "config_format",
		"kv-cache-dtype": "kv_cache_dtype",
		"quantization-param-path": "quantization_param_path",
		"max-model-len": "max_model_len",
		"guided-decoding-backend": "guided_decoding_backend",
		"distributed-executor-backend": "distributed_executor_backend",
		"worker-use-ray": "worker_use_ray",
		"ray-workers-use-nsight": "ray_workers_use_nsight",
		// Performance configuration
		"block-size": "block_size",
		"enable-prefix-caching": "enable_prefix_caching",
		"disable-sliding-window": "disable_sliding_window",
		"use-v2-block-manager": "use_v2_block_manager",
		"num-lookahead-slots": "num_lookahead_slots",
		"swap-space": "swap_space",
		"cpu-offload-gb": "cpu_offload_gb",
		"gpu-memory-utilization": "gpu_memory_utilization",
		"num-gpu-blocks-override": "num_gpu_blocks_override",
		"max-num-batched-tokens": "max_num_batched_tokens",
		"max-num-seqs": "max_num_seqs",
		"max-logprobs": "max_logprobs",
		"disable-log-stats": "disable_log_stats",
		"rope-scaling": "rope_scaling",
		"rope-theta": "rope_theta",
		"enforce-eager": "enforce_eager",
		"max-context-len-to-capture": "max_context_len_to_capture",
		"max-seq-len-to-capture": "max_seq_len_to_capture",
		"disable-custom-all-reduce": "disable_custom_all_reduce",
		"tokenizer-pool-size": "tokenizer_pool_size",
		"tokenizer-pool-type": "tokenizer_pool_type",
		"tokenizer-pool-extra-config": "tokenizer_pool_extra_config",
		"enable-lora-bias": "enable_lora_bias",
		"lora-extra-vocab-size": "lora_extra_vocab_size",
		"lora-rank": "lora_rank",
		"prompt-lookback-distance": "prompt_lookback_distance",
		"preemption-mode": "preemption_mode",
		// Server configuration
		"uvicorn-log-level": "uvicorn_log_level",
		"response-role": "response_role",
		"ssl-keyfile": "ssl_keyfile",
		"ssl-certfile": "ssl_certfile",
		"ssl-ca-certs": "ssl_ca_certs",
		"ssl-cert-reqs": "ssl_cert_reqs",
		"root-path": "root_path",
		"return-tokens-as-token-ids": "return_tokens_as_token_ids",
		"disable-frontend-multiprocessing": "disable_frontend_multiprocessing",
		"enable-auto-tool-choice": "enable_auto_tool_choice",
		"tool-call-parser": "tool_call_parser",
		"tool-server": "tool_server",
		"chat-template": "chat_template",
		"chat-template-content-format": "chat_template_content_format",
		"allow-credentials": "allow_credentials",
		"allowed-origins": "allowed_origins",
		"allowed-methods": "allowed_methods",
		"allowed-headers": "allowed_headers",
		"api-key": "api_key",
		"enable-log-outputs": "enable_log_outputs",
		"enable-token-usage": "enable_token_usage",
		"enable-async-engine-debug": "enable_async_engine_debug",
		"engine-use-ray": "engine_use_ray",
		"disable-log-requests": "disable_log_requests",
		"max-log-len": "max_log_len",
		// Additional options
		"multi-modal-config": "multi_modal_config",
		"limit-mm-per-prompt": "limit_mm_per_prompt",
		"enable-sleep-mode": "enable_sleep_mode",
		"enable-chunking-request": "enable_chunking_request",
		"compilation-config": "compilation_config",
		"disable-sliding-window-mask": "disable_sliding_window_mask",
		"enable-trtllm-engine-latency": "enable_trtllm_engine_latency",
		"override-pooling-config": "override_pooling_config",
		"override-neuron-config": "override_neuron_config",
		"override-kv-cache-align-size": "override_kv_cache_align_size",
	}

	// Process alternative field names
	for altName, canonicalName := range fieldMappings {
		if value, exists := raw[altName]; exists {
			// Use reflection to find the field whose json tag matches the
			// canonical name, then set its value
			v := reflect.ValueOf(o).Elem()
			field := v.FieldByNameFunc(func(fieldName string) bool {
				field, _ := v.Type().FieldByName(fieldName)
				jsonTag := field.Tag.Get("json")
				return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName
			})
			if field.IsValid() && field.CanSet() {
				switch field.Kind() {
				case reflect.Int:
					if intVal, ok := value.(float64); ok {
						field.SetInt(int64(intVal))
					} else if strVal, ok := value.(string); ok {
						if intVal, err := strconv.Atoi(strVal); err == nil {
							field.SetInt(int64(intVal))
						}
					}
				case reflect.Float64:
					if floatVal, ok := value.(float64); ok {
						field.SetFloat(floatVal)
					} else if strVal, ok := value.(string); ok {
						if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil {
							field.SetFloat(floatVal)
						}
					}
				case reflect.String:
					if strVal, ok := value.(string); ok {
						field.SetString(strVal)
					}
				case reflect.Bool:
					if boolVal, ok := value.(bool); ok {
						field.SetBool(boolVal)
					}
				case reflect.Slice:
					if field.Type().Elem().Kind() == reflect.String {
						if strVal, ok := value.(string); ok {
							// Split comma-separated values
							values := strings.Split(strVal, ",")
							for i, v := range values {
								values[i] = strings.TrimSpace(v)
							}
							field.Set(reflect.ValueOf(values))
						} else if slice, ok := value.([]interface{}); ok {
							var strSlice []string
							for _, item := range slice {
								if str, ok := item.(string); ok {
									strSlice = append(strSlice, str)
								}
							}
							field.Set(reflect.ValueOf(strSlice))
						}
					}
				}
			}
		}
	}
	return nil
}
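
// Author-added illustration: both spellings decode into the same field.
//
//	var a, b VllmServerOptions
//	_ = json.Unmarshal([]byte(`{"tensor_parallel_size": 4}`), &a) // a.TensorParallelSize == 4
//	_ = json.Unmarshal([]byte(`{"tensor-parallel-size": 4}`), &b) // b.TensorParallelSize == 4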
// BuildCommandArgs converts VllmServerOptions to command-line arguments.
// Note: this does NOT include the "serve" subcommand; that is handled at the
// instance level.
func (o *VllmServerOptions) BuildCommandArgs() []string {
	var args []string

	v := reflect.ValueOf(o).Elem()
	t := v.Type()

	for i := 0; i < v.NumField(); i++ {
		field := v.Field(i)
		fieldType := t.Field(i)

		// Skip unexported fields
		if !field.CanInterface() {
			continue
		}

		// The JSON tag determines the flag name
		jsonTag := fieldType.Tag.Get("json")
		if jsonTag == "" || jsonTag == "-" {
			continue
		}

		// Strip ",omitempty" from the tag
		flagName := jsonTag
		if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 {
			flagName = jsonTag[:commaIndex]
		}

		// Skip host and port; they are assigned by llamactl
		if flagName == "host" || flagName == "port" {
			continue
		}

		// Convert snake_case to kebab-case for CLI flags
		flagName = strings.ReplaceAll(flagName, "_", "-")

		// Emit arguments based on the field type and value
		switch field.Kind() {
		case reflect.Bool:
			if field.Bool() {
				args = append(args, "--"+flagName)
			}
		case reflect.Int:
			if field.Int() != 0 {
				args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10))
			}
		case reflect.Float64:
			if field.Float() != 0 {
				args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64))
			}
		case reflect.String:
			if field.String() != "" {
				args = append(args, "--"+flagName, field.String())
			}
		case reflect.Slice:
			if field.Type().Elem().Kind() == reflect.String {
				// []string fields: some flags are repeated per value, the
				// rest take a single comma-separated value
				if flagName == "api-key" || flagName == "allowed-origins" || flagName == "allowed-methods" || flagName == "allowed-headers" || flagName == "middleware" {
					// Repeat the flag once per value
					for j := 0; j < field.Len(); j++ {
						args = append(args, "--"+flagName, field.Index(j).String())
					}
				} else if field.Len() > 0 {
					// Comma-separated for the rest
					var values []string
					for j := 0; j < field.Len(); j++ {
						values = append(values, field.Index(j).String())
					}
					args = append(args, "--"+flagName, strings.Join(values, ","))
				}
			}
		}
	}

	return args
}
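
// Author-added illustration (field values hypothetical): for
//
//	opts := VllmServerOptions{Model: "m", TensorParallelSize: 2, EnableLogOutputs: true}
//
// BuildCommandArgs yields, in struct field order:
//
//	["--model", "m", "--tensor-parallel-size", "2", "--enable-log-outputs"]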


@@ -0,0 +1,106 @@
package vllm_test

import (
	"encoding/json"
	"llamactl/pkg/backends/vllm"
	"slices"
	"testing"
)

func TestBuildCommandArgs(t *testing.T) {
	options := vllm.VllmServerOptions{
		Model: "microsoft/DialoGPT-medium",
		Port: 8080,         // should be excluded
		Host: "localhost",  // should be excluded
		TensorParallelSize: 2,
		GPUMemoryUtilization: 0.8,
		EnableLogOutputs: true,
		APIKey: []string{"key1", "key2"},
	}

	args := options.BuildCommandArgs()

	// Check core functionality
	if !containsFlagWithValue(args, "--model", "microsoft/DialoGPT-medium") {
		t.Errorf("Expected --model microsoft/DialoGPT-medium not found in %v", args)
	}
	if !containsFlagWithValue(args, "--tensor-parallel-size", "2") {
		t.Errorf("Expected --tensor-parallel-size 2 not found in %v", args)
	}
	if !contains(args, "--enable-log-outputs") {
		t.Errorf("Expected --enable-log-outputs not found in %v", args)
	}

	// Host and port should NOT be in the arguments (handled by llamactl)
	if contains(args, "--host") || contains(args, "--port") {
		t.Errorf("Host and port should not be in command args, found in %v", args)
	}

	// Check array handling (repeated flags)
	apiKeyCount := 0
	for i := range args {
		if args[i] == "--api-key" {
			apiKeyCount++
		}
	}
	if apiKeyCount != 2 {
		t.Errorf("Expected 2 --api-key flags, got %d", apiKeyCount)
	}
}

func TestUnmarshalJSON(t *testing.T) {
	// Both underscore and dash formats should decode
	jsonData := `{
		"model": "test-model",
		"tensor_parallel_size": 4,
		"gpu-memory-utilization": 0.9,
		"enable-log-outputs": true
	}`

	var options vllm.VllmServerOptions
	if err := json.Unmarshal([]byte(jsonData), &options); err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}

	if options.Model != "test-model" {
		t.Errorf("Expected model 'test-model', got %q", options.Model)
	}
	if options.TensorParallelSize != 4 {
		t.Errorf("Expected tensor_parallel_size 4, got %d", options.TensorParallelSize)
	}
	if options.GPUMemoryUtilization != 0.9 {
		t.Errorf("Expected gpu_memory_utilization 0.9, got %f", options.GPUMemoryUtilization)
	}
	if !options.EnableLogOutputs {
		t.Errorf("Expected enable_log_outputs true, got %v", options.EnableLogOutputs)
	}
}

func TestNewVllmServerOptions(t *testing.T) {
	options := vllm.NewVllmServerOptions()
	if options == nil {
		t.Fatal("NewVllmServerOptions returned nil")
	}
	if options.Host != "127.0.0.1" {
		t.Errorf("Expected default host '127.0.0.1', got %q", options.Host)
	}
	if options.Port != 8000 {
		t.Errorf("Expected default port 8000, got %d", options.Port)
	}
}

// Helper functions

func contains(slice []string, item string) bool {
	return slices.Contains(slice, item)
}

func containsFlagWithValue(args []string, flag, value string) bool {
	for i, arg := range args {
		if arg == flag && i+1 < len(args) && args[i+1] == value {
			return true
		}
	}
	return false
}