// Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-06 00:54:23 +00:00).
// Commit: Initial vLLM backend support.
// File: pkg/backends/vllm/vllm.go (new file, 439 lines; diff hunk @@ -0,0 +1,439 @@).

package vllm
|
||||
|
||||
import (
	"encoding/json"
	"fmt"
	"reflect"
	"strconv"
	"strings"
)
|
||||
|
||||
// VllmServerOptions holds the options that can be passed to a vLLM server.
//
// Each field's json tag is its canonical (snake_case) configuration key.
// BuildCommandArgs derives the command-line flag for a field from the same tag
// by replacing underscores with dashes, and emits flags in field declaration
// order, so reordering or retagging fields changes the generated command line.
// UnmarshalJSON additionally accepts the dashed spelling of every key.
//
// All fields use omitempty, and BuildCommandArgs skips zero values, so an
// option left at its zero value is neither serialized nor passed to vLLM.
type VllmServerOptions struct {
	// Basic connection options (auto-assigned by llamactl)
	Host string `json:"host,omitempty"`
	Port int    `json:"port,omitempty"`

	// Model and engine configuration
	Model                      string `json:"model,omitempty"`
	Tokenizer                  string `json:"tokenizer,omitempty"`
	SkipTokenizerInit          bool   `json:"skip_tokenizer_init,omitempty"`
	Revision                   string `json:"revision,omitempty"`
	CodeRevision               string `json:"code_revision,omitempty"`
	TokenizerRevision          string `json:"tokenizer_revision,omitempty"`
	TokenizerMode              string `json:"tokenizer_mode,omitempty"`
	TrustRemoteCode            bool   `json:"trust_remote_code,omitempty"`
	DownloadDir                string `json:"download_dir,omitempty"`
	LoadFormat                 string `json:"load_format,omitempty"`
	ConfigFormat               string `json:"config_format,omitempty"`
	Dtype                      string `json:"dtype,omitempty"`
	KVCacheDtype               string `json:"kv_cache_dtype,omitempty"`
	QuantizationParamPath      string `json:"quantization_param_path,omitempty"`
	Seed                       int    `json:"seed,omitempty"`
	MaxModelLen                int    `json:"max_model_len,omitempty"`
	GuidedDecodingBackend      string `json:"guided_decoding_backend,omitempty"`
	DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"`
	WorkerUseRay               bool   `json:"worker_use_ray,omitempty"`
	RayWorkersUseNSight        bool   `json:"ray_workers_use_nsight,omitempty"`

	// Performance and serving configuration
	BlockSize                int     `json:"block_size,omitempty"`
	EnablePrefixCaching      bool    `json:"enable_prefix_caching,omitempty"`
	DisableSlidingWindow     bool    `json:"disable_sliding_window,omitempty"`
	UseV2BlockManager        bool    `json:"use_v2_block_manager,omitempty"`
	NumLookaheadSlots        int     `json:"num_lookahead_slots,omitempty"`
	SwapSpace                int     `json:"swap_space,omitempty"`
	CPUOffloadGB             int     `json:"cpu_offload_gb,omitempty"`
	GPUMemoryUtilization     float64 `json:"gpu_memory_utilization,omitempty"`
	NumGPUBlocksOverride     int     `json:"num_gpu_blocks_override,omitempty"`
	MaxNumBatchedTokens      int     `json:"max_num_batched_tokens,omitempty"`
	MaxNumSeqs               int     `json:"max_num_seqs,omitempty"`
	MaxLogprobs              int     `json:"max_logprobs,omitempty"`
	DisableLogStats          bool    `json:"disable_log_stats,omitempty"`
	Quantization             string  `json:"quantization,omitempty"`
	RopeScaling              string  `json:"rope_scaling,omitempty"`
	RopeTheta                float64 `json:"rope_theta,omitempty"`
	EnforceEager             bool    `json:"enforce_eager,omitempty"`
	MaxContextLenToCapture   int     `json:"max_context_len_to_capture,omitempty"`
	MaxSeqLenToCapture       int     `json:"max_seq_len_to_capture,omitempty"`
	DisableCustomAllReduce   bool    `json:"disable_custom_all_reduce,omitempty"`
	TokenizerPoolSize        int     `json:"tokenizer_pool_size,omitempty"`
	TokenizerPoolType        string  `json:"tokenizer_pool_type,omitempty"`
	TokenizerPoolExtraConfig string  `json:"tokenizer_pool_extra_config,omitempty"`
	EnableLoraBias           bool    `json:"enable_lora_bias,omitempty"`
	LoraExtraVocabSize       int     `json:"lora_extra_vocab_size,omitempty"`
	LoraRank                 int     `json:"lora_rank,omitempty"`
	PromptLookbackDistance   int     `json:"prompt_lookback_distance,omitempty"`
	PreemptionMode           string  `json:"preemption_mode,omitempty"`

	// Distributed and parallel processing
	TensorParallelSize            int     `json:"tensor_parallel_size,omitempty"`
	PipelineParallelSize          int     `json:"pipeline_parallel_size,omitempty"`
	MaxParallelLoadingWorkers     int     `json:"max_parallel_loading_workers,omitempty"`
	DisableAsyncOutputProc        bool    `json:"disable_async_output_proc,omitempty"`
	WorkerClass                   string  `json:"worker_class,omitempty"`
	EnabledLoraModules            string  `json:"enabled_lora_modules,omitempty"`
	MaxLoraRank                   int     `json:"max_lora_rank,omitempty"`
	FullyShardedLoras             bool    `json:"fully_sharded_loras,omitempty"`
	LoraModules                   string  `json:"lora_modules,omitempty"`
	PromptAdapters                string  `json:"prompt_adapters,omitempty"`
	MaxPromptAdapterToken         int     `json:"max_prompt_adapter_token,omitempty"`
	Device                        string  `json:"device,omitempty"`
	SchedulerDelay                float64 `json:"scheduler_delay,omitempty"`
	EnableChunkedPrefill          bool    `json:"enable_chunked_prefill,omitempty"`
	SpeculativeModel              string  `json:"speculative_model,omitempty"`
	SpeculativeModelQuantization  string  `json:"speculative_model_quantization,omitempty"`
	SpeculativeRevision           string  `json:"speculative_revision,omitempty"`
	SpeculativeMaxModelLen        int     `json:"speculative_max_model_len,omitempty"`
	SpeculativeDisableByBatchSize int     `json:"speculative_disable_by_batch_size,omitempty"`
	NgptSpeculativeLength         int     `json:"ngpt_speculative_length,omitempty"`
	SpeculativeDisableMqa         bool    `json:"speculative_disable_mqa,omitempty"`
	ModelLoaderExtraConfig        string  `json:"model_loader_extra_config,omitempty"`
	IgnorePatterns                string  `json:"ignore_patterns,omitempty"`
	PreloadedLoraModules          string  `json:"preloaded_lora_modules,omitempty"`

	// OpenAI server specific options
	UDS                            string   `json:"uds,omitempty"`
	UvicornLogLevel                string   `json:"uvicorn_log_level,omitempty"`
	ResponseRole                   string   `json:"response_role,omitempty"`
	SSLKeyfile                     string   `json:"ssl_keyfile,omitempty"`
	SSLCertfile                    string   `json:"ssl_certfile,omitempty"`
	SSLCACerts                     string   `json:"ssl_ca_certs,omitempty"`
	SSLCertReqs                    int      `json:"ssl_cert_reqs,omitempty"`
	RootPath                       string   `json:"root_path,omitempty"`
	Middleware                     []string `json:"middleware,omitempty"`
	ReturnTokensAsTokenIDS         bool     `json:"return_tokens_as_token_ids,omitempty"`
	DisableFrontendMultiprocessing bool     `json:"disable_frontend_multiprocessing,omitempty"`
	EnableAutoToolChoice           bool     `json:"enable_auto_tool_choice,omitempty"`
	ToolCallParser                 string   `json:"tool_call_parser,omitempty"`
	ToolServer                     string   `json:"tool_server,omitempty"`
	ChatTemplate                   string   `json:"chat_template,omitempty"`
	ChatTemplateContentFormat      string   `json:"chat_template_content_format,omitempty"`
	AllowCredentials               bool     `json:"allow_credentials,omitempty"`
	AllowedOrigins                 []string `json:"allowed_origins,omitempty"`
	AllowedMethods                 []string `json:"allowed_methods,omitempty"`
	AllowedHeaders                 []string `json:"allowed_headers,omitempty"`
	APIKey                         []string `json:"api_key,omitempty"`
	EnableLogOutputs               bool     `json:"enable_log_outputs,omitempty"`
	EnableTokenUsage               bool     `json:"enable_token_usage,omitempty"`
	EnableAsyncEngineDebug         bool     `json:"enable_async_engine_debug,omitempty"`
	EngineUseRay                   bool     `json:"engine_use_ray,omitempty"`
	DisableLogRequests             bool     `json:"disable_log_requests,omitempty"`
	MaxLogLen                      int      `json:"max_log_len,omitempty"`

	// Additional engine configuration
	Task                      string `json:"task,omitempty"`
	MultiModalConfig          string `json:"multi_modal_config,omitempty"`
	LimitMmPerPrompt          string `json:"limit_mm_per_prompt,omitempty"`
	EnableSleepMode           bool   `json:"enable_sleep_mode,omitempty"`
	EnableChunkingRequest     bool   `json:"enable_chunking_request,omitempty"`
	CompilationConfig         string `json:"compilation_config,omitempty"`
	DisableSlidingWindowMask  bool   `json:"disable_sliding_window_mask,omitempty"`
	EnableTRTLLMEngineLatency bool   `json:"enable_trtllm_engine_latency,omitempty"`
	OverridePoolingConfig     string `json:"override_pooling_config,omitempty"`
	OverrideNeuronConfig      string `json:"override_neuron_config,omitempty"`
	OverrideKVCacheALIGNSize  int    `json:"override_kv_cache_align_size,omitempty"`
}
|
||||
|
||||
// NewVllmServerOptions creates a new VllmServerOptions with defaults
|
||||
func NewVllmServerOptions() *VllmServerOptions {
|
||||
return &VllmServerOptions{
|
||||
Host: "127.0.0.1",
|
||||
Port: 8000,
|
||||
TensorParallelSize: 1,
|
||||
PipelineParallelSize: 1,
|
||||
GPUMemoryUtilization: 0.9,
|
||||
BlockSize: 16,
|
||||
SwapSpace: 4,
|
||||
UvicornLogLevel: "info",
|
||||
ResponseRole: "assistant",
|
||||
TokenizerMode: "auto",
|
||||
TrustRemoteCode: false,
|
||||
EnablePrefixCaching: false,
|
||||
EnforceEager: false,
|
||||
DisableLogStats: false,
|
||||
DisableLogRequests: false,
|
||||
MaxLogprobs: 20,
|
||||
EnableLogOutputs: false,
|
||||
EnableTokenUsage: false,
|
||||
AllowCredentials: false,
|
||||
AllowedOrigins: []string{"*"},
|
||||
AllowedMethods: []string{"*"},
|
||||
AllowedHeaders: []string{"*"},
|
||||
}
|
||||
}
|
||||
|
||||
// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names
|
||||
func (o *VllmServerOptions) UnmarshalJSON(data []byte) error {
|
||||
// First unmarshal into a map to handle multiple field names
|
||||
var raw map[string]any
|
||||
if err := json.Unmarshal(data, &raw); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Create a temporary struct for standard unmarshaling
|
||||
type tempOptions VllmServerOptions
|
||||
temp := tempOptions{}
|
||||
|
||||
// Standard unmarshal first
|
||||
if err := json.Unmarshal(data, &temp); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Copy to our struct
|
||||
*o = VllmServerOptions(temp)
|
||||
|
||||
// Handle alternative field names (CLI format with dashes)
|
||||
fieldMappings := map[string]string{
|
||||
// Basic options
|
||||
"tensor-parallel-size": "tensor_parallel_size",
|
||||
"pipeline-parallel-size": "pipeline_parallel_size",
|
||||
"max-parallel-loading-workers": "max_parallel_loading_workers",
|
||||
"disable-async-output-proc": "disable_async_output_proc",
|
||||
"worker-class": "worker_class",
|
||||
"enabled-lora-modules": "enabled_lora_modules",
|
||||
"max-lora-rank": "max_lora_rank",
|
||||
"fully-sharded-loras": "fully_sharded_loras",
|
||||
"lora-modules": "lora_modules",
|
||||
"prompt-adapters": "prompt_adapters",
|
||||
"max-prompt-adapter-token": "max_prompt_adapter_token",
|
||||
"scheduler-delay": "scheduler_delay",
|
||||
"enable-chunked-prefill": "enable_chunked_prefill",
|
||||
"speculative-model": "speculative_model",
|
||||
"speculative-model-quantization": "speculative_model_quantization",
|
||||
"speculative-revision": "speculative_revision",
|
||||
"speculative-max-model-len": "speculative_max_model_len",
|
||||
"speculative-disable-by-batch-size": "speculative_disable_by_batch_size",
|
||||
"ngpt-speculative-length": "ngpt_speculative_length",
|
||||
"speculative-disable-mqa": "speculative_disable_mqa",
|
||||
"model-loader-extra-config": "model_loader_extra_config",
|
||||
"ignore-patterns": "ignore_patterns",
|
||||
"preloaded-lora-modules": "preloaded_lora_modules",
|
||||
|
||||
// Model configuration
|
||||
"skip-tokenizer-init": "skip_tokenizer_init",
|
||||
"code-revision": "code_revision",
|
||||
"tokenizer-revision": "tokenizer_revision",
|
||||
"tokenizer-mode": "tokenizer_mode",
|
||||
"trust-remote-code": "trust_remote_code",
|
||||
"download-dir": "download_dir",
|
||||
"load-format": "load_format",
|
||||
"config-format": "config_format",
|
||||
"kv-cache-dtype": "kv_cache_dtype",
|
||||
"quantization-param-path": "quantization_param_path",
|
||||
"max-model-len": "max_model_len",
|
||||
"guided-decoding-backend": "guided_decoding_backend",
|
||||
"distributed-executor-backend": "distributed_executor_backend",
|
||||
"worker-use-ray": "worker_use_ray",
|
||||
"ray-workers-use-nsight": "ray_workers_use_nsight",
|
||||
|
||||
// Performance configuration
|
||||
"block-size": "block_size",
|
||||
"enable-prefix-caching": "enable_prefix_caching",
|
||||
"disable-sliding-window": "disable_sliding_window",
|
||||
"use-v2-block-manager": "use_v2_block_manager",
|
||||
"num-lookahead-slots": "num_lookahead_slots",
|
||||
"swap-space": "swap_space",
|
||||
"cpu-offload-gb": "cpu_offload_gb",
|
||||
"gpu-memory-utilization": "gpu_memory_utilization",
|
||||
"num-gpu-blocks-override": "num_gpu_blocks_override",
|
||||
"max-num-batched-tokens": "max_num_batched_tokens",
|
||||
"max-num-seqs": "max_num_seqs",
|
||||
"max-logprobs": "max_logprobs",
|
||||
"disable-log-stats": "disable_log_stats",
|
||||
"rope-scaling": "rope_scaling",
|
||||
"rope-theta": "rope_theta",
|
||||
"enforce-eager": "enforce_eager",
|
||||
"max-context-len-to-capture": "max_context_len_to_capture",
|
||||
"max-seq-len-to-capture": "max_seq_len_to_capture",
|
||||
"disable-custom-all-reduce": "disable_custom_all_reduce",
|
||||
"tokenizer-pool-size": "tokenizer_pool_size",
|
||||
"tokenizer-pool-type": "tokenizer_pool_type",
|
||||
"tokenizer-pool-extra-config": "tokenizer_pool_extra_config",
|
||||
"enable-lora-bias": "enable_lora_bias",
|
||||
"lora-extra-vocab-size": "lora_extra_vocab_size",
|
||||
"lora-rank": "lora_rank",
|
||||
"prompt-lookback-distance": "prompt_lookback_distance",
|
||||
"preemption-mode": "preemption_mode",
|
||||
|
||||
// Server configuration
|
||||
"uvicorn-log-level": "uvicorn_log_level",
|
||||
"response-role": "response_role",
|
||||
"ssl-keyfile": "ssl_keyfile",
|
||||
"ssl-certfile": "ssl_certfile",
|
||||
"ssl-ca-certs": "ssl_ca_certs",
|
||||
"ssl-cert-reqs": "ssl_cert_reqs",
|
||||
"root-path": "root_path",
|
||||
"return-tokens-as-token-ids": "return_tokens_as_token_ids",
|
||||
"disable-frontend-multiprocessing": "disable_frontend_multiprocessing",
|
||||
"enable-auto-tool-choice": "enable_auto_tool_choice",
|
||||
"tool-call-parser": "tool_call_parser",
|
||||
"tool-server": "tool_server",
|
||||
"chat-template": "chat_template",
|
||||
"chat-template-content-format": "chat_template_content_format",
|
||||
"allow-credentials": "allow_credentials",
|
||||
"allowed-origins": "allowed_origins",
|
||||
"allowed-methods": "allowed_methods",
|
||||
"allowed-headers": "allowed_headers",
|
||||
"api-key": "api_key",
|
||||
"enable-log-outputs": "enable_log_outputs",
|
||||
"enable-token-usage": "enable_token_usage",
|
||||
"enable-async-engine-debug": "enable_async_engine_debug",
|
||||
"engine-use-ray": "engine_use_ray",
|
||||
"disable-log-requests": "disable_log_requests",
|
||||
"max-log-len": "max_log_len",
|
||||
|
||||
// Additional options
|
||||
"multi-modal-config": "multi_modal_config",
|
||||
"limit-mm-per-prompt": "limit_mm_per_prompt",
|
||||
"enable-sleep-mode": "enable_sleep_mode",
|
||||
"enable-chunking-request": "enable_chunking_request",
|
||||
"compilation-config": "compilation_config",
|
||||
"disable-sliding-window-mask": "disable_sliding_window_mask",
|
||||
"enable-trtllm-engine-latency": "enable_trtllm_engine_latency",
|
||||
"override-pooling-config": "override_pooling_config",
|
||||
"override-neuron-config": "override_neuron_config",
|
||||
"override-kv-cache-align-size": "override_kv_cache_align_size",
|
||||
}
|
||||
|
||||
// Process alternative field names
|
||||
for altName, canonicalName := range fieldMappings {
|
||||
if value, exists := raw[altName]; exists {
|
||||
// Use reflection to set the field value
|
||||
v := reflect.ValueOf(o).Elem()
|
||||
field := v.FieldByNameFunc(func(fieldName string) bool {
|
||||
field, _ := v.Type().FieldByName(fieldName)
|
||||
jsonTag := field.Tag.Get("json")
|
||||
return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName
|
||||
})
|
||||
|
||||
if field.IsValid() && field.CanSet() {
|
||||
switch field.Kind() {
|
||||
case reflect.Int:
|
||||
if intVal, ok := value.(float64); ok {
|
||||
field.SetInt(int64(intVal))
|
||||
} else if strVal, ok := value.(string); ok {
|
||||
if intVal, err := strconv.Atoi(strVal); err == nil {
|
||||
field.SetInt(int64(intVal))
|
||||
}
|
||||
}
|
||||
case reflect.Float64:
|
||||
if floatVal, ok := value.(float64); ok {
|
||||
field.SetFloat(floatVal)
|
||||
} else if strVal, ok := value.(string); ok {
|
||||
if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil {
|
||||
field.SetFloat(floatVal)
|
||||
}
|
||||
}
|
||||
case reflect.String:
|
||||
if strVal, ok := value.(string); ok {
|
||||
field.SetString(strVal)
|
||||
}
|
||||
case reflect.Bool:
|
||||
if boolVal, ok := value.(bool); ok {
|
||||
field.SetBool(boolVal)
|
||||
}
|
||||
case reflect.Slice:
|
||||
if field.Type().Elem().Kind() == reflect.String {
|
||||
if strVal, ok := value.(string); ok {
|
||||
// Split comma-separated values
|
||||
values := strings.Split(strVal, ",")
|
||||
for i, v := range values {
|
||||
values[i] = strings.TrimSpace(v)
|
||||
}
|
||||
field.Set(reflect.ValueOf(values))
|
||||
} else if slice, ok := value.([]interface{}); ok {
|
||||
var strSlice []string
|
||||
for _, item := range slice {
|
||||
if str, ok := item.(string); ok {
|
||||
strSlice = append(strSlice, str)
|
||||
}
|
||||
}
|
||||
field.Set(reflect.ValueOf(strSlice))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// BuildCommandArgs converts VllmServerOptions to command line arguments
|
||||
// Note: This does NOT include the "serve" subcommand, that's handled at the instance level
|
||||
func (o *VllmServerOptions) BuildCommandArgs() []string {
|
||||
var args []string
|
||||
|
||||
v := reflect.ValueOf(o).Elem()
|
||||
t := v.Type()
|
||||
|
||||
for i := 0; i < v.NumField(); i++ {
|
||||
field := v.Field(i)
|
||||
fieldType := t.Field(i)
|
||||
|
||||
// Skip unexported fields
|
||||
if !field.CanInterface() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Get the JSON tag to determine the flag name
|
||||
jsonTag := fieldType.Tag.Get("json")
|
||||
if jsonTag == "" || jsonTag == "-" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Remove ",omitempty" from the tag
|
||||
flagName := jsonTag
|
||||
if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 {
|
||||
flagName = jsonTag[:commaIndex]
|
||||
}
|
||||
|
||||
// Skip host and port as they are handled by llamactl
|
||||
if flagName == "host" || flagName == "port" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Convert snake_case to kebab-case for CLI flags
|
||||
flagName = strings.ReplaceAll(flagName, "_", "-")
|
||||
|
||||
// Add the appropriate arguments based on field type and value
|
||||
switch field.Kind() {
|
||||
case reflect.Bool:
|
||||
if field.Bool() {
|
||||
args = append(args, "--"+flagName)
|
||||
}
|
||||
case reflect.Int:
|
||||
if field.Int() != 0 {
|
||||
args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10))
|
||||
}
|
||||
case reflect.Float64:
|
||||
if field.Float() != 0 {
|
||||
args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64))
|
||||
}
|
||||
case reflect.String:
|
||||
if field.String() != "" {
|
||||
args = append(args, "--"+flagName, field.String())
|
||||
}
|
||||
case reflect.Slice:
|
||||
if field.Type().Elem().Kind() == reflect.String {
|
||||
// Handle []string fields - some are comma-separated, some use multiple flags
|
||||
if flagName == "api-key" || flagName == "allowed-origins" || flagName == "allowed-methods" || flagName == "allowed-headers" || flagName == "middleware" {
|
||||
// Multiple flags for these
|
||||
for j := 0; j < field.Len(); j++ {
|
||||
args = append(args, "--"+flagName, field.Index(j).String())
|
||||
}
|
||||
} else {
|
||||
// Comma-separated for others
|
||||
if field.Len() > 0 {
|
||||
var values []string
|
||||
for j := 0; j < field.Len(); j++ {
|
||||
values = append(values, field.Index(j).String())
|
||||
}
|
||||
args = append(args, "--"+flagName, strings.Join(values, ","))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return args
|
||||
}
|
||||
Reference in New Issue
Block a user