mirror of
https://github.com/lordmathis/llamactl.git
synced 2025-11-06 00:54:23 +00:00
148 lines
8.9 KiB
Go
148 lines
8.9 KiB
Go
package vllm
|
|
|
|
import (
|
|
"llamactl/pkg/backends"
|
|
)
|
|
|
|
// VllmServerOptions holds the settings that BuildCommandArgs turns into
// command line arguments for a vLLM server.
//
// Every field carries an `omitempty` JSON tag, so zero values (empty string,
// 0, false, nil slice) are left out when the struct is marshalled to JSON.
//
// NOTE(review): the fields are handed to backends.BuildCommandArgs as a whole;
// field order and tag names are presumably significant to that builder, so
// keep both stable — confirm against the backends package before reordering.
type VllmServerOptions struct {
	// Basic connection options (auto-assigned by llamactl).
	Host string `json:"host,omitempty"`
	Port int    `json:"port,omitempty"`

	// Model and engine configuration.
	Model                      string `json:"model,omitempty"`
	Tokenizer                  string `json:"tokenizer,omitempty"`
	SkipTokenizerInit          bool   `json:"skip_tokenizer_init,omitempty"`
	Revision                   string `json:"revision,omitempty"`
	CodeRevision               string `json:"code_revision,omitempty"`
	TokenizerRevision          string `json:"tokenizer_revision,omitempty"`
	TokenizerMode              string `json:"tokenizer_mode,omitempty"`
	TrustRemoteCode            bool   `json:"trust_remote_code,omitempty"`
	DownloadDir                string `json:"download_dir,omitempty"`
	LoadFormat                 string `json:"load_format,omitempty"`
	ConfigFormat               string `json:"config_format,omitempty"`
	Dtype                      string `json:"dtype,omitempty"`
	KVCacheDtype               string `json:"kv_cache_dtype,omitempty"`
	QuantizationParamPath      string `json:"quantization_param_path,omitempty"`
	Seed                       int    `json:"seed,omitempty"`
	MaxModelLen                int    `json:"max_model_len,omitempty"`
	GuidedDecodingBackend      string `json:"guided_decoding_backend,omitempty"`
	DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"`
	WorkerUseRay               bool   `json:"worker_use_ray,omitempty"`
	RayWorkersUseNSight        bool   `json:"ray_workers_use_nsight,omitempty"`

	// Performance and serving configuration.
	BlockSize                int     `json:"block_size,omitempty"`
	EnablePrefixCaching      bool    `json:"enable_prefix_caching,omitempty"`
	DisableSlidingWindow     bool    `json:"disable_sliding_window,omitempty"`
	UseV2BlockManager        bool    `json:"use_v2_block_manager,omitempty"`
	NumLookaheadSlots        int     `json:"num_lookahead_slots,omitempty"`
	SwapSpace                int     `json:"swap_space,omitempty"`
	CPUOffloadGB             int     `json:"cpu_offload_gb,omitempty"`
	GPUMemoryUtilization     float64 `json:"gpu_memory_utilization,omitempty"`
	NumGPUBlocksOverride     int     `json:"num_gpu_blocks_override,omitempty"`
	MaxNumBatchedTokens      int     `json:"max_num_batched_tokens,omitempty"`
	MaxNumSeqs               int     `json:"max_num_seqs,omitempty"`
	MaxLogprobs              int     `json:"max_logprobs,omitempty"`
	DisableLogStats          bool    `json:"disable_log_stats,omitempty"`
	Quantization             string  `json:"quantization,omitempty"`
	RopeScaling              string  `json:"rope_scaling,omitempty"`
	RopeTheta                float64 `json:"rope_theta,omitempty"`
	EnforceEager             bool    `json:"enforce_eager,omitempty"`
	MaxContextLenToCapture   int     `json:"max_context_len_to_capture,omitempty"`
	MaxSeqLenToCapture       int     `json:"max_seq_len_to_capture,omitempty"`
	DisableCustomAllReduce   bool    `json:"disable_custom_all_reduce,omitempty"`
	TokenizerPoolSize        int     `json:"tokenizer_pool_size,omitempty"`
	TokenizerPoolType        string  `json:"tokenizer_pool_type,omitempty"`
	TokenizerPoolExtraConfig string  `json:"tokenizer_pool_extra_config,omitempty"`
	EnableLoraBias           bool    `json:"enable_lora_bias,omitempty"`
	LoraExtraVocabSize       int     `json:"lora_extra_vocab_size,omitempty"`
	LoraRank                 int     `json:"lora_rank,omitempty"`
	PromptLookbackDistance   int     `json:"prompt_lookback_distance,omitempty"`
	PreemptionMode           string  `json:"preemption_mode,omitempty"`

	// Distributed and parallel processing.
	TensorParallelSize            int     `json:"tensor_parallel_size,omitempty"`
	PipelineParallelSize          int     `json:"pipeline_parallel_size,omitempty"`
	MaxParallelLoadingWorkers     int     `json:"max_parallel_loading_workers,omitempty"`
	DisableAsyncOutputProc        bool    `json:"disable_async_output_proc,omitempty"`
	WorkerClass                   string  `json:"worker_class,omitempty"`
	EnabledLoraModules            string  `json:"enabled_lora_modules,omitempty"`
	MaxLoraRank                   int     `json:"max_lora_rank,omitempty"`
	FullyShardedLoras             bool    `json:"fully_sharded_loras,omitempty"`
	LoraModules                   string  `json:"lora_modules,omitempty"`
	PromptAdapters                string  `json:"prompt_adapters,omitempty"`
	MaxPromptAdapterToken         int     `json:"max_prompt_adapter_token,omitempty"`
	Device                        string  `json:"device,omitempty"`
	SchedulerDelay                float64 `json:"scheduler_delay,omitempty"`
	EnableChunkedPrefill          bool    `json:"enable_chunked_prefill,omitempty"`
	SpeculativeModel              string  `json:"speculative_model,omitempty"`
	SpeculativeModelQuantization  string  `json:"speculative_model_quantization,omitempty"`
	SpeculativeRevision           string  `json:"speculative_revision,omitempty"`
	SpeculativeMaxModelLen        int     `json:"speculative_max_model_len,omitempty"`
	SpeculativeDisableByBatchSize int     `json:"speculative_disable_by_batch_size,omitempty"`
	NgptSpeculativeLength         int     `json:"ngpt_speculative_length,omitempty"`
	SpeculativeDisableMqa         bool    `json:"speculative_disable_mqa,omitempty"`
	ModelLoaderExtraConfig        string  `json:"model_loader_extra_config,omitempty"`
	IgnorePatterns                string  `json:"ignore_patterns,omitempty"`
	PreloadedLoraModules          string  `json:"preloaded_lora_modules,omitempty"`

	// OpenAI server specific options.
	UDS                            string   `json:"uds,omitempty"`
	UvicornLogLevel                string   `json:"uvicorn_log_level,omitempty"`
	ResponseRole                   string   `json:"response_role,omitempty"`
	SSLKeyfile                     string   `json:"ssl_keyfile,omitempty"`
	SSLCertfile                    string   `json:"ssl_certfile,omitempty"`
	SSLCACerts                     string   `json:"ssl_ca_certs,omitempty"`
	SSLCertReqs                    int      `json:"ssl_cert_reqs,omitempty"`
	RootPath                       string   `json:"root_path,omitempty"`
	Middleware                     []string `json:"middleware,omitempty"`
	ReturnTokensAsTokenIDS         bool     `json:"return_tokens_as_token_ids,omitempty"`
	DisableFrontendMultiprocessing bool     `json:"disable_frontend_multiprocessing,omitempty"`
	EnableAutoToolChoice           bool     `json:"enable_auto_tool_choice,omitempty"`
	ToolCallParser                 string   `json:"tool_call_parser,omitempty"`
	ToolServer                     string   `json:"tool_server,omitempty"`
	ChatTemplate                   string   `json:"chat_template,omitempty"`
	ChatTemplateContentFormat      string   `json:"chat_template_content_format,omitempty"`
	AllowCredentials               bool     `json:"allow_credentials,omitempty"`
	AllowedOrigins                 []string `json:"allowed_origins,omitempty"`
	AllowedMethods                 []string `json:"allowed_methods,omitempty"`
	AllowedHeaders                 []string `json:"allowed_headers,omitempty"`
	APIKey                         []string `json:"api_key,omitempty"`
	EnableLogOutputs               bool     `json:"enable_log_outputs,omitempty"`
	EnableTokenUsage               bool     `json:"enable_token_usage,omitempty"`
	EnableAsyncEngineDebug         bool     `json:"enable_async_engine_debug,omitempty"`
	EngineUseRay                   bool     `json:"engine_use_ray,omitempty"`
	DisableLogRequests             bool     `json:"disable_log_requests,omitempty"`
	MaxLogLen                      int      `json:"max_log_len,omitempty"`

	// Additional engine configuration.
	Task                      string `json:"task,omitempty"`
	MultiModalConfig          string `json:"multi_modal_config,omitempty"`
	LimitMmPerPrompt          string `json:"limit_mm_per_prompt,omitempty"`
	EnableSleepMode           bool   `json:"enable_sleep_mode,omitempty"`
	EnableChunkingRequest     bool   `json:"enable_chunking_request,omitempty"`
	CompilationConfig         string `json:"compilation_config,omitempty"`
	DisableSlidingWindowMask  bool   `json:"disable_sliding_window_mask,omitempty"`
	EnableTRTLLMEngineLatency bool   `json:"enable_trtllm_engine_latency,omitempty"`
	OverridePoolingConfig     string `json:"override_pooling_config,omitempty"`
	OverrideNeuronConfig      string `json:"override_neuron_config,omitempty"`
	OverrideKVCacheALIGNSize  int    `json:"override_kv_cache_align_size,omitempty"`
}
|
|
|
|
// BuildCommandArgs converts VllmServerOptions to command line arguments using the common builder
|
|
// Note: This does NOT include the "serve" subcommand, that's handled at the instance level
|
|
func (o *VllmServerOptions) BuildCommandArgs() []string {
|
|
config := backends.ArgsBuilderConfig{
|
|
SliceHandling: backends.SliceAsMixed,
|
|
MultipleFlags: map[string]struct{}{
|
|
"api-key": {},
|
|
"allowed-origins": {},
|
|
"allowed-methods": {},
|
|
"allowed-headers": {},
|
|
"middleware": {},
|
|
},
|
|
}
|
|
return backends.BuildCommandArgs(o, config)
|
|
}
|