Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-05 16:44:22 +00:00)
Initial vLLM backend support
pkg/backends/backend.go
@@ -5,5 +5,6 @@ type BackendType string
 const (
     BackendTypeLlamaCpp BackendType = "llama_cpp"
     BackendTypeMlxLm    BackendType = "mlx_lm"
+    BackendTypeVllm     BackendType = "vllm"
     // BackendTypeMlxVlm BackendType = "mlx_vlm" // Future expansion
 )
pkg/backends/vllm/parser.go (new file, 302 lines)
@@ -0,0 +1,302 @@
package vllm

import (
    "encoding/json"
    "errors"
    "fmt"
    "path/filepath"
    "regexp"
    "strconv"
    "strings"
)

// ParseVllmCommand parses a vLLM serve command string into VllmServerOptions
// Supports multiple formats:
// 1. Full command: "vllm serve --model MODEL_NAME --other-args"
// 2. Full path: "/usr/local/bin/vllm serve --model MODEL_NAME"
// 3. Serve only: "serve --model MODEL_NAME --other-args"
// 4. Args only: "--model MODEL_NAME --other-args"
// 5. Multiline commands with backslashes
func ParseVllmCommand(command string) (*VllmServerOptions, error) {
    // 1. Normalize the command - handle multiline with backslashes
    trimmed := normalizeMultilineCommand(command)
    if trimmed == "" {
        return nil, fmt.Errorf("command cannot be empty")
    }

    // 2. Extract arguments from command
    args, err := extractArgumentsFromCommand(trimmed)
    if err != nil {
        return nil, err
    }

    // 3. Parse arguments into map
    options := make(map[string]any)

    // Known multi-valued flags (snake_case form)
    multiValued := map[string]struct{}{
        "middleware":      {},
        "api_key":         {},
        "allowed_origins": {},
        "allowed_methods": {},
        "allowed_headers": {},
        "lora_modules":    {},
        "prompt_adapters": {},
    }

    i := 0
    for i < len(args) {
        arg := args[i]

        if !strings.HasPrefix(arg, "-") { // skip positional / stray values
            i++
            continue
        }

        // Reject malformed flags with more than two leading dashes (e.g. ---model) to surface user mistakes
        if strings.HasPrefix(arg, "---") {
            return nil, fmt.Errorf("malformed flag: %s", arg)
        }

        // Unified parsing for --flag=value vs --flag value
        var rawFlag, rawValue string
        hasEquals := false
        if strings.Contains(arg, "=") {
            parts := strings.SplitN(arg, "=", 2)
            rawFlag = parts[0]
            rawValue = parts[1] // may be empty string
            hasEquals = true
        } else {
            rawFlag = arg
        }

        flagCore := strings.TrimPrefix(strings.TrimPrefix(rawFlag, "-"), "-")
        flagName := strings.ReplaceAll(flagCore, "-", "_")

        // Detect value if not in equals form
        valueProvided := hasEquals
        if !hasEquals {
            if i+1 < len(args) && !isFlag(args[i+1]) { // next token is value
                rawValue = args[i+1]
                valueProvided = true
            }
        }

        // Determine if multi-valued flag
        _, isMulti := multiValued[flagName]

        // Normalization helper: ensure slice for multi-valued flags
        appendValue := func(valStr string) {
            if existing, ok := options[flagName]; ok {
                // Existing value; ensure slice semantics for multi-valued flags or repeated occurrences
                if slice, ok := existing.([]string); ok {
                    options[flagName] = append(slice, valStr)
                    return
                }
                // Convert scalar to slice
                options[flagName] = []string{fmt.Sprintf("%v", existing), valStr}
                return
            }
            // First value
            if isMulti {
                options[flagName] = []string{valStr}
            } else {
                // We'll parse type below for single-valued flags
                options[flagName] = valStr
            }
        }

        if valueProvided {
            // Use raw token for multi-valued flags; else allow typed parsing
            appendValue(rawValue)
            if !isMulti { // convert to typed value if scalar
                if strVal, ok := options[flagName].(string); ok { // still scalar
                    options[flagName] = parseValue(strVal)
                }
            }
            // Advance index: if we consumed a following token as value (non equals form), skip it
            if !hasEquals && i+1 < len(args) && rawValue == args[i+1] {
                i += 2
            } else {
                i++
            }
            continue
        }

        // Boolean flag (no value)
        options[flagName] = true
        i++
    }

    // 4. Convert to VllmServerOptions using existing UnmarshalJSON
    jsonData, err := json.Marshal(options)
    if err != nil {
        return nil, fmt.Errorf("failed to marshal parsed options: %w", err)
    }

    var vllmOptions VllmServerOptions
    if err := json.Unmarshal(jsonData, &vllmOptions); err != nil {
        return nil, fmt.Errorf("failed to parse command options: %w", err)
    }

    // 5. Return VllmServerOptions
    return &vllmOptions, nil
}

// parseValue attempts to parse a string value into the most appropriate type
func parseValue(value string) any {
    // Strip surrounding matching quotes (single or double)
    if l := len(value); l >= 2 {
        if (value[0] == '"' && value[l-1] == '"') || (value[0] == '\'' && value[l-1] == '\'') {
            value = value[1 : l-1]
        }
    }

    lower := strings.ToLower(value)
    if lower == "true" {
        return true
    }
    if lower == "false" {
        return false
    }

    if intVal, err := strconv.Atoi(value); err == nil {
        return intVal
    }
    if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
        return floatVal
    }
    return value
}

// normalizeMultilineCommand handles multiline commands with backslashes
func normalizeMultilineCommand(command string) string {
    // Handle escaped newlines (backslash followed by newline)
    re := regexp.MustCompile(`\\\s*\n\s*`)
    normalized := re.ReplaceAllString(command, " ")

    // Clean up extra whitespace
    re = regexp.MustCompile(`\s+`)
    normalized = re.ReplaceAllString(normalized, " ")

    return strings.TrimSpace(normalized)
}

// extractArgumentsFromCommand extracts arguments from various command formats
func extractArgumentsFromCommand(command string) ([]string, error) {
    // Split command into tokens respecting quotes
    tokens, err := splitCommandTokens(command)
    if err != nil {
        return nil, err
    }

    if len(tokens) == 0 {
        return nil, fmt.Errorf("no command tokens found")
    }

    // Check if first token looks like an executable
    firstToken := tokens[0]

    // Case 1: Full path to executable (contains path separator or ends with vllm)
    if strings.Contains(firstToken, string(filepath.Separator)) ||
        strings.HasSuffix(filepath.Base(firstToken), "vllm") {
        // Check if second token is "serve"
        if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
            return tokens[2:], nil // Return everything except executable and serve
        }
        return tokens[1:], nil // Return everything except the executable
    }

    // Case 2: Just "vllm" command
    if strings.ToLower(firstToken) == "vllm" {
        // Check if second token is "serve"
        if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
            return tokens[2:], nil // Return everything except vllm and serve
        }
        return tokens[1:], nil // Return everything except vllm
    }

    // Case 3: Just "serve" command
    if strings.ToLower(firstToken) == "serve" {
        return tokens[1:], nil // Return everything except serve
    }

    // Case 4: Arguments only (starts with a flag)
    if strings.HasPrefix(firstToken, "-") {
        return tokens, nil // Return all tokens as arguments
    }

    // Case 5: Unknown format - might be a different executable name
    // Be permissive and assume it's the executable
    if len(tokens) > 1 && strings.ToLower(tokens[1]) == "serve" {
        return tokens[2:], nil // Return everything except executable and serve
    }
    return tokens[1:], nil
}

// splitCommandTokens splits a command string into tokens, respecting quotes
func splitCommandTokens(command string) ([]string, error) {
    var tokens []string
    var current strings.Builder
    inQuotes := false
    quoteChar := byte(0)
    escaped := false

    for i := 0; i < len(command); i++ {
        c := command[i]

        if escaped {
            current.WriteByte(c)
            escaped = false
            continue
        }

        if c == '\\' {
            escaped = true
            current.WriteByte(c)
            continue
        }

        if !inQuotes && (c == '"' || c == '\'') {
            inQuotes = true
            quoteChar = c
            current.WriteByte(c)
        } else if inQuotes && c == quoteChar {
            inQuotes = false
            quoteChar = 0
            current.WriteByte(c)
        } else if !inQuotes && (c == ' ' || c == '\t') {
            if current.Len() > 0 {
                tokens = append(tokens, current.String())
                current.Reset()
            }
        } else {
            current.WriteByte(c)
        }
    }

    if inQuotes {
        return nil, errors.New("unterminated quoted string")
    }

    if current.Len() > 0 {
        tokens = append(tokens, current.String())
    }

    return tokens, nil
}

// isFlag determines if a string is a command line flag or a value
// Handles the special case where negative numbers should be treated as values, not flags
func isFlag(arg string) bool {
    if !strings.HasPrefix(arg, "-") {
        return false
    }

    // Special case: if it's a negative number, treat it as a value
    if _, err := strconv.ParseFloat(arg, 64); err == nil {
        return false
    }

    return true
}
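As a rough usage sketch (not part of this commit), the parser above can be exercised as follows; the import path comes from this repository, while the model name and flag values are arbitrary examples:

```go
package main

import (
    "fmt"

    "llamactl/pkg/backends/vllm"
)

func main() {
    // A multiline command with a backslash continuation, one of the formats
    // ParseVllmCommand accepts.
    cmd := `vllm serve --model test-model \
        --tensor-parallel-size 2 --gpu-memory-utilization 0.5 --enable-log-outputs`

    opts, err := vllm.ParseVllmCommand(cmd)
    if err != nil {
        fmt.Println("parse error:", err)
        return
    }

    // Flags are converted into typed fields on VllmServerOptions.
    fmt.Println(opts.Model)                // test-model
    fmt.Println(opts.TensorParallelSize)   // 2
    fmt.Println(opts.GPUMemoryUtilization) // 0.5
    fmt.Println(opts.EnableLogOutputs)     // true
}
```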
pkg/backends/vllm/parser_test.go (new file, 83 lines)
@@ -0,0 +1,83 @@
package vllm

import (
    "testing"
)

func TestParseVllmCommand(t *testing.T) {
    tests := []struct {
        name      string
        command   string
        expectErr bool
    }{
        {
            name:      "basic vllm serve command",
            command:   "vllm serve --model microsoft/DialoGPT-medium",
            expectErr: false,
        },
        {
            name:      "serve only command",
            command:   "serve --model microsoft/DialoGPT-medium",
            expectErr: false,
        },
        {
            name:      "args only",
            command:   "--model microsoft/DialoGPT-medium --tensor-parallel-size 2",
            expectErr: false,
        },
        {
            name:      "empty command",
            command:   "",
            expectErr: true,
        },
        {
            name:      "unterminated quote",
            command:   `vllm serve --model "unterminated`,
            expectErr: true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            result, err := ParseVllmCommand(tt.command)

            if tt.expectErr {
                if err == nil {
                    t.Errorf("expected error but got none")
                }
                return
            }

            if err != nil {
                t.Errorf("unexpected error: %v", err)
                return
            }

            if result == nil {
                t.Errorf("expected result but got nil")
            }
        })
    }
}

func TestParseVllmCommandValues(t *testing.T) {
    command := "vllm serve --model test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs"
    result, err := ParseVllmCommand(command)

    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    if result.Model != "test-model" {
        t.Errorf("expected model 'test-model', got '%s'", result.Model)
    }
    if result.TensorParallelSize != 4 {
        t.Errorf("expected tensor_parallel_size 4, got %d", result.TensorParallelSize)
    }
    if result.GPUMemoryUtilization != 0.8 {
        t.Errorf("expected gpu_memory_utilization 0.8, got %f", result.GPUMemoryUtilization)
    }
    if !result.EnableLogOutputs {
        t.Errorf("expected enable_log_outputs true, got %v", result.EnableLogOutputs)
    }
}
pkg/backends/vllm/vllm.go (new file, 439 lines)
@@ -0,0 +1,439 @@
package vllm

import (
    "encoding/json"
    "reflect"
    "strconv"
    "strings"
)

type VllmServerOptions struct {
    // Basic connection options (auto-assigned by llamactl)
    Host string `json:"host,omitempty"`
    Port int    `json:"port,omitempty"`

    // Model and engine configuration
    Model                      string  `json:"model,omitempty"`
    Tokenizer                  string  `json:"tokenizer,omitempty"`
    SkipTokenizerInit          bool    `json:"skip_tokenizer_init,omitempty"`
    Revision                   string  `json:"revision,omitempty"`
    CodeRevision               string  `json:"code_revision,omitempty"`
    TokenizerRevision          string  `json:"tokenizer_revision,omitempty"`
    TokenizerMode              string  `json:"tokenizer_mode,omitempty"`
    TrustRemoteCode            bool    `json:"trust_remote_code,omitempty"`
    DownloadDir                string  `json:"download_dir,omitempty"`
    LoadFormat                 string  `json:"load_format,omitempty"`
    ConfigFormat               string  `json:"config_format,omitempty"`
    Dtype                      string  `json:"dtype,omitempty"`
    KVCacheDtype               string  `json:"kv_cache_dtype,omitempty"`
    QuantizationParamPath      string  `json:"quantization_param_path,omitempty"`
    Seed                       int     `json:"seed,omitempty"`
    MaxModelLen                int     `json:"max_model_len,omitempty"`
    GuidedDecodingBackend      string  `json:"guided_decoding_backend,omitempty"`
    DistributedExecutorBackend string  `json:"distributed_executor_backend,omitempty"`
    WorkerUseRay               bool    `json:"worker_use_ray,omitempty"`
    RayWorkersUseNSight        bool    `json:"ray_workers_use_nsight,omitempty"`

    // Performance and serving configuration
    BlockSize                int     `json:"block_size,omitempty"`
    EnablePrefixCaching      bool    `json:"enable_prefix_caching,omitempty"`
    DisableSlidingWindow     bool    `json:"disable_sliding_window,omitempty"`
    UseV2BlockManager        bool    `json:"use_v2_block_manager,omitempty"`
    NumLookaheadSlots        int     `json:"num_lookahead_slots,omitempty"`
    SwapSpace                int     `json:"swap_space,omitempty"`
    CPUOffloadGB             int     `json:"cpu_offload_gb,omitempty"`
    GPUMemoryUtilization     float64 `json:"gpu_memory_utilization,omitempty"`
    NumGPUBlocksOverride     int     `json:"num_gpu_blocks_override,omitempty"`
    MaxNumBatchedTokens      int     `json:"max_num_batched_tokens,omitempty"`
    MaxNumSeqs               int     `json:"max_num_seqs,omitempty"`
    MaxLogprobs              int     `json:"max_logprobs,omitempty"`
    DisableLogStats          bool    `json:"disable_log_stats,omitempty"`
    Quantization             string  `json:"quantization,omitempty"`
    RopeScaling              string  `json:"rope_scaling,omitempty"`
    RopeTheta                float64 `json:"rope_theta,omitempty"`
    EnforceEager             bool    `json:"enforce_eager,omitempty"`
    MaxContextLenToCapture   int     `json:"max_context_len_to_capture,omitempty"`
    MaxSeqLenToCapture       int     `json:"max_seq_len_to_capture,omitempty"`
    DisableCustomAllReduce   bool    `json:"disable_custom_all_reduce,omitempty"`
    TokenizerPoolSize        int     `json:"tokenizer_pool_size,omitempty"`
    TokenizerPoolType        string  `json:"tokenizer_pool_type,omitempty"`
    TokenizerPoolExtraConfig string  `json:"tokenizer_pool_extra_config,omitempty"`
    EnableLoraBias           bool    `json:"enable_lora_bias,omitempty"`
    LoraExtraVocabSize       int     `json:"lora_extra_vocab_size,omitempty"`
    LoraRank                 int     `json:"lora_rank,omitempty"`
    PromptLookbackDistance   int     `json:"prompt_lookback_distance,omitempty"`
    PreemptionMode           string  `json:"preemption_mode,omitempty"`

    // Distributed and parallel processing
    TensorParallelSize            int     `json:"tensor_parallel_size,omitempty"`
    PipelineParallelSize          int     `json:"pipeline_parallel_size,omitempty"`
    MaxParallelLoadingWorkers     int     `json:"max_parallel_loading_workers,omitempty"`
    DisableAsyncOutputProc        bool    `json:"disable_async_output_proc,omitempty"`
    WorkerClass                   string  `json:"worker_class,omitempty"`
    EnabledLoraModules            string  `json:"enabled_lora_modules,omitempty"`
    MaxLoraRank                   int     `json:"max_lora_rank,omitempty"`
    FullyShardedLoras             bool    `json:"fully_sharded_loras,omitempty"`
    LoraModules                   string  `json:"lora_modules,omitempty"`
    PromptAdapters                string  `json:"prompt_adapters,omitempty"`
    MaxPromptAdapterToken         int     `json:"max_prompt_adapter_token,omitempty"`
    Device                        string  `json:"device,omitempty"`
    SchedulerDelay                float64 `json:"scheduler_delay,omitempty"`
    EnableChunkedPrefill          bool    `json:"enable_chunked_prefill,omitempty"`
    SpeculativeModel              string  `json:"speculative_model,omitempty"`
    SpeculativeModelQuantization  string  `json:"speculative_model_quantization,omitempty"`
    SpeculativeRevision           string  `json:"speculative_revision,omitempty"`
    SpeculativeMaxModelLen        int     `json:"speculative_max_model_len,omitempty"`
    SpeculativeDisableByBatchSize int     `json:"speculative_disable_by_batch_size,omitempty"`
    NgptSpeculativeLength         int     `json:"ngpt_speculative_length,omitempty"`
    SpeculativeDisableMqa         bool    `json:"speculative_disable_mqa,omitempty"`
    ModelLoaderExtraConfig        string  `json:"model_loader_extra_config,omitempty"`
    IgnorePatterns                string  `json:"ignore_patterns,omitempty"`
    PreloadedLoraModules          string  `json:"preloaded_lora_modules,omitempty"`

    // OpenAI server specific options
    UDS                            string   `json:"uds,omitempty"`
    UvicornLogLevel                string   `json:"uvicorn_log_level,omitempty"`
    ResponseRole                   string   `json:"response_role,omitempty"`
    SSLKeyfile                     string   `json:"ssl_keyfile,omitempty"`
    SSLCertfile                    string   `json:"ssl_certfile,omitempty"`
    SSLCACerts                     string   `json:"ssl_ca_certs,omitempty"`
    SSLCertReqs                    int      `json:"ssl_cert_reqs,omitempty"`
    RootPath                       string   `json:"root_path,omitempty"`
    Middleware                     []string `json:"middleware,omitempty"`
    ReturnTokensAsTokenIDS         bool     `json:"return_tokens_as_token_ids,omitempty"`
    DisableFrontendMultiprocessing bool     `json:"disable_frontend_multiprocessing,omitempty"`
    EnableAutoToolChoice           bool     `json:"enable_auto_tool_choice,omitempty"`
    ToolCallParser                 string   `json:"tool_call_parser,omitempty"`
    ToolServer                     string   `json:"tool_server,omitempty"`
    ChatTemplate                   string   `json:"chat_template,omitempty"`
    ChatTemplateContentFormat      string   `json:"chat_template_content_format,omitempty"`
    AllowCredentials               bool     `json:"allow_credentials,omitempty"`
    AllowedOrigins                 []string `json:"allowed_origins,omitempty"`
    AllowedMethods                 []string `json:"allowed_methods,omitempty"`
    AllowedHeaders                 []string `json:"allowed_headers,omitempty"`
    APIKey                         []string `json:"api_key,omitempty"`
    EnableLogOutputs               bool     `json:"enable_log_outputs,omitempty"`
    EnableTokenUsage               bool     `json:"enable_token_usage,omitempty"`
    EnableAsyncEngineDebug         bool     `json:"enable_async_engine_debug,omitempty"`
    EngineUseRay                   bool     `json:"engine_use_ray,omitempty"`
    DisableLogRequests             bool     `json:"disable_log_requests,omitempty"`
    MaxLogLen                      int      `json:"max_log_len,omitempty"`

    // Additional engine configuration
    Task                      string `json:"task,omitempty"`
    MultiModalConfig          string `json:"multi_modal_config,omitempty"`
    LimitMmPerPrompt          string `json:"limit_mm_per_prompt,omitempty"`
    EnableSleepMode           bool   `json:"enable_sleep_mode,omitempty"`
    EnableChunkingRequest     bool   `json:"enable_chunking_request,omitempty"`
    CompilationConfig         string `json:"compilation_config,omitempty"`
    DisableSlidingWindowMask  bool   `json:"disable_sliding_window_mask,omitempty"`
    EnableTRTLLMEngineLatency bool   `json:"enable_trtllm_engine_latency,omitempty"`
    OverridePoolingConfig     string `json:"override_pooling_config,omitempty"`
    OverrideNeuronConfig      string `json:"override_neuron_config,omitempty"`
    OverrideKVCacheALIGNSize  int    `json:"override_kv_cache_align_size,omitempty"`
}

// NewVllmServerOptions creates a new VllmServerOptions with defaults
func NewVllmServerOptions() *VllmServerOptions {
    return &VllmServerOptions{
        Host:                 "127.0.0.1",
        Port:                 8000,
        TensorParallelSize:   1,
        PipelineParallelSize: 1,
        GPUMemoryUtilization: 0.9,
        BlockSize:            16,
        SwapSpace:            4,
        UvicornLogLevel:      "info",
        ResponseRole:         "assistant",
        TokenizerMode:        "auto",
        TrustRemoteCode:      false,
        EnablePrefixCaching:  false,
        EnforceEager:         false,
        DisableLogStats:      false,
        DisableLogRequests:   false,
        MaxLogprobs:          20,
        EnableLogOutputs:     false,
        EnableTokenUsage:     false,
        AllowCredentials:     false,
        AllowedOrigins:       []string{"*"},
        AllowedMethods:       []string{"*"},
        AllowedHeaders:       []string{"*"},
    }
}

// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names
func (o *VllmServerOptions) UnmarshalJSON(data []byte) error {
    // First unmarshal into a map to handle multiple field names
    var raw map[string]any
    if err := json.Unmarshal(data, &raw); err != nil {
        return err
    }

    // Create a temporary struct for standard unmarshaling
    type tempOptions VllmServerOptions
    temp := tempOptions{}

    // Standard unmarshal first
    if err := json.Unmarshal(data, &temp); err != nil {
        return err
    }

    // Copy to our struct
    *o = VllmServerOptions(temp)

    // Handle alternative field names (CLI format with dashes)
    fieldMappings := map[string]string{
        // Basic options
        "tensor-parallel-size":              "tensor_parallel_size",
        "pipeline-parallel-size":            "pipeline_parallel_size",
        "max-parallel-loading-workers":      "max_parallel_loading_workers",
        "disable-async-output-proc":         "disable_async_output_proc",
        "worker-class":                      "worker_class",
        "enabled-lora-modules":              "enabled_lora_modules",
        "max-lora-rank":                     "max_lora_rank",
        "fully-sharded-loras":               "fully_sharded_loras",
        "lora-modules":                      "lora_modules",
        "prompt-adapters":                   "prompt_adapters",
        "max-prompt-adapter-token":          "max_prompt_adapter_token",
        "scheduler-delay":                   "scheduler_delay",
        "enable-chunked-prefill":            "enable_chunked_prefill",
        "speculative-model":                 "speculative_model",
        "speculative-model-quantization":    "speculative_model_quantization",
        "speculative-revision":              "speculative_revision",
        "speculative-max-model-len":         "speculative_max_model_len",
        "speculative-disable-by-batch-size": "speculative_disable_by_batch_size",
        "ngpt-speculative-length":           "ngpt_speculative_length",
        "speculative-disable-mqa":           "speculative_disable_mqa",
        "model-loader-extra-config":         "model_loader_extra_config",
        "ignore-patterns":                   "ignore_patterns",
        "preloaded-lora-modules":            "preloaded_lora_modules",

        // Model configuration
        "skip-tokenizer-init":          "skip_tokenizer_init",
        "code-revision":                "code_revision",
        "tokenizer-revision":           "tokenizer_revision",
        "tokenizer-mode":               "tokenizer_mode",
        "trust-remote-code":            "trust_remote_code",
        "download-dir":                 "download_dir",
        "load-format":                  "load_format",
        "config-format":                "config_format",
        "kv-cache-dtype":               "kv_cache_dtype",
        "quantization-param-path":      "quantization_param_path",
        "max-model-len":                "max_model_len",
        "guided-decoding-backend":      "guided_decoding_backend",
        "distributed-executor-backend": "distributed_executor_backend",
        "worker-use-ray":               "worker_use_ray",
        "ray-workers-use-nsight":       "ray_workers_use_nsight",

        // Performance configuration
        "block-size":                  "block_size",
        "enable-prefix-caching":       "enable_prefix_caching",
        "disable-sliding-window":      "disable_sliding_window",
        "use-v2-block-manager":        "use_v2_block_manager",
        "num-lookahead-slots":         "num_lookahead_slots",
        "swap-space":                  "swap_space",
        "cpu-offload-gb":              "cpu_offload_gb",
        "gpu-memory-utilization":      "gpu_memory_utilization",
        "num-gpu-blocks-override":     "num_gpu_blocks_override",
        "max-num-batched-tokens":      "max_num_batched_tokens",
        "max-num-seqs":                "max_num_seqs",
        "max-logprobs":                "max_logprobs",
        "disable-log-stats":           "disable_log_stats",
        "rope-scaling":                "rope_scaling",
        "rope-theta":                  "rope_theta",
        "enforce-eager":               "enforce_eager",
        "max-context-len-to-capture":  "max_context_len_to_capture",
        "max-seq-len-to-capture":      "max_seq_len_to_capture",
        "disable-custom-all-reduce":   "disable_custom_all_reduce",
        "tokenizer-pool-size":         "tokenizer_pool_size",
        "tokenizer-pool-type":         "tokenizer_pool_type",
        "tokenizer-pool-extra-config": "tokenizer_pool_extra_config",
        "enable-lora-bias":            "enable_lora_bias",
        "lora-extra-vocab-size":       "lora_extra_vocab_size",
        "lora-rank":                   "lora_rank",
        "prompt-lookback-distance":    "prompt_lookback_distance",
        "preemption-mode":             "preemption_mode",

        // Server configuration
        "uvicorn-log-level":                "uvicorn_log_level",
        "response-role":                    "response_role",
        "ssl-keyfile":                      "ssl_keyfile",
        "ssl-certfile":                     "ssl_certfile",
        "ssl-ca-certs":                     "ssl_ca_certs",
        "ssl-cert-reqs":                    "ssl_cert_reqs",
        "root-path":                        "root_path",
        "return-tokens-as-token-ids":       "return_tokens_as_token_ids",
        "disable-frontend-multiprocessing": "disable_frontend_multiprocessing",
        "enable-auto-tool-choice":          "enable_auto_tool_choice",
        "tool-call-parser":                 "tool_call_parser",
        "tool-server":                      "tool_server",
        "chat-template":                    "chat_template",
        "chat-template-content-format":     "chat_template_content_format",
        "allow-credentials":                "allow_credentials",
        "allowed-origins":                  "allowed_origins",
        "allowed-methods":                  "allowed_methods",
        "allowed-headers":                  "allowed_headers",
        "api-key":                          "api_key",
        "enable-log-outputs":               "enable_log_outputs",
        "enable-token-usage":               "enable_token_usage",
        "enable-async-engine-debug":        "enable_async_engine_debug",
        "engine-use-ray":                   "engine_use_ray",
        "disable-log-requests":             "disable_log_requests",
        "max-log-len":                      "max_log_len",

        // Additional options
        "multi-modal-config":           "multi_modal_config",
        "limit-mm-per-prompt":          "limit_mm_per_prompt",
        "enable-sleep-mode":            "enable_sleep_mode",
        "enable-chunking-request":      "enable_chunking_request",
        "compilation-config":           "compilation_config",
        "disable-sliding-window-mask":  "disable_sliding_window_mask",
        "enable-trtllm-engine-latency": "enable_trtllm_engine_latency",
        "override-pooling-config":      "override_pooling_config",
        "override-neuron-config":       "override_neuron_config",
        "override-kv-cache-align-size": "override_kv_cache_align_size",
    }

    // Process alternative field names
    for altName, canonicalName := range fieldMappings {
        if value, exists := raw[altName]; exists {
            // Use reflection to set the field value
            v := reflect.ValueOf(o).Elem()
            field := v.FieldByNameFunc(func(fieldName string) bool {
                field, _ := v.Type().FieldByName(fieldName)
                jsonTag := field.Tag.Get("json")
                return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName
            })

            if field.IsValid() && field.CanSet() {
                switch field.Kind() {
                case reflect.Int:
                    if intVal, ok := value.(float64); ok {
                        field.SetInt(int64(intVal))
                    } else if strVal, ok := value.(string); ok {
                        if intVal, err := strconv.Atoi(strVal); err == nil {
                            field.SetInt(int64(intVal))
                        }
                    }
                case reflect.Float64:
                    if floatVal, ok := value.(float64); ok {
                        field.SetFloat(floatVal)
                    } else if strVal, ok := value.(string); ok {
                        if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil {
                            field.SetFloat(floatVal)
                        }
                    }
                case reflect.String:
                    if strVal, ok := value.(string); ok {
                        field.SetString(strVal)
                    }
                case reflect.Bool:
                    if boolVal, ok := value.(bool); ok {
                        field.SetBool(boolVal)
                    }
                case reflect.Slice:
                    if field.Type().Elem().Kind() == reflect.String {
                        if strVal, ok := value.(string); ok {
                            // Split comma-separated values
                            values := strings.Split(strVal, ",")
                            for i, v := range values {
                                values[i] = strings.TrimSpace(v)
                            }
                            field.Set(reflect.ValueOf(values))
                        } else if slice, ok := value.([]interface{}); ok {
                            var strSlice []string
                            for _, item := range slice {
                                if str, ok := item.(string); ok {
                                    strSlice = append(strSlice, str)
                                }
                            }
                            field.Set(reflect.ValueOf(strSlice))
                        }
                    }
                }
            }
        }
    }

    return nil
}

// BuildCommandArgs converts VllmServerOptions to command line arguments
// Note: This does NOT include the "serve" subcommand, that's handled at the instance level
func (o *VllmServerOptions) BuildCommandArgs() []string {
    var args []string

    v := reflect.ValueOf(o).Elem()
    t := v.Type()

    for i := 0; i < v.NumField(); i++ {
        field := v.Field(i)
        fieldType := t.Field(i)

        // Skip unexported fields
        if !field.CanInterface() {
            continue
        }

        // Get the JSON tag to determine the flag name
        jsonTag := fieldType.Tag.Get("json")
        if jsonTag == "" || jsonTag == "-" {
            continue
        }

        // Remove ",omitempty" from the tag
        flagName := jsonTag
        if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 {
            flagName = jsonTag[:commaIndex]
        }

        // Skip host and port as they are handled by llamactl
        if flagName == "host" || flagName == "port" {
            continue
        }

        // Convert snake_case to kebab-case for CLI flags
        flagName = strings.ReplaceAll(flagName, "_", "-")

        // Add the appropriate arguments based on field type and value
        switch field.Kind() {
        case reflect.Bool:
            if field.Bool() {
                args = append(args, "--"+flagName)
            }
        case reflect.Int:
            if field.Int() != 0 {
                args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10))
            }
        case reflect.Float64:
            if field.Float() != 0 {
                args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64))
            }
        case reflect.String:
            if field.String() != "" {
                args = append(args, "--"+flagName, field.String())
            }
        case reflect.Slice:
            if field.Type().Elem().Kind() == reflect.String {
                // Handle []string fields - some are comma-separated, some use multiple flags
                if flagName == "api-key" || flagName == "allowed-origins" || flagName == "allowed-methods" || flagName == "allowed-headers" || flagName == "middleware" {
                    // Multiple flags for these
                    for j := 0; j < field.Len(); j++ {
                        args = append(args, "--"+flagName, field.Index(j).String())
                    }
                } else {
                    // Comma-separated for others
                    if field.Len() > 0 {
                        var values []string
                        for j := 0; j < field.Len(); j++ {
                            values = append(values, field.Index(j).String())
                        }
                        args = append(args, "--"+flagName, strings.Join(values, ","))
                    }
                }
            }
        }
    }

    return args
}
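For illustration only, a small sketch of how the constructor defaults interact with `BuildCommandArgs`; the model name is a placeholder, and the expectations in the comments follow the field handling shown above:

```go
package main

import (
    "fmt"

    "llamactl/pkg/backends/vllm"
)

func main() {
    // Start from the constructor defaults and set a model.
    opts := vllm.NewVllmServerOptions()
    opts.Model = "test-model" // placeholder model name

    args := opts.BuildCommandArgs()
    fmt.Println(args)

    // Based on BuildCommandArgs:
    //  - "host" and "port" are skipped (llamactl assigns them),
    //  - zero values are omitted, while non-zero defaults such as
    //    --gpu-memory-utilization 0.9 and --block-size 16 are emitted,
    //  - []string defaults such as AllowedOrigins ["*"] are emitted as one
    //    flag per element (--allowed-origins "*").
}
```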
pkg/backends/vllm/vllm_test.go (new file, 106 lines)
@@ -0,0 +1,106 @@
package vllm_test

import (
    "encoding/json"
    "llamactl/pkg/backends/vllm"
    "slices"
    "testing"
)

func TestBuildCommandArgs(t *testing.T) {
    options := vllm.VllmServerOptions{
        Model:                "microsoft/DialoGPT-medium",
        Port:                 8080,        // should be excluded
        Host:                 "localhost", // should be excluded
        TensorParallelSize:   2,
        GPUMemoryUtilization: 0.8,
        EnableLogOutputs:     true,
        APIKey:               []string{"key1", "key2"},
    }

    args := options.BuildCommandArgs()

    // Check core functionality
    if !containsFlagWithValue(args, "--model", "microsoft/DialoGPT-medium") {
        t.Errorf("Expected --model microsoft/DialoGPT-medium not found in %v", args)
    }
    if !containsFlagWithValue(args, "--tensor-parallel-size", "2") {
        t.Errorf("Expected --tensor-parallel-size 2 not found in %v", args)
    }
    if !contains(args, "--enable-log-outputs") {
        t.Errorf("Expected --enable-log-outputs not found in %v", args)
    }

    // Host and port should NOT be in the arguments (handled by llamactl)
    if contains(args, "--host") || contains(args, "--port") {
        t.Errorf("Host and port should not be in command args, found in %v", args)
    }

    // Check array handling (multiple flags)
    apiKeyCount := 0
    for i := range args {
        if args[i] == "--api-key" {
            apiKeyCount++
        }
    }
    if apiKeyCount != 2 {
        t.Errorf("Expected 2 --api-key flags, got %d", apiKeyCount)
    }
}

func TestUnmarshalJSON(t *testing.T) {
    // Test both underscore and dash formats
    jsonData := `{
        "model": "test-model",
        "tensor_parallel_size": 4,
        "gpu-memory-utilization": 0.9,
        "enable-log-outputs": true
    }`

    var options vllm.VllmServerOptions
    err := json.Unmarshal([]byte(jsonData), &options)
    if err != nil {
        t.Fatalf("Unmarshal failed: %v", err)
    }

    if options.Model != "test-model" {
        t.Errorf("Expected model 'test-model', got %q", options.Model)
    }
    if options.TensorParallelSize != 4 {
        t.Errorf("Expected tensor_parallel_size 4, got %d", options.TensorParallelSize)
    }
    if options.GPUMemoryUtilization != 0.9 {
        t.Errorf("Expected gpu_memory_utilization 0.9, got %f", options.GPUMemoryUtilization)
    }
    if !options.EnableLogOutputs {
        t.Errorf("Expected enable_log_outputs true, got %v", options.EnableLogOutputs)
    }
}

func TestNewVllmServerOptions(t *testing.T) {
    options := vllm.NewVllmServerOptions()

    if options == nil {
        t.Fatal("NewVllmServerOptions returned nil")
    }
    if options.Host != "127.0.0.1" {
        t.Errorf("Expected default host '127.0.0.1', got %q", options.Host)
    }
    if options.Port != 8000 {
        t.Errorf("Expected default port 8000, got %d", options.Port)
    }
}

// Helper functions
func contains(slice []string, item string) bool {
    return slices.Contains(slice, item)
}

func containsFlagWithValue(args []string, flag, value string) bool {
    for i, arg := range args {
        if arg == flag && i+1 < len(args) && args[i+1] == value {
            return true
        }
    }
    return false
}
pkg/config/config.go
@@ -17,6 +17,9 @@ type BackendConfig struct {

 	// Path to mlx_lm executable (MLX-LM backend)
 	MLXLMExecutable string `yaml:"mlx_lm_executable"`
+
+	// Path to vllm executable (vLLM backend)
+	VllmExecutable string `yaml:"vllm_executable"`
 }

 // AppConfig represents the configuration for llamactl
@@ -122,6 +125,7 @@ func LoadConfig(configPath string) (AppConfig, error) {
 		Backends: BackendConfig{
 			LlamaExecutable: "llama-server",
 			MLXLMExecutable: "mlx_lm.server",
+			VllmExecutable:  "vllm",
 		},
 		Instances: InstancesConfig{
 			PortRange: [2]int{8000, 9000},
@@ -246,6 +250,9 @@ func loadEnvVars(cfg *AppConfig) {
 	if mlxLMExec := os.Getenv("LLAMACTL_MLX_LM_EXECUTABLE"); mlxLMExec != "" {
 		cfg.Backends.MLXLMExecutable = mlxLMExec
 	}
+	if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" {
+		cfg.Backends.VllmExecutable = vllmExec
+	}
 	if autoRestart := os.Getenv("LLAMACTL_DEFAULT_AUTO_RESTART"); autoRestart != "" {
 		if b, err := strconv.ParseBool(autoRestart); err == nil {
 			cfg.Instances.DefaultAutoRestart = b
pkg/instance/lifecycle.go
@@ -52,6 +52,8 @@ func (i *Process) Start() error {
 		executable = i.globalBackendSettings.LlamaExecutable
 	case backends.BackendTypeMlxLm:
 		executable = i.globalBackendSettings.MLXLMExecutable
+	case backends.BackendTypeVllm:
+		executable = i.globalBackendSettings.VllmExecutable
 	default:
 		return fmt.Errorf("unsupported backend type: %s", i.options.BackendType)
 	}
pkg/instance/options.go
@@ -6,6 +6,7 @@ import (
 	"llamactl/pkg/backends"
 	"llamactl/pkg/backends/llamacpp"
 	"llamactl/pkg/backends/mlx"
+	"llamactl/pkg/backends/vllm"
 	"llamactl/pkg/config"
 	"log"
 )
@@ -26,6 +27,7 @@ type CreateInstanceOptions struct {
 	// Backend-specific options
 	LlamaServerOptions *llamacpp.LlamaServerOptions `json:"-"`
 	MlxServerOptions   *mlx.MlxServerOptions        `json:"-"`
+	VllmServerOptions  *vllm.VllmServerOptions      `json:"-"`
 }

 // UnmarshalJSON implements custom JSON unmarshaling for CreateInstanceOptions
@@ -63,12 +65,24 @@ func (c *CreateInstanceOptions) UnmarshalJSON(data []byte) error {
 			if err != nil {
 				return fmt.Errorf("failed to marshal backend options: %w", err)
 			}

 			c.MlxServerOptions = &mlx.MlxServerOptions{}
 			if err := json.Unmarshal(optionsData, c.MlxServerOptions); err != nil {
 				return fmt.Errorf("failed to unmarshal MLX options: %w", err)
 			}
 		}
+	case backends.BackendTypeVllm:
+		if c.BackendOptions != nil {
+			optionsData, err := json.Marshal(c.BackendOptions)
+			if err != nil {
+				return fmt.Errorf("failed to marshal backend options: %w", err)
+			}
+
+			c.VllmServerOptions = &vllm.VllmServerOptions{}
+			if err := json.Unmarshal(optionsData, c.VllmServerOptions); err != nil {
+				return fmt.Errorf("failed to unmarshal vLLM options: %w", err)
+			}
+		}
 	default:
 		return fmt.Errorf("unknown backend type: %s", c.BackendType)
 	}
@@ -114,6 +128,20 @@ func (c *CreateInstanceOptions) MarshalJSON() ([]byte, error) {
 				return nil, fmt.Errorf("failed to unmarshal to map: %w", err)
 			}

+			aux.BackendOptions = backendOpts
+		}
+	case backends.BackendTypeVllm:
+		if c.VllmServerOptions != nil {
+			data, err := json.Marshal(c.VllmServerOptions)
+			if err != nil {
+				return nil, fmt.Errorf("failed to marshal vLLM server options: %w", err)
+			}
+
+			var backendOpts map[string]any
+			if err := json.Unmarshal(data, &backendOpts); err != nil {
+				return nil, fmt.Errorf("failed to unmarshal to map: %w", err)
+			}
+
 			aux.BackendOptions = backendOpts
 		}
 	}
@@ -171,6 +199,13 @@ func (c *CreateInstanceOptions) BuildCommandArgs() []string {
 		if c.MlxServerOptions != nil {
 			return c.MlxServerOptions.BuildCommandArgs()
 		}
+	case backends.BackendTypeVllm:
+		if c.VllmServerOptions != nil {
+			// Prepend "serve" as first argument
+			args := []string{"serve"}
+			args = append(args, c.VllmServerOptions.BuildCommandArgs()...)
+			return args
+		}
 	}
 	return []string{}
 }
@@ -8,6 +8,7 @@ import (
 	"llamactl/pkg/backends"
 	"llamactl/pkg/backends/llamacpp"
 	"llamactl/pkg/backends/mlx"
+	"llamactl/pkg/backends/vllm"
 	"llamactl/pkg/config"
 	"llamactl/pkg/instance"
 	"llamactl/pkg/manager"
@@ -732,7 +733,60 @@ func (h *Handler) ParseMlxCommand() http.HandlerFunc {
 			BackendType:      backendType,
 			MlxServerOptions: mlxOptions,
 		}

+		w.Header().Set("Content-Type", "application/json")
+		if err := json.NewEncoder(w).Encode(options); err != nil {
+			writeError(w, http.StatusInternalServerError, "encode_error", err.Error())
+		}
+	}
+}
+
+// ParseVllmCommand godoc
+// @Summary Parse vllm serve command
+// @Description Parses a vLLM serve command string into instance options
+// @Tags backends
+// @Security ApiKeyAuth
+// @Accept json
+// @Produce json
+// @Param request body ParseCommandRequest true "Command to parse"
+// @Success 200 {object} instance.CreateInstanceOptions "Parsed options"
+// @Failure 400 {object} map[string]string "Invalid request or command"
+// @Router /backends/vllm/parse-command [post]
+func (h *Handler) ParseVllmCommand() http.HandlerFunc {
+	type errorResponse struct {
+		Error   string `json:"error"`
+		Details string `json:"details,omitempty"`
+	}
+	writeError := func(w http.ResponseWriter, status int, code, details string) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(status)
+		_ = json.NewEncoder(w).Encode(errorResponse{Error: code, Details: details})
+	}
+	return func(w http.ResponseWriter, r *http.Request) {
+		var req ParseCommandRequest
+		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+			writeError(w, http.StatusBadRequest, "invalid_request", "Invalid JSON body")
+			return
+		}
+
+		if strings.TrimSpace(req.Command) == "" {
+			writeError(w, http.StatusBadRequest, "invalid_command", "Command cannot be empty")
+			return
+		}
+
+		vllmOptions, err := vllm.ParseVllmCommand(req.Command)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, "parse_error", err.Error())
+			return
+		}
+
+		backendType := backends.BackendTypeVllm
+
+		options := &instance.CreateInstanceOptions{
+			BackendType:       backendType,
+			VllmServerOptions: vllmOptions,
+		}
+
 		w.Header().Set("Content-Type", "application/json")
 		if err := json.NewEncoder(w).Encode(options); err != nil {
 			writeError(w, http.StatusInternalServerError, "encode_error", err.Error())
@@ -58,6 +58,9 @@ func SetupRouter(handler *Handler) *chi.Mux {
 		r.Route("/mlx", func(r chi.Router) {
 			r.Post("/parse-command", handler.ParseMlxCommand())
 		})
+		r.Route("/vllm", func(r chi.Router) {
+			r.Post("/parse-command", handler.ParseVllmCommand())
+		})
 	})

 	// Instance management endpoints
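A hedged sketch of calling the new endpoint from a client. Only the `/backends/vllm/parse-command` suffix is taken from the route and Swagger annotation above; the `command` JSON field name, the `/api/v1` prefix, and the listen address are assumptions that should be checked against the actual `ParseCommandRequest` type and router mount point:

```go
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

func main() {
    // "command" field name is an assumption based on ParseCommandRequest usage.
    body, _ := json.Marshal(map[string]string{
        "command": "vllm serve --model test-model --tensor-parallel-size 2",
    })

    // Base URL and /api/v1 prefix are assumed example values.
    resp, err := http.Post(
        "http://localhost:8080/api/v1/backends/vllm/parse-command",
        "application/json",
        bytes.NewReader(body),
    )
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var parsed map[string]any
    _ = json.NewDecoder(resp.Body).Decode(&parsed)
    fmt.Println(resp.StatusCode, parsed)
}
```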
@@ -46,6 +46,8 @@ func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error {
 		return validateLlamaCppOptions(options)
 	case backends.BackendTypeMlxLm:
 		return validateMlxOptions(options)
+	case backends.BackendTypeVllm:
+		return validateVllmOptions(options)
 	default:
 		return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType))
 	}
@@ -88,6 +90,25 @@ func validateMlxOptions(options *instance.CreateInstanceOptions) error {
 	return nil
 }

+// validateVllmOptions validates vLLM backend specific options
+func validateVllmOptions(options *instance.CreateInstanceOptions) error {
+	if options.VllmServerOptions == nil {
+		return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend"))
+	}
+
+	// Use reflection to check all string fields for injection patterns
+	if err := validateStructStrings(options.VllmServerOptions, ""); err != nil {
+		return err
+	}
+
+	// Basic network validation for port
+	if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 {
+		return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port))
+	}
+
+	return nil
+}
+
 // validateStructStrings recursively validates all string fields in a struct
 func validateStructStrings(v any, fieldPath string) error {
 	val := reflect.ValueOf(v)
vllm_backend_spec.md (new file, 440 lines)
@@ -0,0 +1,440 @@
# vLLM Backend Implementation Specification

## Overview

This specification outlines the implementation of vLLM backend support for llamactl, following the existing patterns established by the llama.cpp and MLX backends.

## 1. Backend Configuration

### Basic Details

- **Backend Type**: `vllm`
- **Executable**: `vllm` (configured via `VllmExecutable`)
- **Subcommand**: `serve` (automatically prepended to arguments)
- **Default Host/Port**: Auto-assigned by llamactl
- **Health Check**: Uses `/health` endpoint (returns HTTP 200 with no content)
- **API Compatibility**: OpenAI-compatible endpoints

### Example Command

```bash
vllm serve --enable-log-outputs --tensor-parallel-size 2 --gpu-memory-utilization 0.5 --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g
```

## 2. File Structure

Following the existing backend pattern:

```
pkg/backends/vllm/
├── vllm.go          # VllmServerOptions struct and methods
├── vllm_test.go     # Unit tests for VllmServerOptions
├── parser.go        # Command parsing logic
└── parser_test.go   # Parser tests
```

## 3. Core Implementation Files

### 3.1 `pkg/backends/vllm/vllm.go`

#### VllmServerOptions Struct

```go
type VllmServerOptions struct {
	// Basic connection options (auto-assigned by llamactl)
	Host string `json:"host,omitempty"`
	Port int    `json:"port,omitempty"`

	// Core model options
	Model string `json:"model,omitempty"`

	// Common serving options
	EnableLogOutputs     bool    `json:"enable_log_outputs,omitempty"`
	TensorParallelSize   int     `json:"tensor_parallel_size,omitempty"`
	GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"`

	// Additional parameters to be added based on vLLM CLI documentation
	// Following the same comprehensive approach as llamacpp.LlamaServerOptions
}
```

#### Required Methods

- `UnmarshalJSON()` - Custom unmarshaling with alternative field name support (dash-to-underscore conversion)
- `BuildCommandArgs()` - Convert struct to command line arguments (excluding "serve" subcommand)
- `NewVllmServerOptions()` - Constructor with vLLM defaults

#### Field Name Mapping

Support both CLI argument names (with dashes) and programmatic names (with underscores), similar to the llama.cpp implementation:

```go
fieldMappings := map[string]string{
	"enable-log-outputs":     "enable_log_outputs",
	"tensor-parallel-size":   "tensor_parallel_size",
	"gpu-memory-utilization": "gpu_memory_utilization",
	// ... other mappings
}
```

### 3.2 `pkg/backends/vllm/parser.go`

#### ParseVllmCommand Function

Following the same pattern as `llamacpp/parser.go` and `mlx/parser.go`:

```go
func ParseVllmCommand(command string) (*VllmServerOptions, error)
```

**Supported Input Formats:**

1. `vllm serve --model MODEL_NAME --other-args`
2. `/path/to/vllm serve --model MODEL_NAME`
3. `serve --model MODEL_NAME --other-args`
4. `--model MODEL_NAME --other-args` (args only)
5. Multiline commands with backslashes

**Implementation Details:**

- Handle "serve" subcommand detection and removal
- Support quoted strings and escaped characters
- Validate command structure
- Convert parsed arguments to `VllmServerOptions`
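As a quick illustration of the field name mapping described in 3.1 (a sketch assuming the `llamactl/pkg/backends/vllm` import path from this commit), both dashed and underscored JSON keys populate the same typed field:

```go
package main

import (
	"encoding/json"
	"fmt"

	"llamactl/pkg/backends/vllm"
)

func main() {
	// Dashed (CLI-style) and underscored keys are both accepted by the
	// custom UnmarshalJSON; both payloads fill TensorParallelSize.
	payloads := []string{
		`{"model": "test-model", "tensor_parallel_size": 4}`,
		`{"model": "test-model", "tensor-parallel-size": 4}`,
	}

	for _, p := range payloads {
		var opts vllm.VllmServerOptions
		if err := json.Unmarshal([]byte(p), &opts); err != nil {
			panic(err)
		}
		fmt.Println(opts.Model, opts.TensorParallelSize) // test-model 4
	}
}
```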
## 4. Backend Integration

### 4.1 Backend Type Definition

**File**: `pkg/backends/backend.go`

```go
const (
	BackendTypeLlamaCpp BackendType = "llama_cpp"
	BackendTypeMlxLm    BackendType = "mlx_lm"
	BackendTypeVllm     BackendType = "vllm" // ADD THIS
)
```

### 4.2 Configuration Integration

**File**: `pkg/config/config.go`

#### BackendConfig Update

```go
type BackendConfig struct {
	LlamaExecutable string `yaml:"llama_executable"`
	MLXLMExecutable string `yaml:"mlx_lm_executable"`
	VllmExecutable  string `yaml:"vllm_executable"` // ADD THIS
}
```
#### Default Configuration

- **Default Value**: `"vllm"`
- **Environment Variable**: `LLAMACTL_VLLM_EXECUTABLE`

A sketch of where this default could be wired in follows below.
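The default would be set wherever the default `BackendConfig` is constructed; the helper name below is hypothetical, and the llama.cpp/MLX defaults are taken from the YAML example in section 12.1:

```go
func defaultBackendConfig() BackendConfig {
	return BackendConfig{
		LlamaExecutable: "llama-server",
		MLXLMExecutable: "mlx_lm.server",
		VllmExecutable:  "vllm", // new default for the vLLM backend
	}
}
```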
#### Environment Variable Loading

Add to the `loadEnvVars()` function:

```go
if vllmExec := os.Getenv("LLAMACTL_VLLM_EXECUTABLE"); vllmExec != "" {
	cfg.Backends.VllmExecutable = vllmExec
}
```

### 4.3 Instance Options Integration

**File**: `pkg/instance/options.go`

#### CreateInstanceOptions Update

```go
type CreateInstanceOptions struct {
	// existing fields...
	VllmServerOptions *vllm.VllmServerOptions `json:"-"`
}
```
#### JSON Marshaling/Unmarshaling

Update the `UnmarshalJSON()` and `MarshalJSON()` methods to handle the vLLM backend in the same way as the existing backends; a hedged sketch of the unmarshaling side is shown below.
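A sketch of the vLLM case inside the custom `UnmarshalJSON()`; the surrounding plumbing (the `rawBackendOptions` map and alias decoding) is assumed to mirror what the llama.cpp and MLX backends already do:

```go
// Inside CreateInstanceOptions.UnmarshalJSON, after backend_type and the raw
// backend_options map have been decoded (names here are assumptions):
case backends.BackendTypeVllm:
	optsData, err := json.Marshal(rawBackendOptions)
	if err != nil {
		return err
	}
	var vllmOpts vllm.VllmServerOptions
	if err := json.Unmarshal(optsData, &vllmOpts); err != nil {
		return err
	}
	c.VllmServerOptions = &vllmOpts
```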
#### BuildCommandArgs Implementation

```go
case backends.BackendTypeVllm:
	if c.VllmServerOptions != nil {
		// Prepend "serve" as the first argument
		args := []string{"serve"}
		args = append(args, c.VllmServerOptions.BuildCommandArgs()...)
		return args
	}
```

**Key Point**: The "serve" subcommand is handled at the instance options level, keeping the `VllmServerOptions.BuildCommandArgs()` method focused only on vLLM-specific parameters.

## 5. Health Check Integration

### 5.1 Standard Health Check for vLLM

**File**: `pkg/instance/lifecycle.go`

vLLM provides a standard `/health` endpoint that returns HTTP 200 with no content, so no modifications are needed to the existing health check logic. The current `WaitForHealthy()` method will work as-is:

```go
healthURL := fmt.Sprintf("http://%s:%d/health", host, port)
```
### 5.2 Startup Time Considerations

- vLLM typically has longer startup times compared to llama.cpp
- The existing configurable timeout system should handle this adequately
- Users may need to adjust `on_demand_start_timeout` for larger models (see the example below)
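For instance, a configuration bump might look like the following; the exact key placement and unit are assumptions based on the configuration examples in section 12:

```yaml
instances:
  # Give large vLLM models more time to load before on-demand start gives up
  on_demand_start_timeout: 300  # value and unit (seconds) are illustrative
```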
## 6. Lifecycle Integration

### 6.1 Executable Selection

**File**: `pkg/instance/lifecycle.go`

Update the `Start()` method to handle the vLLM executable:

```go
switch i.options.BackendType {
case backends.BackendTypeLlamaCpp:
	executable = i.globalBackendSettings.LlamaExecutable
case backends.BackendTypeMlxLm:
	executable = i.globalBackendSettings.MLXLMExecutable
case backends.BackendTypeVllm: // ADD THIS
	executable = i.globalBackendSettings.VllmExecutable
default:
	return fmt.Errorf("unsupported backend type: %s", i.options.BackendType)
}

args := i.options.BuildCommandArgs()
i.cmd = exec.CommandContext(i.ctx, executable, args...)
```

### 6.2 Command Execution

The final executed command will be:

```bash
vllm serve --model MODEL_NAME --other-vllm-args
```

Where:

- `vllm` comes from the `VllmExecutable` configuration
- `serve` is prepended by `BuildCommandArgs()`
- The remaining args come from `VllmServerOptions.BuildCommandArgs()`
## 7. Server Handler Integration

### 7.1 New Handler Method

**File**: `pkg/server/handlers.go`

```go
// ParseVllmCommand godoc
// @Summary Parse vllm serve command
// @Description Parses a vLLM serve command string into instance options
// @Tags backends
// @Security ApiKeyAuth
// @Accept json
// @Produce json
// @Param request body ParseCommandRequest true "Command to parse"
// @Success 200 {object} instance.CreateInstanceOptions "Parsed options"
// @Failure 400 {object} map[string]string "Invalid request or command"
// @Router /backends/vllm/parse-command [post]
func (h *Handler) ParseVllmCommand() http.HandlerFunc {
	// Implementation similar to ParseMlxCommand()
	// Uses vllm.ParseVllmCommand() internally
}
```
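A fuller sketch of the handler body using only standard-library helpers; `ParseCommandRequest` is assumed to expose a single `Command` string field, and the project's own error/JSON response helpers may differ:

```go
func (h *Handler) ParseVllmCommand() http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req ParseCommandRequest
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			http.Error(w, "invalid request body", http.StatusBadRequest)
			return
		}

		// Delegate the actual parsing to the backend package.
		vllmOptions, err := vllm.ParseVllmCommand(req.Command)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		options := &instance.CreateInstanceOptions{
			BackendType:       backends.BackendTypeVllm,
			VllmServerOptions: vllmOptions,
		}

		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(options); err != nil {
			http.Error(w, "failed to encode response", http.StatusInternalServerError)
		}
	}
}
```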
### 7.2 Router Integration

**File**: `pkg/server/routes.go`

Add the vLLM route:

```go
r.Route("/backends", func(r chi.Router) {
	r.Route("/llama-cpp", func(r chi.Router) {
		r.Post("/parse-command", handler.ParseLlamaCommand())
	})
	r.Route("/mlx", func(r chi.Router) {
		r.Post("/parse-command", handler.ParseMlxCommand())
	})
	r.Route("/vllm", func(r chi.Router) { // ADD THIS
		r.Post("/parse-command", handler.ParseVllmCommand())
	})
})
```
## 8. Validation Integration

### 8.1 Instance Options Validation

**File**: `pkg/validation/validation.go`

Add the vLLM validation case:

```go
func ValidateInstanceOptions(options *instance.CreateInstanceOptions) error {
	// existing validation...

	switch options.BackendType {
	case backends.BackendTypeLlamaCpp:
		return validateLlamaCppOptions(options)
	case backends.BackendTypeMlxLm:
		return validateMlxOptions(options)
	case backends.BackendTypeVllm: // ADD THIS
		return validateVllmOptions(options)
	default:
		return ValidationError(fmt.Errorf("unsupported backend type: %s", options.BackendType))
	}
}

func validateVllmOptions(options *instance.CreateInstanceOptions) error {
	if options.VllmServerOptions == nil {
		return ValidationError(fmt.Errorf("vLLM server options cannot be nil for vLLM backend"))
	}

	// Basic validation following the same pattern as other backends
	if err := validateStructStrings(options.VllmServerOptions, ""); err != nil {
		return err
	}

	// Port validation
	if options.VllmServerOptions.Port < 0 || options.VllmServerOptions.Port > 65535 {
		return ValidationError(fmt.Errorf("invalid port range: %d", options.VllmServerOptions.Port))
	}

	return nil
}
```
## 9. Testing Strategy

### 9.1 Unit Tests

- **`vllm_test.go`**: Test `VllmServerOptions` marshaling/unmarshaling and `BuildCommandArgs()`
- **`parser_test.go`**: Test command parsing for various formats
- **Integration tests**: Mock vLLM commands and validate parsing
### 9.2 Test Cases

```go
func TestBuildCommandArgs_VllmBasic(t *testing.T) {
	options := VllmServerOptions{
		Model:              "microsoft/DialoGPT-medium",
		Port:               8080,
		Host:               "localhost",
		EnableLogOutputs:   true,
		TensorParallelSize: 2,
	}

	args := options.BuildCommandArgs()
	// Validate expected arguments (excluding "serve")
}

func TestParseVllmCommand_FullCommand(t *testing.T) {
	command := "vllm serve --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g --tensor-parallel-size 2"
	result, err := ParseVllmCommand(command)
	// Validate parsing results
}
```
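To make the placeholder assertions concrete, one possible shape is sketched below; whether `BuildCommandArgs()` emits dashed or underscored flag names is an open implementation choice, so the expected values are assumptions:

```go
// containsArg reports whether args includes the exact token want.
func containsArg(args []string, want string) bool {
	for _, a := range args {
		if a == want {
			return true
		}
	}
	return false
}

func TestBuildCommandArgs_NoServeSubcommand(t *testing.T) {
	options := VllmServerOptions{
		Model:              "microsoft/DialoGPT-medium",
		TensorParallelSize: 2,
	}
	args := options.BuildCommandArgs()

	// "serve" is prepended at the instance options level, never here.
	if containsArg(args, "serve") {
		t.Errorf("BuildCommandArgs() must not emit the serve subcommand, got %v", args)
	}
	if !containsArg(args, "--tensor-parallel-size") {
		t.Errorf("expected --tensor-parallel-size in %v", args)
	}
}
```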
## 10. Example Usage

### 10.1 Parse Existing vLLM Command

```bash
curl -X POST http://localhost:8080/api/v1/backends/vllm/parse-command \
  -H "Authorization: Bearer your-management-key" \
  -H "Content-Type: application/json" \
  -d '{
    "command": "vllm serve --model ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g --tensor-parallel-size 2 --gpu-memory-utilization 0.5"
  }'
```

### 10.2 Create vLLM Instance

```bash
curl -X POST http://localhost:8080/api/v1/instances/my-vllm-model \
  -H "Authorization: Bearer your-management-key" \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
      "model": "ISTA-DASLab/gemma-3-27b-it-GPTQ-4b-128g",
      "tensor_parallel_size": 2,
      "gpu_memory_utilization": 0.5,
      "enable_log_outputs": true
    }
  }'
```

### 10.3 Use via OpenAI-Compatible API

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Authorization: Bearer your-inference-key" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-vllm-model",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```
## 11. Implementation Checklist

### Phase 1: Core Backend
- [ ] Create `pkg/backends/vllm/vllm.go`
- [ ] Implement `VllmServerOptions` struct with basic fields
- [ ] Implement `BuildCommandArgs()`, `UnmarshalJSON()`, `MarshalJSON()`
- [ ] Add comprehensive field mappings for CLI args
- [ ] Create unit tests for `VllmServerOptions`

### Phase 2: Command Parsing
- [ ] Create `pkg/backends/vllm/parser.go`
- [ ] Implement `ParseVllmCommand()` function
- [ ] Handle various command input formats
- [ ] Create comprehensive parser tests
- [ ] Test edge cases and error conditions

### Phase 3: Integration
- [ ] Add `BackendTypeVllm` to `pkg/backends/backend.go`
- [ ] Update `BackendConfig` in `pkg/config/config.go`
- [ ] Add environment variable support
- [ ] Update `CreateInstanceOptions` in `pkg/instance/options.go`
- [ ] Implement `BuildCommandArgs()` with "serve" prepending

### Phase 4: Lifecycle & Health Checks
- [ ] Update executable selection in `pkg/instance/lifecycle.go`
- [ ] Test instance startup and health checking (uses existing `/health` endpoint)
- [ ] Validate command execution flow

### Phase 5: API Integration
- [ ] Add `ParseVllmCommand()` handler in `pkg/server/handlers.go`
- [ ] Add vLLM route in `pkg/server/routes.go`
- [ ] Update validation in `pkg/validation/validation.go`
- [ ] Test API endpoints

### Phase 6: Testing & Documentation
- [ ] Create comprehensive integration tests
- [ ] Test with actual vLLM installation (if available)
- [ ] Update documentation
- [ ] Test OpenAI-compatible proxy functionality
## 12. Configuration Examples

### 12.1 YAML Configuration

```yaml
backends:
  llama_executable: "llama-server"
  mlx_lm_executable: "mlx_lm.server"
  vllm_executable: "vllm"

instances:
  # ... other instance settings
```

### 12.2 Environment Variables

```bash
export LLAMACTL_VLLM_EXECUTABLE="vllm"
# OR for custom installation
export LLAMACTL_VLLM_EXECUTABLE="python -m vllm"
# OR for containerized deployment
export LLAMACTL_VLLM_EXECUTABLE="docker run --rm --gpus all vllm/vllm-openai"
```
## 13. Notes and Considerations

### 13.1 Startup Time
- vLLM instances may take significantly longer to start than llama.cpp
- Consider documenting recommended timeout values
- The configurable `on_demand_start_timeout` should accommodate this

### 13.2 Resource Usage
- vLLM typically requires substantial GPU memory
- No special handling needed in llamactl (follows existing pattern)
- Resource management is left to the user/administrator

### 13.3 Model Compatibility
- Primarily designed for HuggingFace models
- Supports various quantization formats (GPTQ, AWQ, etc.)
- Model path validation can be basic (similar to other backends)

### 13.4 Future Enhancements
- Consider adding vLLM-specific parameter validation
- Could add model download/caching features
- May want to add vLLM version detection capabilities

This specification provides a comprehensive roadmap for implementing vLLM backend support while maintaining consistency with the existing llamactl architecture.