Mirror of https://github.com/lordmathis/llamactl.git
Refactor MLX and VLLM server options parsing and args building
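The change replaces the hand-maintained option handling in both backends: the custom UnmarshalJSON methods (which accepted CLI-style alternative field names) are dropped, and command-line arguments are now built generically from each options struct's JSON tags. A minimal sketch of the resulting parse/build round trip through the MLX backend follows; the command string and values are illustrative only, and the exact order of the returned arguments is not guaranteed.

```go
package main

import (
    "fmt"

    "llamactl/pkg/backends/mlx"
)

func main() {
    // Parse a CLI invocation into typed server options (illustrative input).
    opts, err := mlx.ParseMlxCommand("mlx_lm.server --model /models/test.mlx --temp 0.7 --trust-remote-code")
    if err != nil {
        panic(err)
    }

    // Rebuild the argument list; flag names come from the JSON tags
    // (snake_case converted to kebab-case), so the output should include
    // --model, --temp and --trust-remote-code.
    fmt.Println(opts.BuildCommandArgs())
}
```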
@@ -1,6 +1,7 @@
-package llamacpp
+package llamacpp_test
 
 import (
+    "llamactl/pkg/backends/llamacpp"
     "testing"
 )
 
@@ -11,28 +12,23 @@ func TestParseLlamaCommand(t *testing.T) {
         expectErr bool
     }{
         {
-            name: "basic command with model",
-            command: "llama-server --model /path/to/model.gguf",
+            name: "basic command",
+            command: "llama-server --model /path/to/model.gguf --gpu-layers 32",
             expectErr: false,
         },
         {
-            name: "command with multiple flags",
-            command: "llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096",
+            name: "args only",
+            command: "--model /path/to/model.gguf --ctx-size 4096",
             expectErr: false,
         },
         {
-            name: "command with short flags",
-            command: "llama-server -m /path/to/model.gguf -ngl 32 -c 4096",
+            name: "mixed flag formats",
+            command: "llama-server --model=/path/model.gguf --gpu-layers 16 --verbose",
             expectErr: false,
         },
         {
-            name: "command with equals format",
-            command: "llama-server --model=/path/to/model.gguf --gpu-layers=32",
-            expectErr: false,
-        },
-        {
-            name: "command with boolean flags",
-            command: "llama-server --model /path/to/model.gguf --verbose --no-mmap",
+            name: "quoted strings",
+            command: `llama-server --model test.gguf --api-key "sk-1234567890abcdef"`,
             expectErr: false,
         },
         {
@@ -41,46 +37,20 @@ func TestParseLlamaCommand(t *testing.T) {
             expectErr: true,
         },
         {
-            name: "case insensitive command",
-            command: "LLAMA-SERVER --model /path/to/model.gguf",
-            expectErr: false,
-        },
-        // New test cases for improved functionality
-        {
-            name: "args only without llama-server",
-            command: "--model /path/to/model.gguf --gpu-layers 32",
-            expectErr: false,
+            name: "unterminated quote",
+            command: `llama-server --model test.gguf --api-key "unterminated`,
+            expectErr: true,
         },
         {
-            name: "full path to executable",
-            command: "/usr/local/bin/llama-server --model /path/to/model.gguf",
-            expectErr: false,
-        },
-        {
-            name: "negative number handling",
-            command: "llama-server --gpu-layers -1 --model test.gguf",
-            expectErr: false,
-        },
-        {
-            name: "multiline command with backslashes",
-            command: "llama-server --model /path/to/model.gguf \\\n --ctx-size 4096 \\\n --batch-size 512",
-            expectErr: false,
-        },
-        {
-            name: "quoted string with special characters",
-            command: `llama-server --model test.gguf --chat-template "{% for message in messages %}{{ message.role }}: {{ message.content }}\n{% endfor %}"`,
-            expectErr: false,
-        },
-        {
-            name: "unterminated quoted string",
-            command: `llama-server --model test.gguf --chat-template "unterminated quote`,
+            name: "malformed flag",
+            command: "llama-server ---model test.gguf",
             expectErr: true,
         },
     }
 
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
-            result, err := ParseLlamaCommand(tt.command)
+            result, err := llamacpp.ParseLlamaCommand(tt.command)
 
             if tt.expectErr {
                 if err == nil {
@@ -96,16 +66,14 @@ func TestParseLlamaCommand(t *testing.T) {
 
             if result == nil {
                 t.Errorf("expected result but got nil")
-                return
             }
         })
     }
 }
 
-func TestParseLlamaCommandSpecificValues(t *testing.T) {
-    // Test specific value parsing
-    command := "llama-server --model /test/model.gguf --gpu-layers 32 --ctx-size 4096 --verbose"
-    result, err := ParseLlamaCommand(command)
+func TestParseLlamaCommandValues(t *testing.T) {
+    command := "llama-server --model /test/model.gguf --gpu-layers 32 --temp 0.7 --verbose --no-mmap"
+    result, err := llamacpp.ParseLlamaCommand(command)
 
     if err != nil {
         t.Fatalf("unexpected error: %v", err)
@@ -119,19 +87,22 @@ func TestParseLlamaCommandSpecificValues(t *testing.T) {
         t.Errorf("expected gpu_layers 32, got %d", result.GPULayers)
     }
 
-    if result.CtxSize != 4096 {
-        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
+    if result.Temperature != 0.7 {
+        t.Errorf("expected temperature 0.7, got %f", result.Temperature)
     }
 
     if !result.Verbose {
-        t.Errorf("expected verbose to be true, got %v", result.Verbose)
+        t.Errorf("expected verbose to be true")
+    }
+
+    if !result.NoMmap {
+        t.Errorf("expected no_mmap to be true")
     }
 }
 
-func TestParseLlamaCommandArrayFlags(t *testing.T) {
-    // Test array flag handling (critical for lora, override-tensor, etc.)
-    command := "llama-server --model test.gguf --lora adapter1.bin --lora adapter2.bin"
-    result, err := ParseLlamaCommand(command)
+func TestParseLlamaCommandArrays(t *testing.T) {
+    command := "llama-server --model test.gguf --lora adapter1.bin --lora=adapter2.bin"
+    result, err := llamacpp.ParseLlamaCommand(command)
 
     if err != nil {
         t.Fatalf("unexpected error: %v", err)
@@ -141,273 +112,10 @@ func TestParseLlamaCommandArrayFlags(t *testing.T) {
         t.Errorf("expected 2 lora adapters, got %d", len(result.Lora))
     }
 
-    if result.Lora[0] != "adapter1.bin" || result.Lora[1] != "adapter2.bin" {
-        t.Errorf("expected lora adapters [adapter1.bin, adapter2.bin], got %v", result.Lora)
-    }
-}
-
-func TestParseLlamaCommandMixedFormats(t *testing.T) {
-    // Test mixing --flag=value and --flag value formats
-    command := "llama-server --model=/path/model.gguf --gpu-layers 16 --batch-size=512 --verbose"
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    if result.Model != "/path/model.gguf" {
-        t.Errorf("expected model '/path/model.gguf', got '%s'", result.Model)
-    }
-
-    if result.GPULayers != 16 {
-        t.Errorf("expected gpu_layers 16, got %d", result.GPULayers)
-    }
-
-    if result.BatchSize != 512 {
-        t.Errorf("expected batch_size 512, got %d", result.BatchSize)
-    }
-
-    if !result.Verbose {
-        t.Errorf("expected verbose to be true, got %v", result.Verbose)
-    }
-}
-
-func TestParseLlamaCommandTypeConversion(t *testing.T) {
-    // Test that values are converted to appropriate types
-    command := "llama-server --model test.gguf --temp 0.7 --top-k 40 --no-mmap"
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    if result.Temperature != 0.7 {
-        t.Errorf("expected temperature 0.7, got %f", result.Temperature)
-    }
-
-    if result.TopK != 40 {
-        t.Errorf("expected top_k 40, got %d", result.TopK)
-    }
-
-    if !result.NoMmap {
-        t.Errorf("expected no_mmap to be true, got %v", result.NoMmap)
-    }
-}
-
-func TestParseLlamaCommandArgsOnly(t *testing.T) {
-    // Test parsing arguments without llama-server command
-    command := "--model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096"
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    if result.Model != "/path/to/model.gguf" {
-        t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model)
-    }
-
-    if result.GPULayers != 32 {
-        t.Errorf("expected gpu_layers 32, got %d", result.GPULayers)
-    }
-
-    if result.CtxSize != 4096 {
-        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
-    }
-}
-
-func TestParseLlamaCommandFullPath(t *testing.T) {
-    // Test full path to executable
-    command := "/usr/local/bin/llama-server --model test.gguf --gpu-layers 16"
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    if result.Model != "test.gguf" {
-        t.Errorf("expected model 'test.gguf', got '%s'", result.Model)
-    }
-
-    if result.GPULayers != 16 {
-        t.Errorf("expected gpu_layers 16, got %d", result.GPULayers)
-    }
-}
-
-func TestParseLlamaCommandNegativeNumbers(t *testing.T) {
-    // Test negative number parsing
-    command := "llama-server --model test.gguf --gpu-layers -1 --seed -12345"
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    if result.GPULayers != -1 {
-        t.Errorf("expected gpu_layers -1, got %d", result.GPULayers)
-    }
-
-    if result.Seed != -12345 {
-        t.Errorf("expected seed -12345, got %d", result.Seed)
-    }
-}
-
-func TestParseLlamaCommandMultiline(t *testing.T) {
-    // Test multiline command with backslashes
-    command := `llama-server --model /path/to/model.gguf \
-    --ctx-size 4096 \
-    --batch-size 512 \
-    --gpu-layers 32`
-
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    if result.Model != "/path/to/model.gguf" {
-        t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model)
-    }
-
-    if result.CtxSize != 4096 {
-        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
-    }
-
-    if result.BatchSize != 512 {
-        t.Errorf("expected batch_size 512, got %d", result.BatchSize)
-    }
-
-    if result.GPULayers != 32 {
-        t.Errorf("expected gpu_layers 32, got %d", result.GPULayers)
-    }
-}
-
-func TestParseLlamaCommandQuotedStrings(t *testing.T) {
-    // Test quoted strings with special characters
-    command := `llama-server --model test.gguf --api-key "sk-1234567890abcdef" --chat-template "User: {user}\nAssistant: "`
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    if result.Model != "test.gguf" {
-        t.Errorf("expected model 'test.gguf', got '%s'", result.Model)
-    }
-
-    if result.APIKey != "sk-1234567890abcdef" {
-        t.Errorf("expected api_key 'sk-1234567890abcdef', got '%s'", result.APIKey)
-    }
-
-    expectedTemplate := "User: {user}\\nAssistant: "
-    if result.ChatTemplate != expectedTemplate {
-        t.Errorf("expected chat_template '%s', got '%s'", expectedTemplate, result.ChatTemplate)
-    }
-}
-
-func TestParseLlamaCommandUnslothExample(t *testing.T) {
-    // Test with realistic unsloth-style command
-    command := `llama-server --model /path/to/model.gguf \
-    --ctx-size 4096 \
-    --batch-size 512 \
-    --gpu-layers -1 \
-    --temp 0.7 \
-    --repeat-penalty 1.1 \
-    --top-k 40 \
-    --top-p 0.95 \
-    --host 0.0.0.0 \
-    --port 8000 \
-    --api-key "sk-1234567890abcdef"`
-
-    result, err := ParseLlamaCommand(command)
-
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-
-    // Verify key fields
-    if result.Model != "/path/to/model.gguf" {
-        t.Errorf("expected model '/path/to/model.gguf', got '%s'", result.Model)
-    }
-
-    if result.CtxSize != 4096 {
-        t.Errorf("expected ctx_size 4096, got %d", result.CtxSize)
-    }
-
-    if result.BatchSize != 512 {
-        t.Errorf("expected batch_size 512, got %d", result.BatchSize)
-    }
-
-    if result.GPULayers != -1 {
-        t.Errorf("expected gpu_layers -1, got %d", result.GPULayers)
-    }
-
-    if result.Temperature != 0.7 {
-        t.Errorf("expected temperature 0.7, got %f", result.Temperature)
-    }
-
-    if result.RepeatPenalty != 1.1 {
-        t.Errorf("expected repeat_penalty 1.1, got %f", result.RepeatPenalty)
-    }
-
-    if result.TopK != 40 {
-        t.Errorf("expected top_k 40, got %d", result.TopK)
-    }
-
-    if result.TopP != 0.95 {
-        t.Errorf("expected top_p 0.95, got %f", result.TopP)
-    }
-
-    if result.Host != "0.0.0.0" {
-        t.Errorf("expected host '0.0.0.0', got '%s'", result.Host)
-    }
-
-    if result.Port != 8000 {
-        t.Errorf("expected port 8000, got %d", result.Port)
-    }
-
-    if result.APIKey != "sk-1234567890abcdef" {
-        t.Errorf("expected api_key 'sk-1234567890abcdef', got '%s'", result.APIKey)
-    }
-}
-
-// Focused additional edge case tests (kept minimal per guidance)
-func TestParseLlamaCommandSingleQuotedValue(t *testing.T) {
-    cmd := "llama-server --model 'my model.gguf' --alias 'Test Alias'"
-    result, err := ParseLlamaCommand(cmd)
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-    if result.Model != "my model.gguf" {
-        t.Errorf("expected model 'my model.gguf', got '%s'", result.Model)
-    }
-    if result.Alias != "Test Alias" {
-        t.Errorf("expected alias 'Test Alias', got '%s'", result.Alias)
-    }
-}
-
-func TestParseLlamaCommandMixedArrayForms(t *testing.T) {
-    // Same multi-value flag using --flag value and --flag=value forms
-    cmd := "llama-server --lora adapter1.bin --lora=adapter2.bin --lora adapter3.bin"
-    result, err := ParseLlamaCommand(cmd)
-    if err != nil {
-        t.Fatalf("unexpected error: %v", err)
-    }
-    if len(result.Lora) != 3 {
-        t.Fatalf("expected 3 lora values, got %d (%v)", len(result.Lora), result.Lora)
-    }
-    expected := []string{"adapter1.bin", "adapter2.bin", "adapter3.bin"}
+    expected := []string{"adapter1.bin", "adapter2.bin"}
     for i, v := range expected {
         if result.Lora[i] != v {
             t.Errorf("expected lora[%d]=%s got %s", i, v, result.Lora[i])
         }
     }
 }
-
-func TestParseLlamaCommandMalformedFlag(t *testing.T) {
-    cmd := "llama-server ---model test.gguf"
-    _, err := ParseLlamaCommand(cmd)
-    if err == nil {
-        t.Fatalf("expected error for malformed flag but got none")
-    }
-}

@@ -1,16 +1,16 @@
 package mlx
 
 import (
-    "encoding/json"
     "reflect"
     "strconv"
+    "strings"
 )
 
 type MlxServerOptions struct {
     // Basic connection options
     Model string `json:"model,omitempty"`
     Host string `json:"host,omitempty"`
     Port int `json:"port,omitempty"`
 
     // Model and adapter options
     AdapterPath string `json:"adapter_path,omitempty"`
@@ -19,187 +19,70 @@ type MlxServerOptions struct {
     TrustRemoteCode bool `json:"trust_remote_code,omitempty"`
 
     // Logging and templates
     LogLevel string `json:"log_level,omitempty"`
     ChatTemplate string `json:"chat_template,omitempty"`
     UseDefaultChatTemplate bool `json:"use_default_chat_template,omitempty"`
     ChatTemplateArgs string `json:"chat_template_args,omitempty"` // JSON string
 
     // Sampling defaults
     Temp float64 `json:"temp,omitempty"` // Note: MLX uses "temp" not "temperature"
     TopP float64 `json:"top_p,omitempty"`
     TopK int `json:"top_k,omitempty"`
     MinP float64 `json:"min_p,omitempty"`
     MaxTokens int `json:"max_tokens,omitempty"`
 }
 
-// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names
-func (o *MlxServerOptions) UnmarshalJSON(data []byte) error {
-    // First unmarshal into a map to handle multiple field names
-    var raw map[string]any
-    if err := json.Unmarshal(data, &raw); err != nil {
-        return err
-    }
+// BuildCommandArgs converts to command line arguments using reflection
+func (o *MlxServerOptions) BuildCommandArgs() []string {
+    var args []string
 
-    // Create a temporary struct for standard unmarshaling
-    type tempOptions MlxServerOptions
-    temp := tempOptions{}
+    v := reflect.ValueOf(o).Elem()
+    t := v.Type()
 
-    // Standard unmarshal first
-    if err := json.Unmarshal(data, &temp); err != nil {
-        return err
-    }
+    for i := 0; i < v.NumField(); i++ {
+        field := v.Field(i)
+        fieldType := t.Field(i)
 
-    // Copy to our struct
-    *o = MlxServerOptions(temp)
+        // Skip unexported fields
+        if !field.CanInterface() {
+            continue
+        }
 
-    // Handle alternative field names
-    fieldMappings := map[string]string{
-        // Basic connection options
-        "m": "model",
-        "host": "host",
-        "port": "port",
-        // "python_path": "python_path", // removed
+        // Get the JSON tag to determine the flag name
+        jsonTag := fieldType.Tag.Get("json")
+        if jsonTag == "" || jsonTag == "-" {
+            continue
+        }
 
-        // Model and adapter options
-        "adapter-path": "adapter_path",
-        "draft-model": "draft_model",
-        "num-draft-tokens": "num_draft_tokens",
-        "trust-remote-code": "trust_remote_code",
+        // Remove ",omitempty" from the tag
+        flagName := jsonTag
+        if commaIndex := strings.Index(jsonTag, ","); commaIndex != -1 {
+            flagName = jsonTag[:commaIndex]
+        }
 
-        // Logging and templates
-        "log-level": "log_level",
-        "chat-template": "chat_template",
-        "use-default-chat-template": "use_default_chat_template",
-        "chat-template-args": "chat_template_args",
+        // Convert snake_case to kebab-case for CLI flags
+        flagName = strings.ReplaceAll(flagName, "_", "-")
 
-        // Sampling defaults
-        "temperature": "temp", // Support both temp and temperature
-        "top-p": "top_p",
-        "top-k": "top_k",
-        "min-p": "min_p",
-        "max-tokens": "max_tokens",
-    }
-
-    // Process alternative field names
-    for altName, canonicalName := range fieldMappings {
-        if value, exists := raw[altName]; exists {
-            // Use reflection to set the field value
-            v := reflect.ValueOf(o).Elem()
-            field := v.FieldByNameFunc(func(fieldName string) bool {
-                field, _ := v.Type().FieldByName(fieldName)
-                jsonTag := field.Tag.Get("json")
-                return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName
-            })
-
-            if field.IsValid() && field.CanSet() {
-                switch field.Kind() {
-                case reflect.Int:
-                    if intVal, ok := value.(float64); ok {
-                        field.SetInt(int64(intVal))
-                    } else if strVal, ok := value.(string); ok {
-                        if intVal, err := strconv.Atoi(strVal); err == nil {
-                            field.SetInt(int64(intVal))
-                        }
-                    }
-                case reflect.Float64:
-                    if floatVal, ok := value.(float64); ok {
-                        field.SetFloat(floatVal)
-                    } else if strVal, ok := value.(string); ok {
-                        if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil {
-                            field.SetFloat(floatVal)
-                        }
-                    }
-                case reflect.String:
-                    if strVal, ok := value.(string); ok {
-                        field.SetString(strVal)
-                    }
-                case reflect.Bool:
-                    if boolVal, ok := value.(bool); ok {
-                        field.SetBool(boolVal)
-                    }
-                }
+        // Add the appropriate arguments based on field type and value
+        switch field.Kind() {
+        case reflect.Bool:
+            if field.Bool() {
+                args = append(args, "--"+flagName)
+            }
+        case reflect.Int:
+            if field.Int() != 0 {
+                args = append(args, "--"+flagName, strconv.FormatInt(field.Int(), 10))
+            }
+        case reflect.Float64:
+            if field.Float() != 0 {
+                args = append(args, "--"+flagName, strconv.FormatFloat(field.Float(), 'f', -1, 64))
+            }
+        case reflect.String:
+            if field.String() != "" {
+                args = append(args, "--"+flagName, field.String())
             }
         }
     }
 
-    return nil
-}
-
-// NewMlxServerOptions creates MlxServerOptions with MLX defaults
-func NewMlxServerOptions() *MlxServerOptions {
-    return &MlxServerOptions{
-        Host: "127.0.0.1", // MLX default (different from llama-server)
-        Port: 8080, // MLX default
-        NumDraftTokens: 3, // MLX default for speculative decoding
-        LogLevel: "INFO", // MLX default
-        Temp: 0.0, // MLX default
-        TopP: 1.0, // MLX default
-        TopK: 0, // MLX default (disabled)
-        MinP: 0.0, // MLX default (disabled)
-        MaxTokens: 512, // MLX default
-        ChatTemplateArgs: "{}", // MLX default (empty JSON object)
-    }
-}
-
-// BuildCommandArgs converts to command line arguments
-func (o *MlxServerOptions) BuildCommandArgs() []string {
-    var args []string
-
-    // Required and basic options
-    if o.Model != "" {
-        args = append(args, "--model", o.Model)
-    }
-    if o.Host != "" {
-        args = append(args, "--host", o.Host)
-    }
-    if o.Port != 0 {
-        args = append(args, "--port", strconv.Itoa(o.Port))
-    }
-
-    // Model and adapter options
-    if o.AdapterPath != "" {
-        args = append(args, "--adapter-path", o.AdapterPath)
-    }
-    if o.DraftModel != "" {
-        args = append(args, "--draft-model", o.DraftModel)
-    }
-    if o.NumDraftTokens != 0 {
-        args = append(args, "--num-draft-tokens", strconv.Itoa(o.NumDraftTokens))
-    }
-    if o.TrustRemoteCode {
-        args = append(args, "--trust-remote-code")
-    }
-
-    // Logging and templates
-    if o.LogLevel != "" {
-        args = append(args, "--log-level", o.LogLevel)
-    }
-    if o.ChatTemplate != "" {
-        args = append(args, "--chat-template", o.ChatTemplate)
-    }
-    if o.UseDefaultChatTemplate {
-        args = append(args, "--use-default-chat-template")
-    }
-    if o.ChatTemplateArgs != "" {
-        args = append(args, "--chat-template-args", o.ChatTemplateArgs)
-    }
-
-    // Sampling defaults
-    if o.Temp != 0 {
-        args = append(args, "--temp", strconv.FormatFloat(o.Temp, 'f', -1, 64))
-    }
-    if o.TopP != 0 {
-        args = append(args, "--top-p", strconv.FormatFloat(o.TopP, 'f', -1, 64))
-    }
-    if o.TopK != 0 {
-        args = append(args, "--top-k", strconv.Itoa(o.TopK))
-    }
-    if o.MinP != 0 {
-        args = append(args, "--min-p", strconv.FormatFloat(o.MinP, 'f', -1, 64))
-    }
-    if o.MaxTokens != 0 {
-        args = append(args, "--max-tokens", strconv.Itoa(o.MaxTokens))
-    }
-
     return args
 }

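For reference, the flag names emitted by the new reflection-based BuildCommandArgs are derived purely from the struct's JSON tags: strip ",omitempty", then turn snake_case into kebab-case. A standalone sketch of that conversion, using the adapter_path tag from MlxServerOptions as the input:

```go
package main

import (
    "fmt"
    "strings"
)

func main() {
    jsonTag := "adapter_path,omitempty" // tag as it appears on the struct field

    // Strip the ",omitempty" suffix.
    flagName := jsonTag
    if i := strings.Index(jsonTag, ","); i != -1 {
        flagName = jsonTag[:i]
    }

    // snake_case -> kebab-case for the CLI flag.
    flagName = strings.ReplaceAll(flagName, "_", "-")

    fmt.Println("--" + flagName) // prints: --adapter-path
}
```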
pkg/backends/mlx/mlx_test.go (new file, 62 lines)
@@ -0,0 +1,62 @@
+package mlx_test
+
+import (
+    "llamactl/pkg/backends/mlx"
+    "testing"
+)
+
+func TestBuildCommandArgs(t *testing.T) {
+    options := &mlx.MlxServerOptions{
+        Model: "/test/model.mlx",
+        Host: "127.0.0.1",
+        Port: 8080,
+        Temp: 0.7,
+        TopP: 0.9,
+        TopK: 40,
+        MaxTokens: 2048,
+        TrustRemoteCode: true,
+        LogLevel: "DEBUG",
+        ChatTemplate: "custom template",
+    }
+
+    args := options.BuildCommandArgs()
+
+    // Check that all expected flags are present
+    expectedFlags := map[string]string{
+        "--model": "/test/model.mlx",
+        "--host": "127.0.0.1",
+        "--port": "8080",
+        "--log-level": "DEBUG",
+        "--chat-template": "custom template",
+        "--temp": "0.7",
+        "--top-p": "0.9",
+        "--top-k": "40",
+        "--max-tokens": "2048",
+    }
+
+    for i := 0; i < len(args); i++ {
+        if args[i] == "--trust-remote-code" {
+            continue // Boolean flag with no value
+        }
+        if args[i] == "--use-default-chat-template" {
+            continue // Boolean flag with no value
+        }
+
+        if expectedValue, exists := expectedFlags[args[i]]; exists && i+1 < len(args) {
+            if args[i+1] != expectedValue {
+                t.Errorf("expected %s to have value %s, got %s", args[i], expectedValue, args[i+1])
+            }
+        }
+    }
+
+    // Check boolean flags
+    foundTrustRemoteCode := false
+    for _, arg := range args {
+        if arg == "--trust-remote-code" {
+            foundTrustRemoteCode = true
+        }
+    }
+    if !foundTrustRemoteCode {
+        t.Errorf("expected --trust-remote-code flag to be present")
+    }
+}

pkg/backends/mlx/parser_test.go (new file, 101 lines)
@@ -0,0 +1,101 @@
+package mlx_test
+
+import (
+    "llamactl/pkg/backends/mlx"
+    "testing"
+)
+
+func TestParseMlxCommand(t *testing.T) {
+    tests := []struct {
+        name string
+        command string
+        expectErr bool
+    }{
+        {
+            name: "basic command",
+            command: "mlx_lm.server --model /path/to/model --host 0.0.0.0",
+            expectErr: false,
+        },
+        {
+            name: "args only",
+            command: "--model /path/to/model --port 8080",
+            expectErr: false,
+        },
+        {
+            name: "mixed flag formats",
+            command: "mlx_lm.server --model=/path/model --temp=0.7 --trust-remote-code",
+            expectErr: false,
+        },
+        {
+            name: "quoted strings",
+            command: `mlx_lm.server --model test.mlx --chat-template "User: {user}\nAssistant: "`,
+            expectErr: false,
+        },
+        {
+            name: "empty command",
+            command: "",
+            expectErr: true,
+        },
+        {
+            name: "unterminated quote",
+            command: `mlx_lm.server --model test.mlx --chat-template "unterminated`,
+            expectErr: true,
+        },
+        {
+            name: "malformed flag",
+            command: "mlx_lm.server ---model test.mlx",
+            expectErr: true,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            result, err := mlx.ParseMlxCommand(tt.command)
+
+            if tt.expectErr {
+                if err == nil {
+                    t.Errorf("expected error but got none")
+                }
+                return
+            }
+
+            if err != nil {
+                t.Errorf("unexpected error: %v", err)
+                return
+            }
+
+            if result == nil {
+                t.Errorf("expected result but got nil")
+            }
+        })
+    }
+}
+
+func TestParseMlxCommandValues(t *testing.T) {
+    command := "mlx_lm.server --model /test/model.mlx --port 8080 --temp 0.7 --trust-remote-code --log-level DEBUG"
+    result, err := mlx.ParseMlxCommand(command)
+
+    if err != nil {
+        t.Fatalf("unexpected error: %v", err)
+    }
+
+    if result.Model != "/test/model.mlx" {
+        t.Errorf("expected model '/test/model.mlx', got '%s'", result.Model)
+    }
+
+    if result.Port != 8080 {
+        t.Errorf("expected port 8080, got %d", result.Port)
+    }
+
+    if result.Temp != 0.7 {
+        t.Errorf("expected temp 0.7, got %f", result.Temp)
+    }
+
+    if !result.TrustRemoteCode {
+        t.Errorf("expected trust_remote_code to be true")
+    }
+
+    if result.LogLevel != "DEBUG" {
+        t.Errorf("expected log_level 'DEBUG', got '%s'", result.LogLevel)
+    }
+}

@@ -1,6 +1,7 @@
-package vllm
+package vllm_test
 
 import (
+    "llamactl/pkg/backends/vllm"
     "testing"
 )
 
@@ -39,7 +40,7 @@ func TestParseVllmCommand(t *testing.T) {
 
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
-            result, err := ParseVllmCommand(tt.command)
+            result, err := vllm.ParseVllmCommand(tt.command)
 
             if tt.expectErr {
                 if err == nil {
@@ -62,7 +63,7 @@ func TestParseVllmCommand(t *testing.T) {
 
 func TestParseVllmCommandValues(t *testing.T) {
     command := "vllm serve --model test-model --tensor-parallel-size 4 --gpu-memory-utilization 0.8 --enable-log-outputs"
-    result, err := ParseVllmCommand(command)
+    result, err := vllm.ParseVllmCommand(command)
 
     if err != nil {
         t.Fatalf("unexpected error: %v", err)

@@ -1,7 +1,6 @@
 package vllm
 
 import (
-    "encoding/json"
     "reflect"
     "strconv"
     "strings"
@@ -13,349 +12,124 @@ type VllmServerOptions struct {
     Port int `json:"port,omitempty"`
 
     // Model and engine configuration
     Model string `json:"model,omitempty"`
     Tokenizer string `json:"tokenizer,omitempty"`
     SkipTokenizerInit bool `json:"skip_tokenizer_init,omitempty"`
     Revision string `json:"revision,omitempty"`
     CodeRevision string `json:"code_revision,omitempty"`
     TokenizerRevision string `json:"tokenizer_revision,omitempty"`
     TokenizerMode string `json:"tokenizer_mode,omitempty"`
     TrustRemoteCode bool `json:"trust_remote_code,omitempty"`
     DownloadDir string `json:"download_dir,omitempty"`
     LoadFormat string `json:"load_format,omitempty"`
     ConfigFormat string `json:"config_format,omitempty"`
     Dtype string `json:"dtype,omitempty"`
     KVCacheDtype string `json:"kv_cache_dtype,omitempty"`
     QuantizationParamPath string `json:"quantization_param_path,omitempty"`
     Seed int `json:"seed,omitempty"`
     MaxModelLen int `json:"max_model_len,omitempty"`
     GuidedDecodingBackend string `json:"guided_decoding_backend,omitempty"`
     DistributedExecutorBackend string `json:"distributed_executor_backend,omitempty"`
     WorkerUseRay bool `json:"worker_use_ray,omitempty"`
     RayWorkersUseNSight bool `json:"ray_workers_use_nsight,omitempty"`
 
     // Performance and serving configuration
     BlockSize int `json:"block_size,omitempty"`
     EnablePrefixCaching bool `json:"enable_prefix_caching,omitempty"`
     DisableSlidingWindow bool `json:"disable_sliding_window,omitempty"`
     UseV2BlockManager bool `json:"use_v2_block_manager,omitempty"`
     NumLookaheadSlots int `json:"num_lookahead_slots,omitempty"`
     SwapSpace int `json:"swap_space,omitempty"`
     CPUOffloadGB int `json:"cpu_offload_gb,omitempty"`
     GPUMemoryUtilization float64 `json:"gpu_memory_utilization,omitempty"`
     NumGPUBlocksOverride int `json:"num_gpu_blocks_override,omitempty"`
     MaxNumBatchedTokens int `json:"max_num_batched_tokens,omitempty"`
     MaxNumSeqs int `json:"max_num_seqs,omitempty"`
     MaxLogprobs int `json:"max_logprobs,omitempty"`
     DisableLogStats bool `json:"disable_log_stats,omitempty"`
     Quantization string `json:"quantization,omitempty"`
     RopeScaling string `json:"rope_scaling,omitempty"`
     RopeTheta float64 `json:"rope_theta,omitempty"`
     EnforceEager bool `json:"enforce_eager,omitempty"`
     MaxContextLenToCapture int `json:"max_context_len_to_capture,omitempty"`
     MaxSeqLenToCapture int `json:"max_seq_len_to_capture,omitempty"`
     DisableCustomAllReduce bool `json:"disable_custom_all_reduce,omitempty"`
     TokenizerPoolSize int `json:"tokenizer_pool_size,omitempty"`
     TokenizerPoolType string `json:"tokenizer_pool_type,omitempty"`
     TokenizerPoolExtraConfig string `json:"tokenizer_pool_extra_config,omitempty"`
     EnableLoraBias bool `json:"enable_lora_bias,omitempty"`
     LoraExtraVocabSize int `json:"lora_extra_vocab_size,omitempty"`
     LoraRank int `json:"lora_rank,omitempty"`
     PromptLookbackDistance int `json:"prompt_lookback_distance,omitempty"`
     PreemptionMode string `json:"preemption_mode,omitempty"`
 
     // Distributed and parallel processing
     TensorParallelSize int `json:"tensor_parallel_size,omitempty"`
     PipelineParallelSize int `json:"pipeline_parallel_size,omitempty"`
     MaxParallelLoadingWorkers int `json:"max_parallel_loading_workers,omitempty"`
     DisableAsyncOutputProc bool `json:"disable_async_output_proc,omitempty"`
     WorkerClass string `json:"worker_class,omitempty"`
     EnabledLoraModules string `json:"enabled_lora_modules,omitempty"`
     MaxLoraRank int `json:"max_lora_rank,omitempty"`
     FullyShardedLoras bool `json:"fully_sharded_loras,omitempty"`
     LoraModules string `json:"lora_modules,omitempty"`
     PromptAdapters string `json:"prompt_adapters,omitempty"`
     MaxPromptAdapterToken int `json:"max_prompt_adapter_token,omitempty"`
     Device string `json:"device,omitempty"`
     SchedulerDelay float64 `json:"scheduler_delay,omitempty"`
     EnableChunkedPrefill bool `json:"enable_chunked_prefill,omitempty"`
     SpeculativeModel string `json:"speculative_model,omitempty"`
     SpeculativeModelQuantization string `json:"speculative_model_quantization,omitempty"`
     SpeculativeRevision string `json:"speculative_revision,omitempty"`
     SpeculativeMaxModelLen int `json:"speculative_max_model_len,omitempty"`
     SpeculativeDisableByBatchSize int `json:"speculative_disable_by_batch_size,omitempty"`
     NgptSpeculativeLength int `json:"ngpt_speculative_length,omitempty"`
     SpeculativeDisableMqa bool `json:"speculative_disable_mqa,omitempty"`
     ModelLoaderExtraConfig string `json:"model_loader_extra_config,omitempty"`
     IgnorePatterns string `json:"ignore_patterns,omitempty"`
     PreloadedLoraModules string `json:"preloaded_lora_modules,omitempty"`
 
     // OpenAI server specific options
     UDS string `json:"uds,omitempty"`
     UvicornLogLevel string `json:"uvicorn_log_level,omitempty"`
     ResponseRole string `json:"response_role,omitempty"`
     SSLKeyfile string `json:"ssl_keyfile,omitempty"`
     SSLCertfile string `json:"ssl_certfile,omitempty"`
     SSLCACerts string `json:"ssl_ca_certs,omitempty"`
     SSLCertReqs int `json:"ssl_cert_reqs,omitempty"`
     RootPath string `json:"root_path,omitempty"`
     Middleware []string `json:"middleware,omitempty"`
     ReturnTokensAsTokenIDS bool `json:"return_tokens_as_token_ids,omitempty"`
     DisableFrontendMultiprocessing bool `json:"disable_frontend_multiprocessing,omitempty"`
     EnableAutoToolChoice bool `json:"enable_auto_tool_choice,omitempty"`
     ToolCallParser string `json:"tool_call_parser,omitempty"`
     ToolServer string `json:"tool_server,omitempty"`
     ChatTemplate string `json:"chat_template,omitempty"`
     ChatTemplateContentFormat string `json:"chat_template_content_format,omitempty"`
     AllowCredentials bool `json:"allow_credentials,omitempty"`
     AllowedOrigins []string `json:"allowed_origins,omitempty"`
     AllowedMethods []string `json:"allowed_methods,omitempty"`
     AllowedHeaders []string `json:"allowed_headers,omitempty"`
     APIKey []string `json:"api_key,omitempty"`
     EnableLogOutputs bool `json:"enable_log_outputs,omitempty"`
     EnableTokenUsage bool `json:"enable_token_usage,omitempty"`
     EnableAsyncEngineDebug bool `json:"enable_async_engine_debug,omitempty"`
     EngineUseRay bool `json:"engine_use_ray,omitempty"`
     DisableLogRequests bool `json:"disable_log_requests,omitempty"`
     MaxLogLen int `json:"max_log_len,omitempty"`
 
     // Additional engine configuration
     Task string `json:"task,omitempty"`
     MultiModalConfig string `json:"multi_modal_config,omitempty"`
     LimitMmPerPrompt string `json:"limit_mm_per_prompt,omitempty"`
     EnableSleepMode bool `json:"enable_sleep_mode,omitempty"`
     EnableChunkingRequest bool `json:"enable_chunking_request,omitempty"`
     CompilationConfig string `json:"compilation_config,omitempty"`
     DisableSlidingWindowMask bool `json:"disable_sliding_window_mask,omitempty"`
     EnableTRTLLMEngineLatency bool `json:"enable_trtllm_engine_latency,omitempty"`
     OverridePoolingConfig string `json:"override_pooling_config,omitempty"`
     OverrideNeuronConfig string `json:"override_neuron_config,omitempty"`
     OverrideKVCacheALIGNSize int `json:"override_kv_cache_align_size,omitempty"`
-}
-
-// NewVllmServerOptions creates a new VllmServerOptions with defaults
-func NewVllmServerOptions() *VllmServerOptions {
-    return &VllmServerOptions{
-        Host: "127.0.0.1",
-        Port: 8000,
-        TensorParallelSize: 1,
-        PipelineParallelSize: 1,
-        GPUMemoryUtilization: 0.9,
-        BlockSize: 16,
-        SwapSpace: 4,
-        UvicornLogLevel: "info",
-        ResponseRole: "assistant",
-        TokenizerMode: "auto",
-        TrustRemoteCode: false,
-        EnablePrefixCaching: false,
-        EnforceEager: false,
-        DisableLogStats: false,
-        DisableLogRequests: false,
-        MaxLogprobs: 20,
-        EnableLogOutputs: false,
-        EnableTokenUsage: false,
-        AllowCredentials: false,
-        AllowedOrigins: []string{"*"},
-        AllowedMethods: []string{"*"},
-        AllowedHeaders: []string{"*"},
-    }
-}
-
-// UnmarshalJSON implements custom JSON unmarshaling to support multiple field names
-func (o *VllmServerOptions) UnmarshalJSON(data []byte) error {
-    // First unmarshal into a map to handle multiple field names
-    var raw map[string]any
-    if err := json.Unmarshal(data, &raw); err != nil {
-        return err
-    }
-
-    // Create a temporary struct for standard unmarshaling
-    type tempOptions VllmServerOptions
-    temp := tempOptions{}
-
-    // Standard unmarshal first
-    if err := json.Unmarshal(data, &temp); err != nil {
-        return err
-    }
-
-    // Copy to our struct
-    *o = VllmServerOptions(temp)
-
-    // Handle alternative field names (CLI format with dashes)
-    fieldMappings := map[string]string{
-        // Basic options
-        "tensor-parallel-size": "tensor_parallel_size",
-        "pipeline-parallel-size": "pipeline_parallel_size",
-        "max-parallel-loading-workers": "max_parallel_loading_workers",
-        "disable-async-output-proc": "disable_async_output_proc",
-        "worker-class": "worker_class",
-        "enabled-lora-modules": "enabled_lora_modules",
-        "max-lora-rank": "max_lora_rank",
-        "fully-sharded-loras": "fully_sharded_loras",
-        "lora-modules": "lora_modules",
-        "prompt-adapters": "prompt_adapters",
-        "max-prompt-adapter-token": "max_prompt_adapter_token",
-        "scheduler-delay": "scheduler_delay",
-        "enable-chunked-prefill": "enable_chunked_prefill",
-        "speculative-model": "speculative_model",
-        "speculative-model-quantization": "speculative_model_quantization",
-        "speculative-revision": "speculative_revision",
-        "speculative-max-model-len": "speculative_max_model_len",
-        "speculative-disable-by-batch-size": "speculative_disable_by_batch_size",
-        "ngpt-speculative-length": "ngpt_speculative_length",
-        "speculative-disable-mqa": "speculative_disable_mqa",
-        "model-loader-extra-config": "model_loader_extra_config",
-        "ignore-patterns": "ignore_patterns",
-        "preloaded-lora-modules": "preloaded_lora_modules",
-
-        // Model configuration
-        "skip-tokenizer-init": "skip_tokenizer_init",
-        "code-revision": "code_revision",
-        "tokenizer-revision": "tokenizer_revision",
-        "tokenizer-mode": "tokenizer_mode",
-        "trust-remote-code": "trust_remote_code",
-        "download-dir": "download_dir",
-        "load-format": "load_format",
-        "config-format": "config_format",
-        "kv-cache-dtype": "kv_cache_dtype",
-        "quantization-param-path": "quantization_param_path",
-        "max-model-len": "max_model_len",
-        "guided-decoding-backend": "guided_decoding_backend",
-        "distributed-executor-backend": "distributed_executor_backend",
-        "worker-use-ray": "worker_use_ray",
-        "ray-workers-use-nsight": "ray_workers_use_nsight",
-
-        // Performance configuration
-        "block-size": "block_size",
-        "enable-prefix-caching": "enable_prefix_caching",
-        "disable-sliding-window": "disable_sliding_window",
-        "use-v2-block-manager": "use_v2_block_manager",
-        "num-lookahead-slots": "num_lookahead_slots",
-        "swap-space": "swap_space",
-        "cpu-offload-gb": "cpu_offload_gb",
-        "gpu-memory-utilization": "gpu_memory_utilization",
-        "num-gpu-blocks-override": "num_gpu_blocks_override",
-        "max-num-batched-tokens": "max_num_batched_tokens",
-        "max-num-seqs": "max_num_seqs",
-        "max-logprobs": "max_logprobs",
-        "disable-log-stats": "disable_log_stats",
-        "rope-scaling": "rope_scaling",
-        "rope-theta": "rope_theta",
-        "enforce-eager": "enforce_eager",
-        "max-context-len-to-capture": "max_context_len_to_capture",
-        "max-seq-len-to-capture": "max_seq_len_to_capture",
-        "disable-custom-all-reduce": "disable_custom_all_reduce",
-        "tokenizer-pool-size": "tokenizer_pool_size",
-        "tokenizer-pool-type": "tokenizer_pool_type",
-        "tokenizer-pool-extra-config": "tokenizer_pool_extra_config",
-        "enable-lora-bias": "enable_lora_bias",
-        "lora-extra-vocab-size": "lora_extra_vocab_size",
-        "lora-rank": "lora_rank",
-        "prompt-lookback-distance": "prompt_lookback_distance",
-        "preemption-mode": "preemption_mode",
-
-        // Server configuration
-        "uvicorn-log-level": "uvicorn_log_level",
-        "response-role": "response_role",
-        "ssl-keyfile": "ssl_keyfile",
-        "ssl-certfile": "ssl_certfile",
-        "ssl-ca-certs": "ssl_ca_certs",
-        "ssl-cert-reqs": "ssl_cert_reqs",
-        "root-path": "root_path",
-        "return-tokens-as-token-ids": "return_tokens_as_token_ids",
-        "disable-frontend-multiprocessing": "disable_frontend_multiprocessing",
-        "enable-auto-tool-choice": "enable_auto_tool_choice",
-        "tool-call-parser": "tool_call_parser",
-        "tool-server": "tool_server",
-        "chat-template": "chat_template",
-        "chat-template-content-format": "chat_template_content_format",
-        "allow-credentials": "allow_credentials",
-        "allowed-origins": "allowed_origins",
-        "allowed-methods": "allowed_methods",
-        "allowed-headers": "allowed_headers",
-        "api-key": "api_key",
-        "enable-log-outputs": "enable_log_outputs",
-        "enable-token-usage": "enable_token_usage",
-        "enable-async-engine-debug": "enable_async_engine_debug",
-        "engine-use-ray": "engine_use_ray",
-        "disable-log-requests": "disable_log_requests",
-        "max-log-len": "max_log_len",
-
-        // Additional options
-        "multi-modal-config": "multi_modal_config",
-        "limit-mm-per-prompt": "limit_mm_per_prompt",
-        "enable-sleep-mode": "enable_sleep_mode",
-        "enable-chunking-request": "enable_chunking_request",
-        "compilation-config": "compilation_config",
-        "disable-sliding-window-mask": "disable_sliding_window_mask",
-        "enable-trtllm-engine-latency": "enable_trtllm_engine_latency",
-        "override-pooling-config": "override_pooling_config",
-        "override-neuron-config": "override_neuron_config",
-        "override-kv-cache-align-size": "override_kv_cache_align_size",
-    }
-
-    // Process alternative field names
-    for altName, canonicalName := range fieldMappings {
-        if value, exists := raw[altName]; exists {
-            // Use reflection to set the field value
-            v := reflect.ValueOf(o).Elem()
-            field := v.FieldByNameFunc(func(fieldName string) bool {
-                field, _ := v.Type().FieldByName(fieldName)
-                jsonTag := field.Tag.Get("json")
-                return jsonTag == canonicalName+",omitempty" || jsonTag == canonicalName
-            })
-
-            if field.IsValid() && field.CanSet() {
-                switch field.Kind() {
-                case reflect.Int:
-                    if intVal, ok := value.(float64); ok {
-                        field.SetInt(int64(intVal))
-                    } else if strVal, ok := value.(string); ok {
-                        if intVal, err := strconv.Atoi(strVal); err == nil {
-                            field.SetInt(int64(intVal))
-                        }
-                    }
-                case reflect.Float64:
-                    if floatVal, ok := value.(float64); ok {
-                        field.SetFloat(floatVal)
-                    } else if strVal, ok := value.(string); ok {
-                        if floatVal, err := strconv.ParseFloat(strVal, 64); err == nil {
-                            field.SetFloat(floatVal)
-                        }
-                    }
-                case reflect.String:
-                    if strVal, ok := value.(string); ok {
-                        field.SetString(strVal)
-                    }
-                case reflect.Bool:
-                    if boolVal, ok := value.(bool); ok {
-                        field.SetBool(boolVal)
-                    }
-                case reflect.Slice:
-                    if field.Type().Elem().Kind() == reflect.String {
-                        if strVal, ok := value.(string); ok {
-                            // Split comma-separated values
-                            values := strings.Split(strVal, ",")
-                            for i, v := range values {
-                                values[i] = strings.TrimSpace(v)
-                            }
-                            field.Set(reflect.ValueOf(values))
-                        } else if slice, ok := value.([]interface{}); ok {
-                            var strSlice []string
-                            for _, item := range slice {
-                                if str, ok := item.(string); ok {
-                                    strSlice = append(strSlice, str)
-                                }
-                            }
-                            field.Set(reflect.ValueOf(strSlice))
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    return nil
 }
 
 // BuildCommandArgs converts VllmServerOptions to command line arguments
@@ -387,11 +161,6 @@ func (o *VllmServerOptions) BuildCommandArgs() []string {
             flagName = jsonTag[:commaIndex]
         }
 
-        // Skip host and port as they are handled by llamactl
-        if flagName == "host" || flagName == "port" {
-            continue
-        }
-
         // Convert snake_case to kebab-case for CLI flags
         flagName = strings.ReplaceAll(flagName, "_", "-")
 

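Note that the last hunk above also drops the special case that skipped the host and port fields, so BuildCommandArgs is now expected to emit --host and --port like any other flag (which is what the updated vllm test below asserts). A small usage sketch with illustrative field values:

```go
package main

import (
    "fmt"

    "llamactl/pkg/backends/vllm"
)

func main() {
    // Field values here are illustrative only.
    opts := vllm.VllmServerOptions{
        Model: "microsoft/DialoGPT-medium",
        Host:  "localhost",
        Port:  8080,
    }

    // With the host/port skip removed, the generated args should include
    // --host localhost and --port 8080 alongside --model.
    fmt.Println(opts.BuildCommandArgs())
}
```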
@@ -10,12 +10,12 @@ import (
 func TestBuildCommandArgs(t *testing.T) {
     options := vllm.VllmServerOptions{
         Model: "microsoft/DialoGPT-medium",
-        Port: 8080, // should be excluded
-        Host: "localhost", // should be excluded
+        Port: 8080,
+        Host: "localhost",
         TensorParallelSize: 2,
         GPUMemoryUtilization: 0.8,
         EnableLogOutputs: true,
-        APIKey: []string{"key1", "key2"},
+        AllowedOrigins: []string{"http://localhost:3000", "https://example.com"},
     }
 
     args := options.BuildCommandArgs()
@@ -32,19 +32,22 @@ func TestBuildCommandArgs(t *testing.T) {
     }
 
     // Host and port should NOT be in the arguments (handled by llamactl)
-    if contains(args, "--host") || contains(args, "--port") {
-        t.Errorf("Host and port should not be in command args, found in %v", args)
+    if !contains(args, "--host") {
+        t.Errorf("Expected --host not found in %v", args)
+    }
+    if !contains(args, "--port") {
+        t.Errorf("Expected --port not found in %v", args)
     }
 
     // Check array handling (multiple flags)
-    apiKeyCount := 0
+    allowedOriginsCount := 0
     for i := range args {
-        if args[i] == "--api-key" {
-            apiKeyCount++
+        if args[i] == "--allowed-origins" {
+            allowedOriginsCount++
         }
     }
-    if apiKeyCount != 2 {
-        t.Errorf("Expected 2 --api-key flags, got %d", apiKeyCount)
+    if allowedOriginsCount != 2 {
+        t.Errorf("Expected 2 --allowed-origins flags, got %d", allowedOriginsCount)
     }
 }
 
@@ -77,20 +80,6 @@ func TestUnmarshalJSON(t *testing.T) {
     }
 }
 
-func TestNewVllmServerOptions(t *testing.T) {
-    options := vllm.NewVllmServerOptions()
-
-    if options == nil {
-        t.Fatal("NewVllmServerOptions returned nil")
-    }
-    if options.Host != "127.0.0.1" {
-        t.Errorf("Expected default host '127.0.0.1', got %q", options.Host)
-    }
-    if options.Port != 8000 {
-        t.Errorf("Expected default port 8000, got %d", options.Port)
-    }
-}
-
 // Helper functions
 func contains(slice []string, item string) bool {
     return slices.Contains(slice, item)