Add vLLM backend support to webui

This commit is contained in:
2025-09-21 20:58:43 +02:00
parent 7eb59aa7e0
commit b665194307
10 changed files with 545 additions and 258 deletions

View File

@@ -9,7 +9,7 @@ import {
DialogHeader, DialogHeader,
DialogTitle, DialogTitle,
} from "@/components/ui/dialog"; } from "@/components/ui/dialog";
import { type CreateInstanceOptions } from "@/types/instance"; import { BackendType, type BackendTypeValue, type CreateInstanceOptions } from "@/types/instance";
import { backendsApi } from "@/lib/api"; import { backendsApi } from "@/lib/api";
import { toast } from "sonner"; import { toast } from "sonner";
@@ -25,6 +25,7 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
onParsed, onParsed,
}) => { }) => {
const [command, setCommand] = useState(''); const [command, setCommand] = useState('');
const [backendType, setBackendType] = useState<BackendTypeValue>(BackendType.LLAMA_CPP);
const [loading, setLoading] = useState(false); const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
@@ -38,18 +39,31 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
setError(null); setError(null);
try { try {
const options = await backendsApi.llamaCpp.parseCommand(command); let options: CreateInstanceOptions;
// Parse based on selected backend type
switch (backendType) {
case BackendType.LLAMA_CPP:
options = await backendsApi.llamaCpp.parseCommand(command);
break;
case BackendType.MLX_LM:
options = await backendsApi.mlx.parseCommand(command);
break;
case BackendType.VLLM:
options = await backendsApi.vllm.parseCommand(command);
break;
default:
throw new Error(`Unsupported backend type: ${backendType}`);
}
onParsed(options); onParsed(options);
onOpenChange(false); onOpenChange(false);
// Reset form
setCommand(''); setCommand('');
setError(null); setError(null);
// Show success toast
toast.success('Command parsed successfully'); toast.success('Command parsed successfully');
} catch (err) { } catch (err) {
const errorMessage = err instanceof Error ? err.message : 'Failed to parse command'; const errorMessage = err instanceof Error ? err.message : 'Failed to parse command';
setError(errorMessage); setError(errorMessage);
// Show error toast
toast.error('Failed to parse command', { toast.error('Failed to parse command', {
description: errorMessage description: errorMessage
}); });
@@ -60,35 +74,62 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
const handleOpenChange = (open: boolean) => { const handleOpenChange = (open: boolean) => {
if (!open) { if (!open) {
// Reset form when closing
setCommand(''); setCommand('');
setBackendType(BackendType.LLAMA_CPP);
setError(null); setError(null);
} }
onOpenChange(open); onOpenChange(open);
}; };
const getPlaceholderForBackend = (backendType: BackendTypeValue): string => {
switch (backendType) {
case BackendType.LLAMA_CPP:
return "llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096";
case BackendType.MLX_LM:
return "mlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --host 0.0.0.0 --port 8080";
case BackendType.VLLM:
return "vllm serve --model microsoft/DialoGPT-medium --tensor-parallel-size 2 --gpu-memory-utilization 0.9";
default:
return "Enter your command here...";
}
};
return ( return (
<Dialog open={open} onOpenChange={handleOpenChange}> <Dialog open={open} onOpenChange={handleOpenChange}>
<DialogContent className="sm:max-w-[600px]"> <DialogContent className="sm:max-w-[600px]">
<DialogHeader> <DialogHeader>
<DialogTitle>Parse Llama Server Command</DialogTitle> <DialogTitle>Parse Backend Command</DialogTitle>
<DialogDescription> <DialogDescription>
Paste your llama-server command to automatically populate the form fields Select your backend type and paste the command to automatically populate the form fields
</DialogDescription> </DialogDescription>
</DialogHeader> </DialogHeader>
<div className="space-y-4"> <div className="space-y-4">
<div>
<Label htmlFor="backend-type">Backend Type</Label>
<select
id="backend-type"
value={backendType}
onChange={(e) => setBackendType(e.target.value as BackendTypeValue)}
className="flex h-10 w-full rounded-md border border-input bg-background px-3 py-2 text-sm ring-offset-background file:border-0 file:bg-transparent file:text-sm file:font-medium placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50"
>
<option value={BackendType.LLAMA_CPP}>Llama Server</option>
<option value={BackendType.MLX_LM}>MLX LM</option>
<option value={BackendType.VLLM}>vLLM</option>
</select>
</div>
<div> <div>
<Label htmlFor="command">Command</Label> <Label htmlFor="command">Command</Label>
<textarea <textarea
id="command" id="command"
value={command} value={command}
onChange={(e) => setCommand(e.target.value)} onChange={(e) => setCommand(e.target.value)}
placeholder="llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096" placeholder={getPlaceholderForBackend(backendType)}
className="w-full h-32 p-3 mt-2 border border-input rounded-md font-mono text-sm resize-vertical focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2" className="w-full h-32 p-3 mt-2 border border-input rounded-md font-mono text-sm resize-vertical focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2"
/> />
</div> </div>
{error && ( {error && (
<div className="text-destructive text-sm bg-destructive/10 p-3 rounded-md"> <div className="text-destructive text-sm bg-destructive/10 p-3 rounded-md">
{error} {error}

View File

@@ -39,6 +39,7 @@ const ZodFormField: React.FC<ZodFormFieldProps> = ({ fieldKey, value, onChange }
> >
<option value={BackendType.LLAMA_CPP}>Llama Server</option> <option value={BackendType.LLAMA_CPP}>Llama Server</option>
<option value={BackendType.MLX_LM}>MLX LM</option> <option value={BackendType.MLX_LM}>MLX LM</option>
<option value={BackendType.VLLM}>vLLM</option>
</select> </select>
{config.description && ( {config.description && (
<p className="text-sm text-muted-foreground">{config.description}</p> <p className="text-sm text-muted-foreground">{config.description}</p>

View File

@@ -101,6 +101,14 @@ export const backendsApi = {
body: JSON.stringify({ command }), body: JSON.stringify({ command }),
}), }),
}, },
vllm: {
// POST /backends/vllm/parse-command
parseCommand: (command: string) =>
apiCall<CreateInstanceOptions>('/backends/vllm/parse-command', {
method: 'POST',
body: JSON.stringify({ command }),
}),
},
}; };
// Instance API functions // Instance API functions

View File

@@ -1,14 +1,18 @@
import { import {
type CreateInstanceOptions, type CreateInstanceOptions,
type LlamaCppBackendOptions, type LlamaCppBackendOptions,
type MlxBackendOptions, type MlxBackendOptions,
type VllmBackendOptions,
LlamaCppBackendOptionsSchema, LlamaCppBackendOptionsSchema,
MlxBackendOptionsSchema, MlxBackendOptionsSchema,
getAllFieldKeys, VllmBackendOptionsSchema,
getAllFieldKeys,
getAllLlamaCppFieldKeys, getAllLlamaCppFieldKeys,
getAllMlxFieldKeys, getAllMlxFieldKeys,
getAllVllmFieldKeys,
getLlamaCppFieldType, getLlamaCppFieldType,
getMlxFieldType getMlxFieldType,
getVllmFieldType
} from '@/schemas/instanceOptions' } from '@/schemas/instanceOptions'
// Instance-level basic fields (not backend-specific) // Instance-level basic fields (not backend-specific)
@@ -117,6 +121,31 @@ const basicMlxFieldsConfig: Record<string, {
} }
} }
// vLLM backend-specific basic fields
const basicVllmFieldsConfig: Record<string, {
label: string
description?: string
placeholder?: string
required?: boolean
}> = {
model: {
label: 'Model',
placeholder: 'microsoft/DialoGPT-medium',
description: 'The name or path of the Hugging Face model to use',
required: true
},
tensor_parallel_size: {
label: 'Tensor Parallel Size',
placeholder: '1',
description: 'Number of GPUs to use for distributed serving'
},
gpu_memory_utilization: {
label: 'GPU Memory Utilization',
placeholder: '0.9',
description: 'The fraction of GPU memory to be used for the model executor'
}
}
function isBasicField(key: keyof CreateInstanceOptions): boolean { function isBasicField(key: keyof CreateInstanceOptions): boolean {
return key in basicFieldsConfig return key in basicFieldsConfig
} }
@@ -134,6 +163,8 @@ export function getAdvancedFields(): (keyof CreateInstanceOptions)[] {
export function getBasicBackendFields(backendType?: string): string[] { export function getBasicBackendFields(backendType?: string): string[] {
if (backendType === 'mlx_lm') { if (backendType === 'mlx_lm') {
return Object.keys(basicMlxFieldsConfig) return Object.keys(basicMlxFieldsConfig)
} else if (backendType === 'vllm') {
return Object.keys(basicVllmFieldsConfig)
} else if (backendType === 'llama_cpp') { } else if (backendType === 'llama_cpp') {
return Object.keys(basicLlamaCppFieldsConfig) return Object.keys(basicLlamaCppFieldsConfig)
} }
@@ -144,6 +175,8 @@ export function getBasicBackendFields(backendType?: string): string[] {
export function getAdvancedBackendFields(backendType?: string): string[] { export function getAdvancedBackendFields(backendType?: string): string[] {
if (backendType === 'mlx_lm') { if (backendType === 'mlx_lm') {
return getAllMlxFieldKeys().filter(key => !(key in basicMlxFieldsConfig)) return getAllMlxFieldKeys().filter(key => !(key in basicMlxFieldsConfig))
} else if (backendType === 'vllm') {
return getAllVllmFieldKeys().filter(key => !(key in basicVllmFieldsConfig))
} else if (backendType === 'llama_cpp') { } else if (backendType === 'llama_cpp') {
return getAllLlamaCppFieldKeys().filter(key => !(key in basicLlamaCppFieldsConfig)) return getAllLlamaCppFieldKeys().filter(key => !(key in basicLlamaCppFieldsConfig))
} }
@@ -159,7 +192,8 @@ export const basicBackendFieldsConfig: Record<string, {
required?: boolean required?: boolean
}> = { }> = {
...basicLlamaCppFieldsConfig, ...basicLlamaCppFieldsConfig,
...basicMlxFieldsConfig ...basicMlxFieldsConfig,
...basicVllmFieldsConfig
} }
// Get field type for any backend option (union type) // Get field type for any backend option (union type)
@@ -172,7 +206,7 @@ export function getBackendFieldType(key: string): 'text' | 'number' | 'boolean'
} catch { } catch {
// Schema might not be available // Schema might not be available
} }
// Try MLX schema // Try MLX schema
try { try {
if (MlxBackendOptionsSchema.shape && key in MlxBackendOptionsSchema.shape) { if (MlxBackendOptionsSchema.shape && key in MlxBackendOptionsSchema.shape) {
@@ -181,7 +215,16 @@ export function getBackendFieldType(key: string): 'text' | 'number' | 'boolean'
} catch { } catch {
// Schema might not be available // Schema might not be available
} }
// Try vLLM schema
try {
if (VllmBackendOptionsSchema.shape && key in VllmBackendOptionsSchema.shape) {
return getVllmFieldType(key as keyof VllmBackendOptions)
}
} catch {
// Schema might not be available
}
// Default fallback // Default fallback
return 'text' return 'text'
} }

View File

@@ -0,0 +1,4 @@
// Re-export all backend schemas from one place
export * from './llamacpp'
export * from './mlx'
export * from './vllm'

View File

@@ -0,0 +1,192 @@
import { z } from 'zod'
// Define the LlamaCpp backend options schema
export const LlamaCppBackendOptionsSchema = z.object({
// Common params
verbose_prompt: z.boolean().optional(),
threads: z.number().optional(),
threads_batch: z.number().optional(),
cpu_mask: z.string().optional(),
cpu_range: z.string().optional(),
cpu_strict: z.number().optional(),
prio: z.number().optional(),
poll: z.number().optional(),
cpu_mask_batch: z.string().optional(),
cpu_range_batch: z.string().optional(),
cpu_strict_batch: z.number().optional(),
prio_batch: z.number().optional(),
poll_batch: z.number().optional(),
ctx_size: z.number().optional(),
predict: z.number().optional(),
batch_size: z.number().optional(),
ubatch_size: z.number().optional(),
keep: z.number().optional(),
flash_attn: z.boolean().optional(),
no_perf: z.boolean().optional(),
escape: z.boolean().optional(),
no_escape: z.boolean().optional(),
rope_scaling: z.string().optional(),
rope_scale: z.number().optional(),
rope_freq_base: z.number().optional(),
rope_freq_scale: z.number().optional(),
yarn_orig_ctx: z.number().optional(),
yarn_ext_factor: z.number().optional(),
yarn_attn_factor: z.number().optional(),
yarn_beta_slow: z.number().optional(),
yarn_beta_fast: z.number().optional(),
dump_kv_cache: z.boolean().optional(),
no_kv_offload: z.boolean().optional(),
cache_type_k: z.string().optional(),
cache_type_v: z.string().optional(),
defrag_thold: z.number().optional(),
parallel: z.number().optional(),
mlock: z.boolean().optional(),
no_mmap: z.boolean().optional(),
numa: z.string().optional(),
device: z.string().optional(),
override_tensor: z.array(z.string()).optional(),
gpu_layers: z.number().optional(),
split_mode: z.string().optional(),
tensor_split: z.string().optional(),
main_gpu: z.number().optional(),
check_tensors: z.boolean().optional(),
override_kv: z.array(z.string()).optional(),
lora: z.array(z.string()).optional(),
lora_scaled: z.array(z.string()).optional(),
control_vector: z.array(z.string()).optional(),
control_vector_scaled: z.array(z.string()).optional(),
control_vector_layer_range: z.string().optional(),
model: z.string().optional(),
model_url: z.string().optional(),
hf_repo: z.string().optional(),
hf_repo_draft: z.string().optional(),
hf_file: z.string().optional(),
hf_repo_v: z.string().optional(),
hf_file_v: z.string().optional(),
hf_token: z.string().optional(),
log_disable: z.boolean().optional(),
log_file: z.string().optional(),
log_colors: z.boolean().optional(),
verbose: z.boolean().optional(),
verbosity: z.number().optional(),
log_prefix: z.boolean().optional(),
log_timestamps: z.boolean().optional(),
// Sampling params
samplers: z.string().optional(),
seed: z.number().optional(),
sampling_seq: z.string().optional(),
ignore_eos: z.boolean().optional(),
temp: z.number().optional(),
top_k: z.number().optional(),
top_p: z.number().optional(),
min_p: z.number().optional(),
xtc_probability: z.number().optional(),
xtc_threshold: z.number().optional(),
typical: z.number().optional(),
repeat_last_n: z.number().optional(),
repeat_penalty: z.number().optional(),
presence_penalty: z.number().optional(),
frequency_penalty: z.number().optional(),
dry_multiplier: z.number().optional(),
dry_base: z.number().optional(),
dry_allowed_length: z.number().optional(),
dry_penalty_last_n: z.number().optional(),
dry_sequence_breaker: z.array(z.string()).optional(),
dynatemp_range: z.number().optional(),
dynatemp_exp: z.number().optional(),
mirostat: z.number().optional(),
mirostat_lr: z.number().optional(),
mirostat_ent: z.number().optional(),
logit_bias: z.array(z.string()).optional(),
grammar: z.string().optional(),
grammar_file: z.string().optional(),
json_schema: z.string().optional(),
json_schema_file: z.string().optional(),
// Example-specific params
no_context_shift: z.boolean().optional(),
special: z.boolean().optional(),
no_warmup: z.boolean().optional(),
spm_infill: z.boolean().optional(),
pooling: z.string().optional(),
cont_batching: z.boolean().optional(),
no_cont_batching: z.boolean().optional(),
mmproj: z.string().optional(),
mmproj_url: z.string().optional(),
no_mmproj: z.boolean().optional(),
no_mmproj_offload: z.boolean().optional(),
alias: z.string().optional(),
host: z.string().optional(),
port: z.number().optional(),
path: z.string().optional(),
no_webui: z.boolean().optional(),
embedding: z.boolean().optional(),
reranking: z.boolean().optional(),
api_key: z.string().optional(),
api_key_file: z.string().optional(),
ssl_key_file: z.string().optional(),
ssl_cert_file: z.string().optional(),
chat_template_kwargs: z.string().optional(),
timeout: z.number().optional(),
threads_http: z.number().optional(),
cache_reuse: z.number().optional(),
metrics: z.boolean().optional(),
slots: z.boolean().optional(),
props: z.boolean().optional(),
no_slots: z.boolean().optional(),
slot_save_path: z.string().optional(),
jinja: z.boolean().optional(),
reasoning_format: z.string().optional(),
reasoning_budget: z.number().optional(),
chat_template: z.string().optional(),
chat_template_file: z.string().optional(),
no_prefill_assistant: z.boolean().optional(),
slot_prompt_similarity: z.number().optional(),
lora_init_without_apply: z.boolean().optional(),
draft_max: z.number().optional(),
draft_min: z.number().optional(),
draft_p_min: z.number().optional(),
ctx_size_draft: z.number().optional(),
device_draft: z.string().optional(),
gpu_layers_draft: z.number().optional(),
model_draft: z.string().optional(),
cache_type_k_draft: z.string().optional(),
cache_type_v_draft: z.string().optional(),
// Audio/TTS params
model_vocoder: z.string().optional(),
tts_use_guide_tokens: z.boolean().optional(),
// Default model params
embd_bge_small_en_default: z.boolean().optional(),
embd_e5_small_en_default: z.boolean().optional(),
embd_gte_small_default: z.boolean().optional(),
fim_qwen_1_5b_default: z.boolean().optional(),
fim_qwen_3b_default: z.boolean().optional(),
fim_qwen_7b_default: z.boolean().optional(),
fim_qwen_7b_spec: z.boolean().optional(),
fim_qwen_14b_spec: z.boolean().optional(),
})
// Infer the TypeScript type from the schema
export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>
/** Enumerate every LlamaCpp option key declared on the schema, in declaration order. */
export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
  const shape = LlamaCppBackendOptionsSchema.shape
  const keys = Object.keys(shape)
  return keys as (keyof LlamaCppBackendOptions)[]
}
// Get field type for LlamaCpp backend options
export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
const fieldSchema = LlamaCppBackendOptionsSchema.shape[key]
if (!fieldSchema) return 'text'
// Handle ZodOptional wrapper
const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
if (innerSchema instanceof z.ZodBoolean) return 'boolean'
if (innerSchema instanceof z.ZodNumber) return 'number'
if (innerSchema instanceof z.ZodArray) return 'array'
return 'text' // ZodString and others default to text
}

View File

@@ -0,0 +1,51 @@
import { z } from 'zod'
// Define the MLX backend options schema
/**
 * Zod schema for the `mlx_lm.server` options the webui exposes.
 *
 * Every field is optional: the form only submits options the user set.
 * Field names appear to mirror the server's CLI flags — confirm against
 * the backend's command parser.
 */
export const MlxBackendOptionsSchema = z.object({
  // Basic connection options
  model: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),
  // Model and adapter options
  adapter_path: z.string().optional(),
  draft_model: z.string().optional(),
  num_draft_tokens: z.number().optional(),
  trust_remote_code: z.boolean().optional(),
  // Logging and templates
  log_level: z.enum(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']).optional(),
  chat_template: z.string().optional(),
  use_default_chat_template: z.boolean().optional(),
  chat_template_args: z.string().optional(), // JSON string
  // Sampling defaults
  temp: z.number().optional(), // Note: MLX uses "temp" not "temperature"
  top_p: z.number().optional(),
  top_k: z.number().optional(),
  min_p: z.number().optional(),
  max_tokens: z.number().optional(),
})
// Infer the TypeScript type from the schema
export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>
/** Enumerate every MLX option key declared on the schema, in declaration order. */
export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
  const shape = MlxBackendOptionsSchema.shape
  return Object.keys(shape) as (keyof MlxBackendOptions)[]
}
// Get field type for MLX backend options
export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
const fieldSchema = MlxBackendOptionsSchema.shape[key]
if (!fieldSchema) return 'text'
// Handle ZodOptional wrapper
const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
if (innerSchema instanceof z.ZodBoolean) return 'boolean'
if (innerSchema instanceof z.ZodNumber) return 'number'
if (innerSchema instanceof z.ZodArray) return 'array'
if (innerSchema instanceof z.ZodEnum) return 'text' // Enum treated as text/select
return 'text' // ZodString and others default to text
}

View File

@@ -0,0 +1,150 @@
import { z } from 'zod'
// Define the vLLM backend options schema
/**
 * Zod schema for the `vllm serve` options the webui exposes.
 *
 * Every field is optional: the form only submits options the user set.
 * Field names appear to mirror vLLM's CLI flags with dashes replaced by
 * underscores (e.g. `--tensor-parallel-size` -> `tensor_parallel_size`)
 * — confirm against the backend's command parser.
 */
export const VllmBackendOptionsSchema = z.object({
  // Basic connection options (auto-assigned by llamactl)
  host: z.string().optional(),
  port: z.number().optional(),
  // Model and engine configuration
  model: z.string().optional(),
  tokenizer: z.string().optional(),
  skip_tokenizer_init: z.boolean().optional(),
  revision: z.string().optional(),
  code_revision: z.string().optional(),
  tokenizer_revision: z.string().optional(),
  tokenizer_mode: z.string().optional(),
  trust_remote_code: z.boolean().optional(),
  download_dir: z.string().optional(),
  load_format: z.string().optional(),
  config_format: z.string().optional(),
  dtype: z.string().optional(),
  kv_cache_dtype: z.string().optional(),
  quantization_param_path: z.string().optional(),
  seed: z.number().optional(),
  max_model_len: z.number().optional(),
  guided_decoding_backend: z.string().optional(),
  distributed_executor_backend: z.string().optional(),
  worker_use_ray: z.boolean().optional(),
  ray_workers_use_nsight: z.boolean().optional(),
  // Performance and serving configuration
  block_size: z.number().optional(),
  enable_prefix_caching: z.boolean().optional(),
  disable_sliding_window: z.boolean().optional(),
  use_v2_block_manager: z.boolean().optional(),
  num_lookahead_slots: z.number().optional(),
  swap_space: z.number().optional(),
  cpu_offload_gb: z.number().optional(),
  gpu_memory_utilization: z.number().optional(),
  num_gpu_blocks_override: z.number().optional(),
  max_num_batched_tokens: z.number().optional(),
  max_num_seqs: z.number().optional(),
  max_logprobs: z.number().optional(),
  disable_log_stats: z.boolean().optional(),
  quantization: z.string().optional(),
  rope_scaling: z.string().optional(),
  rope_theta: z.number().optional(),
  enforce_eager: z.boolean().optional(),
  max_context_len_to_capture: z.number().optional(),
  max_seq_len_to_capture: z.number().optional(),
  disable_custom_all_reduce: z.boolean().optional(),
  tokenizer_pool_size: z.number().optional(),
  tokenizer_pool_type: z.string().optional(),
  tokenizer_pool_extra_config: z.string().optional(),
  enable_lora_bias: z.boolean().optional(),
  lora_extra_vocab_size: z.number().optional(),
  lora_rank: z.number().optional(),
  prompt_lookback_distance: z.number().optional(),
  preemption_mode: z.string().optional(),
  // Distributed and parallel processing
  tensor_parallel_size: z.number().optional(),
  pipeline_parallel_size: z.number().optional(),
  max_parallel_loading_workers: z.number().optional(),
  disable_async_output_proc: z.boolean().optional(),
  worker_class: z.string().optional(),
  enabled_lora_modules: z.string().optional(),
  max_lora_rank: z.number().optional(),
  fully_sharded_loras: z.boolean().optional(),
  lora_modules: z.string().optional(),
  prompt_adapters: z.string().optional(),
  max_prompt_adapter_token: z.number().optional(),
  device: z.string().optional(),
  scheduler_delay: z.number().optional(),
  enable_chunked_prefill: z.boolean().optional(),
  // Speculative-decoding options
  speculative_model: z.string().optional(),
  speculative_model_quantization: z.string().optional(),
  speculative_revision: z.string().optional(),
  speculative_max_model_len: z.number().optional(),
  speculative_disable_by_batch_size: z.number().optional(),
  ngpt_speculative_length: z.number().optional(),
  speculative_disable_mqa: z.boolean().optional(),
  model_loader_extra_config: z.string().optional(),
  ignore_patterns: z.string().optional(),
  preloaded_lora_modules: z.string().optional(),
  // OpenAI server specific options
  uds: z.string().optional(),
  uvicorn_log_level: z.string().optional(),
  response_role: z.string().optional(),
  ssl_keyfile: z.string().optional(),
  ssl_certfile: z.string().optional(),
  ssl_ca_certs: z.string().optional(),
  ssl_cert_reqs: z.number().optional(),
  root_path: z.string().optional(),
  middleware: z.array(z.string()).optional(),
  return_tokens_as_token_ids: z.boolean().optional(),
  disable_frontend_multiprocessing: z.boolean().optional(),
  enable_auto_tool_choice: z.boolean().optional(),
  tool_call_parser: z.string().optional(),
  tool_server: z.string().optional(),
  chat_template: z.string().optional(),
  chat_template_content_format: z.string().optional(),
  // CORS configuration
  allow_credentials: z.boolean().optional(),
  allowed_origins: z.array(z.string()).optional(),
  allowed_methods: z.array(z.string()).optional(),
  allowed_headers: z.array(z.string()).optional(),
  api_key: z.array(z.string()).optional(),
  enable_log_outputs: z.boolean().optional(),
  enable_token_usage: z.boolean().optional(),
  enable_async_engine_debug: z.boolean().optional(),
  engine_use_ray: z.boolean().optional(),
  disable_log_requests: z.boolean().optional(),
  max_log_len: z.number().optional(),
  // Additional engine configuration
  task: z.string().optional(),
  multi_modal_config: z.string().optional(),
  limit_mm_per_prompt: z.string().optional(),
  enable_sleep_mode: z.boolean().optional(),
  enable_chunking_request: z.boolean().optional(),
  compilation_config: z.string().optional(),
  disable_sliding_window_mask: z.boolean().optional(),
  enable_trtllm_engine_latency: z.boolean().optional(),
  override_pooling_config: z.string().optional(),
  override_neuron_config: z.string().optional(),
  override_kv_cache_align_size: z.number().optional(),
})
// Infer the TypeScript type from the schema
export type VllmBackendOptions = z.infer<typeof VllmBackendOptionsSchema>
/** Enumerate every vLLM option key declared on the schema, in declaration order. */
export function getAllVllmFieldKeys(): (keyof VllmBackendOptions)[] {
  const shape = VllmBackendOptionsSchema.shape
  return Object.keys(shape) as (keyof VllmBackendOptions)[]
}
/**
 * Map a vLLM option key to the form-input kind it should render with.
 * Unknown keys fall back to a plain text input.
 */
export function getVllmFieldType(key: keyof VllmBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const entry = VllmBackendOptionsSchema.shape[key]
  if (entry === undefined) return 'text'
  // Strip the .optional() wrapper so we can inspect the underlying type
  const unwrapped = entry instanceof z.ZodOptional ? entry.unwrap() : entry
  if (unwrapped instanceof z.ZodBoolean) return 'boolean'
  if (unwrapped instanceof z.ZodNumber) return 'number'
  return unwrapped instanceof z.ZodArray ? 'array' : 'text'
}

View File

@@ -1,206 +1,27 @@
import { BackendType } from '@/types/instance' import { BackendType } from '@/types/instance'
import { z } from 'zod' import { z } from 'zod'
// Define the LlamaCpp backend options schema // Import backend schemas from separate files
export const LlamaCppBackendOptionsSchema = z.object({ import {
// Common params LlamaCppBackendOptionsSchema,
verbose_prompt: z.boolean().optional(), type LlamaCppBackendOptions,
threads: z.number().optional(), getAllLlamaCppFieldKeys,
threads_batch: z.number().optional(), getLlamaCppFieldType,
cpu_mask: z.string().optional(), MlxBackendOptionsSchema,
cpu_range: z.string().optional(), type MlxBackendOptions,
cpu_strict: z.number().optional(), getAllMlxFieldKeys,
prio: z.number().optional(), getMlxFieldType,
poll: z.number().optional(), VllmBackendOptionsSchema,
cpu_mask_batch: z.string().optional(), type VllmBackendOptions,
cpu_range_batch: z.string().optional(), getAllVllmFieldKeys,
cpu_strict_batch: z.number().optional(), getVllmFieldType
prio_batch: z.number().optional(), } from './backends'
poll_batch: z.number().optional(),
ctx_size: z.number().optional(),
predict: z.number().optional(),
batch_size: z.number().optional(),
ubatch_size: z.number().optional(),
keep: z.number().optional(),
flash_attn: z.boolean().optional(),
no_perf: z.boolean().optional(),
escape: z.boolean().optional(),
no_escape: z.boolean().optional(),
rope_scaling: z.string().optional(),
rope_scale: z.number().optional(),
rope_freq_base: z.number().optional(),
rope_freq_scale: z.number().optional(),
yarn_orig_ctx: z.number().optional(),
yarn_ext_factor: z.number().optional(),
yarn_attn_factor: z.number().optional(),
yarn_beta_slow: z.number().optional(),
yarn_beta_fast: z.number().optional(),
dump_kv_cache: z.boolean().optional(),
no_kv_offload: z.boolean().optional(),
cache_type_k: z.string().optional(),
cache_type_v: z.string().optional(),
defrag_thold: z.number().optional(),
parallel: z.number().optional(),
mlock: z.boolean().optional(),
no_mmap: z.boolean().optional(),
numa: z.string().optional(),
device: z.string().optional(),
override_tensor: z.array(z.string()).optional(),
gpu_layers: z.number().optional(),
split_mode: z.string().optional(),
tensor_split: z.string().optional(),
main_gpu: z.number().optional(),
check_tensors: z.boolean().optional(),
override_kv: z.array(z.string()).optional(),
lora: z.array(z.string()).optional(),
lora_scaled: z.array(z.string()).optional(),
control_vector: z.array(z.string()).optional(),
control_vector_scaled: z.array(z.string()).optional(),
control_vector_layer_range: z.string().optional(),
model: z.string().optional(),
model_url: z.string().optional(),
hf_repo: z.string().optional(),
hf_repo_draft: z.string().optional(),
hf_file: z.string().optional(),
hf_repo_v: z.string().optional(),
hf_file_v: z.string().optional(),
hf_token: z.string().optional(),
log_disable: z.boolean().optional(),
log_file: z.string().optional(),
log_colors: z.boolean().optional(),
verbose: z.boolean().optional(),
verbosity: z.number().optional(),
log_prefix: z.boolean().optional(),
log_timestamps: z.boolean().optional(),
// Sampling params
samplers: z.string().optional(),
seed: z.number().optional(),
sampling_seq: z.string().optional(),
ignore_eos: z.boolean().optional(),
temp: z.number().optional(),
top_k: z.number().optional(),
top_p: z.number().optional(),
min_p: z.number().optional(),
xtc_probability: z.number().optional(),
xtc_threshold: z.number().optional(),
typical: z.number().optional(),
repeat_last_n: z.number().optional(),
repeat_penalty: z.number().optional(),
presence_penalty: z.number().optional(),
frequency_penalty: z.number().optional(),
dry_multiplier: z.number().optional(),
dry_base: z.number().optional(),
dry_allowed_length: z.number().optional(),
dry_penalty_last_n: z.number().optional(),
dry_sequence_breaker: z.array(z.string()).optional(),
dynatemp_range: z.number().optional(),
dynatemp_exp: z.number().optional(),
mirostat: z.number().optional(),
mirostat_lr: z.number().optional(),
mirostat_ent: z.number().optional(),
logit_bias: z.array(z.string()).optional(),
grammar: z.string().optional(),
grammar_file: z.string().optional(),
json_schema: z.string().optional(),
json_schema_file: z.string().optional(),
// Example-specific params
no_context_shift: z.boolean().optional(),
special: z.boolean().optional(),
no_warmup: z.boolean().optional(),
spm_infill: z.boolean().optional(),
pooling: z.string().optional(),
cont_batching: z.boolean().optional(),
no_cont_batching: z.boolean().optional(),
mmproj: z.string().optional(),
mmproj_url: z.string().optional(),
no_mmproj: z.boolean().optional(),
no_mmproj_offload: z.boolean().optional(),
alias: z.string().optional(),
host: z.string().optional(),
port: z.number().optional(),
path: z.string().optional(),
no_webui: z.boolean().optional(),
embedding: z.boolean().optional(),
reranking: z.boolean().optional(),
api_key: z.string().optional(),
api_key_file: z.string().optional(),
ssl_key_file: z.string().optional(),
ssl_cert_file: z.string().optional(),
chat_template_kwargs: z.string().optional(),
timeout: z.number().optional(),
threads_http: z.number().optional(),
cache_reuse: z.number().optional(),
metrics: z.boolean().optional(),
slots: z.boolean().optional(),
props: z.boolean().optional(),
no_slots: z.boolean().optional(),
slot_save_path: z.string().optional(),
jinja: z.boolean().optional(),
reasoning_format: z.string().optional(),
reasoning_budget: z.number().optional(),
chat_template: z.string().optional(),
chat_template_file: z.string().optional(),
no_prefill_assistant: z.boolean().optional(),
slot_prompt_similarity: z.number().optional(),
lora_init_without_apply: z.boolean().optional(),
draft_max: z.number().optional(),
draft_min: z.number().optional(),
draft_p_min: z.number().optional(),
ctx_size_draft: z.number().optional(),
device_draft: z.string().optional(),
gpu_layers_draft: z.number().optional(),
model_draft: z.string().optional(),
cache_type_k_draft: z.string().optional(),
cache_type_v_draft: z.string().optional(),
// Audio/TTS params
model_vocoder: z.string().optional(),
tts_use_guide_tokens: z.boolean().optional(),
// Default model params
embd_bge_small_en_default: z.boolean().optional(),
embd_e5_small_en_default: z.boolean().optional(),
embd_gte_small_default: z.boolean().optional(),
fim_qwen_1_5b_default: z.boolean().optional(),
fim_qwen_3b_default: z.boolean().optional(),
fim_qwen_7b_default: z.boolean().optional(),
fim_qwen_7b_spec: z.boolean().optional(),
fim_qwen_14b_spec: z.boolean().optional(),
})
// Zod schema for MLX (mlx_lm) backend options.
// Every field is optional — the backend applies its own defaults for anything omitted.
// Keys are snake_case and presumably mirror the mlx_lm server CLI flags, so they must
// not be renamed (the "temp" note below shows the names follow MLX, not our conventions).
export const MlxBackendOptionsSchema = z.object({
  // Basic connection options
  model: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),
  // Model and adapter options
  adapter_path: z.string().optional(),
  draft_model: z.string().optional(), // presumably a draft model for speculative decoding — confirm against mlx_lm docs
  num_draft_tokens: z.number().optional(),
  trust_remote_code: z.boolean().optional(),
  // Logging and templates
  log_level: z.enum(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']).optional(),
  chat_template: z.string().optional(),
  use_default_chat_template: z.boolean().optional(),
  chat_template_args: z.string().optional(), // JSON string (kept as a raw string, not parsed here)
  // Sampling defaults
  temp: z.number().optional(), // Note: MLX uses "temp" not "temperature"
  top_p: z.number().optional(),
  top_k: z.number().optional(),
  min_p: z.number().optional(),
  max_tokens: z.number().optional(),
})
// Backend options union // Backend options union
export const BackendOptionsSchema = z.union([ export const BackendOptionsSchema = z.union([
LlamaCppBackendOptionsSchema, LlamaCppBackendOptionsSchema,
MlxBackendOptionsSchema, MlxBackendOptionsSchema,
VllmBackendOptionsSchema,
]) ])
// Define the main create instance options schema
@@ -213,13 +34,27 @@ export const CreateInstanceOptionsSchema = z.object({
on_demand_start: z.boolean().optional(), on_demand_start: z.boolean().optional(),
// Backend configuration // Backend configuration
backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM]).optional(), backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM, BackendType.VLLM]).optional(),
backend_options: BackendOptionsSchema.optional(), backend_options: BackendOptionsSchema.optional(),
}) })
// Re-export types and schemas from backend files, presumably so existing consumers
// can keep importing everything from this module. One schema, inferred option type,
// field-key enumerator, and field-type helper per backend (llama.cpp / MLX / vLLM).
export {
  LlamaCppBackendOptionsSchema,
  MlxBackendOptionsSchema,
  VllmBackendOptionsSchema,
  type LlamaCppBackendOptions,
  type MlxBackendOptions,
  type VllmBackendOptions,
  getAllLlamaCppFieldKeys,
  getAllMlxFieldKeys,
  getAllVllmFieldKeys,
  getLlamaCppFieldType,
  getMlxFieldType,
  getVllmFieldType
}
// Infer the TypeScript types from the schemas // Infer the TypeScript types from the schemas
export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>
export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>
export type BackendOptions = z.infer<typeof BackendOptionsSchema> export type BackendOptions = z.infer<typeof BackendOptionsSchema>
export type CreateInstanceOptions = z.infer<typeof CreateInstanceOptionsSchema> export type CreateInstanceOptions = z.infer<typeof CreateInstanceOptionsSchema>
@@ -228,56 +63,17 @@ export function getAllFieldKeys(): (keyof CreateInstanceOptions)[] {
return Object.keys(CreateInstanceOptionsSchema.shape) as (keyof CreateInstanceOptions)[] return Object.keys(CreateInstanceOptionsSchema.shape) as (keyof CreateInstanceOptions)[]
} }
// Enumerate every LlamaCpp backend option key declared on the schema.
export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
  const shapeKeys = Object.keys(LlamaCppBackendOptionsSchema.shape)
  return shapeKeys as (keyof LlamaCppBackendOptions)[]
}
// Enumerate every MLX backend option key declared on the schema.
export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
  const shapeKeys = Object.keys(MlxBackendOptionsSchema.shape)
  return shapeKeys as (keyof MlxBackendOptions)[]
}
// Get field type from Zod schema // Get field type from Zod schema
export function getFieldType(key: keyof CreateInstanceOptions): 'text' | 'number' | 'boolean' | 'array' | 'object' { export function getFieldType(key: keyof CreateInstanceOptions): 'text' | 'number' | 'boolean' | 'array' | 'object' {
const fieldSchema = CreateInstanceOptionsSchema.shape[key] const fieldSchema = CreateInstanceOptionsSchema.shape[key]
if (!fieldSchema) return 'text' if (!fieldSchema) return 'text'
// Handle ZodOptional wrapper // Handle ZodOptional wrapper
const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
if (innerSchema instanceof z.ZodBoolean) return 'boolean' if (innerSchema instanceof z.ZodBoolean) return 'boolean'
if (innerSchema instanceof z.ZodNumber) return 'number' if (innerSchema instanceof z.ZodNumber) return 'number'
if (innerSchema instanceof z.ZodArray) return 'array' if (innerSchema instanceof z.ZodArray) return 'array'
if (innerSchema instanceof z.ZodObject) return 'object' if (innerSchema instanceof z.ZodObject) return 'object'
return 'text' // ZodString and others default to text return 'text' // ZodString and others default to text
}
// Get field type for LlamaCpp backend options.
// Returns a simple kind tag for the given schema field; unknown keys fall back to 'text'.
export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const schema = LlamaCppBackendOptionsSchema.shape[key]
  if (!schema) return 'text'
  // Strip the ZodOptional wrapper before inspecting the concrete type
  const unwrapped = schema instanceof z.ZodOptional ? schema.unwrap() : schema
  if (unwrapped instanceof z.ZodArray) return 'array'
  if (unwrapped instanceof z.ZodNumber) return 'number'
  if (unwrapped instanceof z.ZodBoolean) return 'boolean'
  return 'text' // ZodString and anything else renders as text
}
// Get field type for MLX backend options
export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
const fieldSchema = MlxBackendOptionsSchema.shape[key]
if (!fieldSchema) return 'text'
// Handle ZodOptional wrapper
const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
if (innerSchema instanceof z.ZodBoolean) return 'boolean'
if (innerSchema instanceof z.ZodNumber) return 'number'
if (innerSchema instanceof z.ZodArray) return 'array'
if (innerSchema instanceof z.ZodEnum) return 'text' // Enum treated as text/select
return 'text' // ZodString and others default to text
} }

View File

@@ -5,6 +5,7 @@ export { type CreateInstanceOptions } from '@/schemas/instanceOptions'
export const BackendType = { export const BackendType = {
LLAMA_CPP: 'llama_cpp', LLAMA_CPP: 'llama_cpp',
MLX_LM: 'mlx_lm', MLX_LM: 'mlx_lm',
VLLM: 'vllm',
// MLX_VLM: 'mlx_vlm', // Future expansion // MLX_VLM: 'mlx_vlm', // Future expansion
} as const } as const