Add vLLM backend support to webui

2025-12-23 01:24:24 +00:00 · 2025-09-21 20:58:43 +02:00
parent 7eb59aa7e0
commit b665194307
10 changed files with 545 additions and 258 deletions
--- a/webui/src/components/ParseCommandDialog.tsx
+++ b/webui/src/components/ParseCommandDialog.tsx
@@ -9,7 +9,7 @@ import {
  DialogHeader,
  DialogTitle,
 } from "@/components/ui/dialog";
-import { type CreateInstanceOptions } from "@/types/instance";
+import { BackendType, type BackendTypeValue, type CreateInstanceOptions } from "@/types/instance";
 import { backendsApi } from "@/lib/api";
 import { toast } from "sonner";
@@ -25,6 +25,7 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
  onParsed,
 }) => {
  const [command, setCommand] = useState('');
  const [backendType, setBackendType] = useState<BackendTypeValue>(BackendType.LLAMA_CPP);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
@@ -38,18 +39,31 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
    setError(null);
    try {
-      const options = await backendsApi.llamaCpp.parseCommand(command);
+      let options: CreateInstanceOptions;
      // Parse based on selected backend type
      switch (backendType) {
        case BackendType.LLAMA_CPP:
          options = await backendsApi.llamaCpp.parseCommand(command);
          break;
        case BackendType.MLX_LM:
          options = await backendsApi.mlx.parseCommand(command);
          break;
        case BackendType.VLLM:
          options = await backendsApi.vllm.parseCommand(command);
          break;
        default:
          throw new Error(`Unsupported backend type: ${backendType}`);
      }
      onParsed(options);
      onOpenChange(false);
      // Reset form
      setCommand('');
      setError(null);
      // Show success toast
      toast.success('Command parsed successfully');
    } catch (err) {
      const errorMessage = err instanceof Error ? err.message : 'Failed to parse command';
      setError(errorMessage);
      // Show error toast
      toast.error('Failed to parse command', {
        description: errorMessage
      });
@@ -60,31 +74,58 @@ const ParseCommandDialog: React.FC<ParseCommandDialogProps> = ({
  // Relay open/close changes to the parent, wiping local form state whenever the dialog closes
  const handleOpenChange = (open: boolean) => {
    const isClosing = !open;
    if (isClosing) {
      // Start from a clean slate the next time the dialog is shown
      setError(null);
      setBackendType(BackendType.LLAMA_CPP);
      setCommand('');
    }
    onOpenChange(open);
  };
  // Example command used as the textarea placeholder for the given backend type
  const getPlaceholderForBackend = (backendType: BackendTypeValue): string => {
    if (backendType === BackendType.LLAMA_CPP) {
      return "llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096";
    }
    if (backendType === BackendType.MLX_LM) {
      return "mlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --host 0.0.0.0 --port 8080";
    }
    if (backendType === BackendType.VLLM) {
      return "vllm serve --model microsoft/DialoGPT-medium --tensor-parallel-size 2 --gpu-memory-utilization 0.9";
    }
    // Generic hint for any value without a dedicated example
    return "Enter your command here...";
  };
  return (
    <Dialog open={open} onOpenChange={handleOpenChange}>
      <DialogContent className="sm:max-w-[600px]">
        <DialogHeader>
-          <DialogTitle>Parse Llama Server Command</DialogTitle>
+          <DialogTitle>Parse Backend Command</DialogTitle>
          <DialogDescription>
-            Paste your llama-server command to automatically populate the form fields
+            Select your backend type and paste the command to automatically populate the form fields
          </DialogDescription>
        </DialogHeader>
        <div className="space-y-4">
          <div>
            <Label htmlFor="backend-type">Backend Type</Label>
            <select
              id="backend-type"
              value={backendType}
              onChange={(e) => setBackendType(e.target.value as BackendTypeValue)}
              className="flex h-10 w-full rounded-md border border-input bg-background px-3 py-2 text-sm ring-offset-background file:border-0 file:bg-transparent file:text-sm file:font-medium placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50"
            >
              <option value={BackendType.LLAMA_CPP}>Llama Server</option>
              <option value={BackendType.MLX_LM}>MLX LM</option>
              <option value={BackendType.VLLM}>vLLM</option>
            </select>
          </div>
          <div>
            <Label htmlFor="command">Command</Label>
            <textarea
              id="command"
              value={command}
              onChange={(e) => setCommand(e.target.value)}
-              placeholder="llama-server --model /path/to/model.gguf --gpu-layers 32 --ctx-size 4096"
+              placeholder={getPlaceholderForBackend(backendType)}
              className="w-full h-32 p-3 mt-2 border border-input rounded-md font-mono text-sm resize-vertical focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2"
            />
          </div>
--- a/webui/src/components/ZodFormField.tsx
+++ b/webui/src/components/ZodFormField.tsx
@@ -39,6 +39,7 @@ const ZodFormField: React.FC<ZodFormFieldProps> = ({ fieldKey, value, onChange }
          >
            <option value={BackendType.LLAMA_CPP}>Llama Server</option>
            <option value={BackendType.MLX_LM}>MLX LM</option>
            <option value={BackendType.VLLM}>vLLM</option>
          </select>
          {config.description && (
            <p className="text-sm text-muted-foreground">{config.description}</p>
--- a/webui/src/lib/api.ts
+++ b/webui/src/lib/api.ts
@@ -101,6 +101,14 @@ export const backendsApi = {
        body: JSON.stringify({ command }),
      }),
  },
  vllm: {
    // POST /backends/vllm/parse-command
    // Sends the raw command string as `{ command }` and resolves to the parsed instance options
    parseCommand: (command: string) => {
      const payload = JSON.stringify({ command });
      return apiCall<CreateInstanceOptions>('/backends/vllm/parse-command', {
        method: 'POST',
        body: payload,
      });
    },
  },
 };
 // Instance API functions
--- a/webui/src/lib/zodFormUtils.ts
+++ b/webui/src/lib/zodFormUtils.ts
@@ -2,13 +2,17 @@ import {
  type CreateInstanceOptions,
  type LlamaCppBackendOptions,
  type MlxBackendOptions,
  type VllmBackendOptions,
  LlamaCppBackendOptionsSchema,
  MlxBackendOptionsSchema,
  VllmBackendOptionsSchema,
  getAllFieldKeys,
  getAllLlamaCppFieldKeys,
  getAllMlxFieldKeys,
  getAllVllmFieldKeys,
  getLlamaCppFieldType,
-  getMlxFieldType
+  getMlxFieldType,
  getVllmFieldType
 } from '@/schemas/instanceOptions'
 // Instance-level basic fields (not backend-specific)
@@ -117,6 +121,31 @@ const basicMlxFieldsConfig: Record<string, {
  }
 }
 // vLLM backend-specific basic fields
 //
 // Display metadata (label, description, placeholder, required flag) for the
 // vLLM options that count as "basic". The keys of this object are what
 // getBasicBackendFields('vllm') returns; every other key of the vLLM schema is
 // reported by getAdvancedBackendFields('vllm') instead.
 const basicVllmFieldsConfig: Record<string, {
  label: string
  description?: string
  placeholder?: string
  required?: boolean
 }> = {
  // Only field flagged as required
  model: {
    label: 'Model',
    placeholder: 'microsoft/DialoGPT-medium',
    description: 'The name or path of the Hugging Face model to use',
    required: true
  },
  tensor_parallel_size: {
    label: 'Tensor Parallel Size',
    placeholder: '1',
    description: 'Number of GPUs to use for distributed serving'
  },
  gpu_memory_utilization: {
    label: 'GPU Memory Utilization',
    placeholder: '0.9',
    description: 'The fraction of GPU memory to be used for the model executor'
  }
 }
 // True when the instance-level option has an entry in basicFieldsConfig
 function isBasicField(key: keyof CreateInstanceOptions): boolean {
  const listedAsBasic = key in basicFieldsConfig
  return listedAsBasic
 }
@@ -134,6 +163,8 @@ export function getAdvancedFields(): (keyof CreateInstanceOptions)[] {
 export function getBasicBackendFields(backendType?: string): string[] {
  if (backendType === 'mlx_lm') {
    return Object.keys(basicMlxFieldsConfig)
  } else if (backendType === 'vllm') {
    return Object.keys(basicVllmFieldsConfig)
  } else if (backendType === 'llama_cpp') {
    return Object.keys(basicLlamaCppFieldsConfig)
  }
@@ -144,6 +175,8 @@ export function getBasicBackendFields(backendType?: string): string[] {
 export function getAdvancedBackendFields(backendType?: string): string[] {
  if (backendType === 'mlx_lm') {
    return getAllMlxFieldKeys().filter(key => !(key in basicMlxFieldsConfig))
  } else if (backendType === 'vllm') {
    return getAllVllmFieldKeys().filter(key => !(key in basicVllmFieldsConfig))
  } else if (backendType === 'llama_cpp') {
    return getAllLlamaCppFieldKeys().filter(key => !(key in basicLlamaCppFieldsConfig))
  }
@@ -159,7 +192,8 @@ export const basicBackendFieldsConfig: Record<string, {
  required?: boolean
 }> = {
  ...basicLlamaCppFieldsConfig,
-  ...basicMlxFieldsConfig
+  ...basicMlxFieldsConfig,
  ...basicVllmFieldsConfig
 }
 // Get field type for any backend option (union type)
@@ -182,6 +216,15 @@ export function getBackendFieldType(key: string): 'text' | 'number' | 'boolean'
    // Schema might not be available
  }
  // Try vLLM schema
  try {
    if (VllmBackendOptionsSchema.shape && key in VllmBackendOptionsSchema.shape) {
      return getVllmFieldType(key as keyof VllmBackendOptions)
    }
  } catch {
    // Schema might not be available
  }
  // Default fallback
  return 'text'
 }
--- a/webui/src/schemas/backends/index.ts
+++ b/webui/src/schemas/backends/index.ts
@@ -0,0 +1,4 @@
 // Re-export all backend schemas from one place
 export * from './llamacpp'
 export * from './mlx'
 export * from './vllm'
--- a/webui/src/schemas/backends/llamacpp.ts
+++ b/webui/src/schemas/backends/llamacpp.ts
@@ -0,0 +1,192 @@
 import { z } from 'zod'
 // Define the LlamaCpp backend options schema
 //
 // Every option is declared `.optional()`. The Zod type of each field
 // (boolean / number / string / string array) is what getLlamaCppFieldType
 // inspects to classify the option, and the declaration order below is the
 // order in which getAllLlamaCppFieldKeys reports the keys.
 // NOTE(review): key names presumably mirror llama-server CLI flags in
 // snake_case — confirm against the backend before renaming anything.
 export const LlamaCppBackendOptionsSchema = z.object({
  // Common params
  verbose_prompt: z.boolean().optional(),
  threads: z.number().optional(),
  threads_batch: z.number().optional(),
  cpu_mask: z.string().optional(),
  cpu_range: z.string().optional(),
  cpu_strict: z.number().optional(),
  prio: z.number().optional(),
  poll: z.number().optional(),
  cpu_mask_batch: z.string().optional(),
  cpu_range_batch: z.string().optional(),
  cpu_strict_batch: z.number().optional(),
  prio_batch: z.number().optional(),
  poll_batch: z.number().optional(),
  ctx_size: z.number().optional(),
  predict: z.number().optional(),
  batch_size: z.number().optional(),
  ubatch_size: z.number().optional(),
  keep: z.number().optional(),
  flash_attn: z.boolean().optional(),
  no_perf: z.boolean().optional(),
  escape: z.boolean().optional(),
  no_escape: z.boolean().optional(),
  rope_scaling: z.string().optional(),
  rope_scale: z.number().optional(),
  rope_freq_base: z.number().optional(),
  rope_freq_scale: z.number().optional(),
  yarn_orig_ctx: z.number().optional(),
  yarn_ext_factor: z.number().optional(),
  yarn_attn_factor: z.number().optional(),
  yarn_beta_slow: z.number().optional(),
  yarn_beta_fast: z.number().optional(),
  dump_kv_cache: z.boolean().optional(),
  no_kv_offload: z.boolean().optional(),
  cache_type_k: z.string().optional(),
  cache_type_v: z.string().optional(),
  defrag_thold: z.number().optional(),
  parallel: z.number().optional(),
  mlock: z.boolean().optional(),
  no_mmap: z.boolean().optional(),
  numa: z.string().optional(),
  device: z.string().optional(),
  override_tensor: z.array(z.string()).optional(),
  gpu_layers: z.number().optional(),
  split_mode: z.string().optional(),
  tensor_split: z.string().optional(),
  main_gpu: z.number().optional(),
  check_tensors: z.boolean().optional(),
  override_kv: z.array(z.string()).optional(),
  lora: z.array(z.string()).optional(),
  lora_scaled: z.array(z.string()).optional(),
  control_vector: z.array(z.string()).optional(),
  control_vector_scaled: z.array(z.string()).optional(),
  control_vector_layer_range: z.string().optional(),
  model: z.string().optional(),
  model_url: z.string().optional(),
  hf_repo: z.string().optional(),
  hf_repo_draft: z.string().optional(),
  hf_file: z.string().optional(),
  hf_repo_v: z.string().optional(),
  hf_file_v: z.string().optional(),
  hf_token: z.string().optional(),
  log_disable: z.boolean().optional(),
  log_file: z.string().optional(),
  log_colors: z.boolean().optional(),
  verbose: z.boolean().optional(),
  verbosity: z.number().optional(),
  log_prefix: z.boolean().optional(),
  log_timestamps: z.boolean().optional(),
  // Sampling params
  samplers: z.string().optional(),
  seed: z.number().optional(),
  sampling_seq: z.string().optional(),
  ignore_eos: z.boolean().optional(),
  temp: z.number().optional(),
  top_k: z.number().optional(),
  top_p: z.number().optional(),
  min_p: z.number().optional(),
  xtc_probability: z.number().optional(),
  xtc_threshold: z.number().optional(),
  typical: z.number().optional(),
  repeat_last_n: z.number().optional(),
  repeat_penalty: z.number().optional(),
  presence_penalty: z.number().optional(),
  frequency_penalty: z.number().optional(),
  dry_multiplier: z.number().optional(),
  dry_base: z.number().optional(),
  dry_allowed_length: z.number().optional(),
  dry_penalty_last_n: z.number().optional(),
  dry_sequence_breaker: z.array(z.string()).optional(),
  dynatemp_range: z.number().optional(),
  dynatemp_exp: z.number().optional(),
  mirostat: z.number().optional(),
  mirostat_lr: z.number().optional(),
  mirostat_ent: z.number().optional(),
  logit_bias: z.array(z.string()).optional(),
  grammar: z.string().optional(),
  grammar_file: z.string().optional(),
  json_schema: z.string().optional(),
  json_schema_file: z.string().optional(),
  // Example-specific params
  no_context_shift: z.boolean().optional(),
  special: z.boolean().optional(),
  no_warmup: z.boolean().optional(),
  spm_infill: z.boolean().optional(),
  pooling: z.string().optional(),
  cont_batching: z.boolean().optional(),
  no_cont_batching: z.boolean().optional(),
  mmproj: z.string().optional(),
  mmproj_url: z.string().optional(),
  no_mmproj: z.boolean().optional(),
  no_mmproj_offload: z.boolean().optional(),
  alias: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),
  path: z.string().optional(),
  no_webui: z.boolean().optional(),
  embedding: z.boolean().optional(),
  reranking: z.boolean().optional(),
  api_key: z.string().optional(),
  api_key_file: z.string().optional(),
  ssl_key_file: z.string().optional(),
  ssl_cert_file: z.string().optional(),
  chat_template_kwargs: z.string().optional(),
  timeout: z.number().optional(),
  threads_http: z.number().optional(),
  cache_reuse: z.number().optional(),
  metrics: z.boolean().optional(),
  slots: z.boolean().optional(),
  props: z.boolean().optional(),
  no_slots: z.boolean().optional(),
  slot_save_path: z.string().optional(),
  jinja: z.boolean().optional(),
  reasoning_format: z.string().optional(),
  reasoning_budget: z.number().optional(),
  chat_template: z.string().optional(),
  chat_template_file: z.string().optional(),
  no_prefill_assistant: z.boolean().optional(),
  slot_prompt_similarity: z.number().optional(),
  lora_init_without_apply: z.boolean().optional(),
  draft_max: z.number().optional(),
  draft_min: z.number().optional(),
  draft_p_min: z.number().optional(),
  ctx_size_draft: z.number().optional(),
  device_draft: z.string().optional(),
  gpu_layers_draft: z.number().optional(),
  model_draft: z.string().optional(),
  cache_type_k_draft: z.string().optional(),
  cache_type_v_draft: z.string().optional(),
  // Audio/TTS params
  model_vocoder: z.string().optional(),
  tts_use_guide_tokens: z.boolean().optional(),
  // Default model params
  embd_bge_small_en_default: z.boolean().optional(),
  embd_e5_small_en_default: z.boolean().optional(),
  embd_gte_small_default: z.boolean().optional(),
  fim_qwen_1_5b_default: z.boolean().optional(),
  fim_qwen_3b_default: z.boolean().optional(),
  fim_qwen_7b_default: z.boolean().optional(),
  fim_qwen_7b_spec: z.boolean().optional(),
  fim_qwen_14b_spec: z.boolean().optional(),
 })
 // Infer the TypeScript type from the schema so the static type and the
 // runtime validator cannot drift apart
 export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>
 // List every option name declared on the LlamaCpp schema, in declaration order
 export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
  const { shape } = LlamaCppBackendOptionsSchema
  const optionNames = Object.keys(shape)
  return optionNames as (keyof LlamaCppBackendOptions)[]
 }
 // Classify a LlamaCpp option by the Zod type it is declared with
 export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const entry = LlamaCppBackendOptionsSchema.shape[key]
  if (!entry) {
    return 'text'
  }

  // Look through the ZodOptional wrapper to reach the declared type
  const unwrapped = entry instanceof z.ZodOptional ? entry.unwrap() : entry

  if (unwrapped instanceof z.ZodArray) {
    return 'array'
  }
  if (unwrapped instanceof z.ZodNumber) {
    return 'number'
  }
  if (unwrapped instanceof z.ZodBoolean) {
    return 'boolean'
  }
  // ZodString and anything else is edited as plain text
  return 'text'
 }
--- a/webui/src/schemas/backends/mlx.ts
+++ b/webui/src/schemas/backends/mlx.ts
@@ -0,0 +1,51 @@
 import { z } from 'zod'
 // Define the MLX backend options schema
 //
 // Every option is declared `.optional()`. getMlxFieldType inspects the Zod
 // type of each field to classify it, and getAllMlxFieldKeys reports the keys
 // in the declaration order used below.
 export const MlxBackendOptionsSchema = z.object({
  // Basic connection options
  model: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),
  // Model and adapter options
  adapter_path: z.string().optional(),
  draft_model: z.string().optional(),
  num_draft_tokens: z.number().optional(),
  trust_remote_code: z.boolean().optional(),
  // Logging and templates
  // log_level is the only enum field; getMlxFieldType reports it as 'text'
  log_level: z.enum(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']).optional(),
  chat_template: z.string().optional(),
  use_default_chat_template: z.boolean().optional(),
  chat_template_args: z.string().optional(), // JSON string
  // Sampling defaults
  temp: z.number().optional(),     // Note: MLX uses "temp" not "temperature"
  top_p: z.number().optional(),
  top_k: z.number().optional(),
  min_p: z.number().optional(),
  max_tokens: z.number().optional(),
 })
 // Infer the TypeScript type from the schema so the static type and the
 // runtime validator cannot drift apart
 export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>
 // List every option name declared on the MLX schema, in declaration order
 export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
  const { shape } = MlxBackendOptionsSchema
  const optionNames = Object.keys(shape)
  return optionNames as (keyof MlxBackendOptions)[]
 }
 // Classify an MLX option by the Zod type it is declared with
 export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const entry = MlxBackendOptionsSchema.shape[key]
  if (!entry) {
    return 'text'
  }

  // Look through the ZodOptional wrapper to reach the declared type
  const unwrapped = entry instanceof z.ZodOptional ? entry.unwrap() : entry

  if (unwrapped instanceof z.ZodBoolean) {
    return 'boolean'
  }
  if (unwrapped instanceof z.ZodNumber) {
    return 'number'
  }
  if (unwrapped instanceof z.ZodArray) {
    return 'array'
  }
  if (unwrapped instanceof z.ZodEnum) {
    // Enum treated as text/select
    return 'text'
  }
  // ZodString and anything else is edited as plain text
  return 'text'
 }
--- a/webui/src/schemas/backends/vllm.ts
+++ b/webui/src/schemas/backends/vllm.ts
@@ -0,0 +1,150 @@
 import { z } from 'zod'
 // Define the vLLM backend options schema
 //
 // Every option is declared `.optional()`. getVllmFieldType inspects the Zod
 // type of each field (boolean / number / string / string array) to classify
 // it, and getAllVllmFieldKeys reports the keys in the declaration order below.
 // NOTE(review): key names presumably mirror vLLM CLI / engine arguments in
 // snake_case — confirm against the backend before renaming anything.
 export const VllmBackendOptionsSchema = z.object({
  // Basic connection options (auto-assigned by llamactl)
  host: z.string().optional(),
  port: z.number().optional(),
  // Model and engine configuration
  model: z.string().optional(),
  tokenizer: z.string().optional(),
  skip_tokenizer_init: z.boolean().optional(),
  revision: z.string().optional(),
  code_revision: z.string().optional(),
  tokenizer_revision: z.string().optional(),
  tokenizer_mode: z.string().optional(),
  trust_remote_code: z.boolean().optional(),
  download_dir: z.string().optional(),
  load_format: z.string().optional(),
  config_format: z.string().optional(),
  dtype: z.string().optional(),
  kv_cache_dtype: z.string().optional(),
  quantization_param_path: z.string().optional(),
  seed: z.number().optional(),
  max_model_len: z.number().optional(),
  guided_decoding_backend: z.string().optional(),
  distributed_executor_backend: z.string().optional(),
  worker_use_ray: z.boolean().optional(),
  ray_workers_use_nsight: z.boolean().optional(),
  // Performance and serving configuration
  block_size: z.number().optional(),
  enable_prefix_caching: z.boolean().optional(),
  disable_sliding_window: z.boolean().optional(),
  use_v2_block_manager: z.boolean().optional(),
  num_lookahead_slots: z.number().optional(),
  swap_space: z.number().optional(),
  cpu_offload_gb: z.number().optional(),
  gpu_memory_utilization: z.number().optional(),
  num_gpu_blocks_override: z.number().optional(),
  max_num_batched_tokens: z.number().optional(),
  max_num_seqs: z.number().optional(),
  max_logprobs: z.number().optional(),
  disable_log_stats: z.boolean().optional(),
  quantization: z.string().optional(),
  rope_scaling: z.string().optional(),
  rope_theta: z.number().optional(),
  enforce_eager: z.boolean().optional(),
  max_context_len_to_capture: z.number().optional(),
  max_seq_len_to_capture: z.number().optional(),
  disable_custom_all_reduce: z.boolean().optional(),
  tokenizer_pool_size: z.number().optional(),
  tokenizer_pool_type: z.string().optional(),
  tokenizer_pool_extra_config: z.string().optional(),
  enable_lora_bias: z.boolean().optional(),
  lora_extra_vocab_size: z.number().optional(),
  lora_rank: z.number().optional(),
  prompt_lookback_distance: z.number().optional(),
  preemption_mode: z.string().optional(),
  // Distributed and parallel processing
  tensor_parallel_size: z.number().optional(),
  pipeline_parallel_size: z.number().optional(),
  max_parallel_loading_workers: z.number().optional(),
  disable_async_output_proc: z.boolean().optional(),
  worker_class: z.string().optional(),
  enabled_lora_modules: z.string().optional(),
  max_lora_rank: z.number().optional(),
  fully_sharded_loras: z.boolean().optional(),
  lora_modules: z.string().optional(),
  prompt_adapters: z.string().optional(),
  max_prompt_adapter_token: z.number().optional(),
  device: z.string().optional(),
  scheduler_delay: z.number().optional(),
  enable_chunked_prefill: z.boolean().optional(),
  speculative_model: z.string().optional(),
  speculative_model_quantization: z.string().optional(),
  speculative_revision: z.string().optional(),
  speculative_max_model_len: z.number().optional(),
  speculative_disable_by_batch_size: z.number().optional(),
  ngpt_speculative_length: z.number().optional(),
  speculative_disable_mqa: z.boolean().optional(),
  model_loader_extra_config: z.string().optional(),
  ignore_patterns: z.string().optional(),
  preloaded_lora_modules: z.string().optional(),
  // OpenAI server specific options
  uds: z.string().optional(),
  uvicorn_log_level: z.string().optional(),
  response_role: z.string().optional(),
  ssl_keyfile: z.string().optional(),
  ssl_certfile: z.string().optional(),
  ssl_ca_certs: z.string().optional(),
  ssl_cert_reqs: z.number().optional(),
  root_path: z.string().optional(),
  middleware: z.array(z.string()).optional(),
  return_tokens_as_token_ids: z.boolean().optional(),
  disable_frontend_multiprocessing: z.boolean().optional(),
  enable_auto_tool_choice: z.boolean().optional(),
  tool_call_parser: z.string().optional(),
  tool_server: z.string().optional(),
  chat_template: z.string().optional(),
  chat_template_content_format: z.string().optional(),
  allow_credentials: z.boolean().optional(),
  allowed_origins: z.array(z.string()).optional(),
  allowed_methods: z.array(z.string()).optional(),
  allowed_headers: z.array(z.string()).optional(),
  // Declared as a string array here, unlike the single-string api_key of the LlamaCpp schema
  api_key: z.array(z.string()).optional(),
  enable_log_outputs: z.boolean().optional(),
  enable_token_usage: z.boolean().optional(),
  enable_async_engine_debug: z.boolean().optional(),
  engine_use_ray: z.boolean().optional(),
  disable_log_requests: z.boolean().optional(),
  max_log_len: z.number().optional(),
  // Additional engine configuration
  task: z.string().optional(),
  multi_modal_config: z.string().optional(),
  limit_mm_per_prompt: z.string().optional(),
  enable_sleep_mode: z.boolean().optional(),
  enable_chunking_request: z.boolean().optional(),
  compilation_config: z.string().optional(),
  disable_sliding_window_mask: z.boolean().optional(),
  enable_trtllm_engine_latency: z.boolean().optional(),
  override_pooling_config: z.string().optional(),
  override_neuron_config: z.string().optional(),
  override_kv_cache_align_size: z.number().optional(),
 })
 // Infer the TypeScript type from the schema so the static type and the
 // runtime validator cannot drift apart
 export type VllmBackendOptions = z.infer<typeof VllmBackendOptionsSchema>
 // List every option name declared on the vLLM schema, in declaration order
 export function getAllVllmFieldKeys(): (keyof VllmBackendOptions)[] {
  const { shape } = VllmBackendOptionsSchema
  const optionNames = Object.keys(shape)
  return optionNames as (keyof VllmBackendOptions)[]
 }
 // Classify a vLLM option by the Zod type it is declared with
 export function getVllmFieldType(key: keyof VllmBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const entry = VllmBackendOptionsSchema.shape[key]
  if (!entry) {
    return 'text'
  }

  // Look through the ZodOptional wrapper to reach the declared type
  const unwrapped = entry instanceof z.ZodOptional ? entry.unwrap() : entry

  if (unwrapped instanceof z.ZodArray) {
    return 'array'
  }
  if (unwrapped instanceof z.ZodNumber) {
    return 'number'
  }
  if (unwrapped instanceof z.ZodBoolean) {
    return 'boolean'
  }
  // ZodString and anything else is edited as plain text
  return 'text'
 }
--- a/webui/src/schemas/instanceOptions.ts
+++ b/webui/src/schemas/instanceOptions.ts
@@ -1,206 +1,27 @@
 import { BackendType } from '@/types/instance'
 import { z } from 'zod'
-// Define the LlamaCpp backend options schema
+// Import backend schemas from separate files
-export const LlamaCppBackendOptionsSchema = z.object({
+import {
-  // Common params
+  LlamaCppBackendOptionsSchema,
-  verbose_prompt: z.boolean().optional(),
+  type LlamaCppBackendOptions,
-  threads: z.number().optional(),
+  getAllLlamaCppFieldKeys,
-  threads_batch: z.number().optional(),
+  getLlamaCppFieldType,
-  cpu_mask: z.string().optional(),
+  MlxBackendOptionsSchema,
-  cpu_range: z.string().optional(),
+  type MlxBackendOptions,
-  cpu_strict: z.number().optional(),
+  getAllMlxFieldKeys,
-  prio: z.number().optional(),
+  getMlxFieldType,
-  poll: z.number().optional(),
+  VllmBackendOptionsSchema,
-  cpu_mask_batch: z.string().optional(),
+  type VllmBackendOptions,
-  cpu_range_batch: z.string().optional(),
+  getAllVllmFieldKeys,
-  cpu_strict_batch: z.number().optional(),
+  getVllmFieldType
-  prio_batch: z.number().optional(),
+} from './backends'
  poll_batch: z.number().optional(),
  ctx_size: z.number().optional(),
  predict: z.number().optional(),
  batch_size: z.number().optional(),
  ubatch_size: z.number().optional(),
  keep: z.number().optional(),
  flash_attn: z.boolean().optional(),
  no_perf: z.boolean().optional(),
  escape: z.boolean().optional(),
  no_escape: z.boolean().optional(),
  rope_scaling: z.string().optional(),
  rope_scale: z.number().optional(),
  rope_freq_base: z.number().optional(),
  rope_freq_scale: z.number().optional(),
  yarn_orig_ctx: z.number().optional(),
  yarn_ext_factor: z.number().optional(),
  yarn_attn_factor: z.number().optional(),
  yarn_beta_slow: z.number().optional(),
  yarn_beta_fast: z.number().optional(),
  dump_kv_cache: z.boolean().optional(),
  no_kv_offload: z.boolean().optional(),
  cache_type_k: z.string().optional(),
  cache_type_v: z.string().optional(),
  defrag_thold: z.number().optional(),
  parallel: z.number().optional(),
  mlock: z.boolean().optional(),
  no_mmap: z.boolean().optional(),
  numa: z.string().optional(),
  device: z.string().optional(),
  override_tensor: z.array(z.string()).optional(),
  gpu_layers: z.number().optional(),
  split_mode: z.string().optional(),
  tensor_split: z.string().optional(),
  main_gpu: z.number().optional(),
  check_tensors: z.boolean().optional(),
  override_kv: z.array(z.string()).optional(),
  lora: z.array(z.string()).optional(),
  lora_scaled: z.array(z.string()).optional(),
  control_vector: z.array(z.string()).optional(),
  control_vector_scaled: z.array(z.string()).optional(),
  control_vector_layer_range: z.string().optional(),
  model: z.string().optional(),
  model_url: z.string().optional(),
  hf_repo: z.string().optional(),
  hf_repo_draft: z.string().optional(),
  hf_file: z.string().optional(),
  hf_repo_v: z.string().optional(),
  hf_file_v: z.string().optional(),
  hf_token: z.string().optional(),
  log_disable: z.boolean().optional(),
  log_file: z.string().optional(),
  log_colors: z.boolean().optional(),
  verbose: z.boolean().optional(),
  verbosity: z.number().optional(),
  log_prefix: z.boolean().optional(),
  log_timestamps: z.boolean().optional(),
  // Sampling params
  samplers: z.string().optional(),
  seed: z.number().optional(),
  sampling_seq: z.string().optional(),
  ignore_eos: z.boolean().optional(),
  temp: z.number().optional(),
  top_k: z.number().optional(),
  top_p: z.number().optional(),
  min_p: z.number().optional(),
  xtc_probability: z.number().optional(),
  xtc_threshold: z.number().optional(),
  typical: z.number().optional(),
  repeat_last_n: z.number().optional(),
  repeat_penalty: z.number().optional(),
  presence_penalty: z.number().optional(),
  frequency_penalty: z.number().optional(),
  dry_multiplier: z.number().optional(),
  dry_base: z.number().optional(),
  dry_allowed_length: z.number().optional(),
  dry_penalty_last_n: z.number().optional(),
  dry_sequence_breaker: z.array(z.string()).optional(),
  dynatemp_range: z.number().optional(),
  dynatemp_exp: z.number().optional(),
  mirostat: z.number().optional(),
  mirostat_lr: z.number().optional(),
  mirostat_ent: z.number().optional(),
  logit_bias: z.array(z.string()).optional(),
  grammar: z.string().optional(),
  grammar_file: z.string().optional(),
  json_schema: z.string().optional(),
  json_schema_file: z.string().optional(),
  // Example-specific params
  no_context_shift: z.boolean().optional(),
  special: z.boolean().optional(),
  no_warmup: z.boolean().optional(),
  spm_infill: z.boolean().optional(),
  pooling: z.string().optional(),
  cont_batching: z.boolean().optional(),
  no_cont_batching: z.boolean().optional(),
  mmproj: z.string().optional(),
  mmproj_url: z.string().optional(),
  no_mmproj: z.boolean().optional(),
  no_mmproj_offload: z.boolean().optional(),
  alias: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),
  path: z.string().optional(),
  no_webui: z.boolean().optional(),
  embedding: z.boolean().optional(),
  reranking: z.boolean().optional(),
  api_key: z.string().optional(),
  api_key_file: z.string().optional(),
  ssl_key_file: z.string().optional(),
  ssl_cert_file: z.string().optional(),
  chat_template_kwargs: z.string().optional(),
  timeout: z.number().optional(),
  threads_http: z.number().optional(),
  cache_reuse: z.number().optional(),
  metrics: z.boolean().optional(),
  slots: z.boolean().optional(),
  props: z.boolean().optional(),
  no_slots: z.boolean().optional(),
  slot_save_path: z.string().optional(),
  jinja: z.boolean().optional(),
  reasoning_format: z.string().optional(),
  reasoning_budget: z.number().optional(),
  chat_template: z.string().optional(),
  chat_template_file: z.string().optional(),
  no_prefill_assistant: z.boolean().optional(),
  slot_prompt_similarity: z.number().optional(),
  lora_init_without_apply: z.boolean().optional(),
  draft_max: z.number().optional(),
  draft_min: z.number().optional(),
  draft_p_min: z.number().optional(),
  ctx_size_draft: z.number().optional(),
  device_draft: z.string().optional(),
  gpu_layers_draft: z.number().optional(),
  model_draft: z.string().optional(),
  cache_type_k_draft: z.string().optional(),
  cache_type_v_draft: z.string().optional(),
  // Audio/TTS params
  model_vocoder: z.string().optional(),
  tts_use_guide_tokens: z.boolean().optional(),
  // Default model params
  embd_bge_small_en_default: z.boolean().optional(),
  embd_e5_small_en_default: z.boolean().optional(),
  embd_gte_small_default: z.boolean().optional(),
  fim_qwen_1_5b_default: z.boolean().optional(),
  fim_qwen_3b_default: z.boolean().optional(),
  fim_qwen_7b_default: z.boolean().optional(),
  fim_qwen_7b_spec: z.boolean().optional(),
  fim_qwen_14b_spec: z.boolean().optional(),
 })
 // Define the MLX backend options schema
 export const MlxBackendOptionsSchema = z.object({
  // Basic connection options
  model: z.string().optional(),
  host: z.string().optional(),
  port: z.number().optional(),
  // Model and adapter options
  adapter_path: z.string().optional(),
  draft_model: z.string().optional(),
  num_draft_tokens: z.number().optional(),
  trust_remote_code: z.boolean().optional(),
  // Logging and templates
  log_level: z.enum(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']).optional(),
  chat_template: z.string().optional(),
  use_default_chat_template: z.boolean().optional(),
  chat_template_args: z.string().optional(), // JSON string
  // Sampling defaults
  temp: z.number().optional(),     // Note: MLX uses "temp" not "temperature"
  top_p: z.number().optional(),
  top_k: z.number().optional(),
  min_p: z.number().optional(),
  max_tokens: z.number().optional(),
 })
 // Backend options union
 export const BackendOptionsSchema = z.union([
  LlamaCppBackendOptionsSchema,
  MlxBackendOptionsSchema,
  VllmBackendOptionsSchema,
 ])
 // Define the main create instance options schema
@@ -213,13 +34,27 @@ export const CreateInstanceOptionsSchema = z.object({
  on_demand_start: z.boolean().optional(),
  // Backend configuration
-  backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM]).optional(),
+  backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM, BackendType.VLLM]).optional(),
  backend_options: BackendOptionsSchema.optional(),
 })
 // Re-export types and schemas from backend files.
 // For each backend (llama.cpp, MLX, vLLM) this exposes the options schema,
 // the options type, the "get all field keys" helper and the "get field type"
 // helper, so they can be imported from this module.
 export {
  LlamaCppBackendOptionsSchema,
  MlxBackendOptionsSchema,
  VllmBackendOptionsSchema,
  type LlamaCppBackendOptions,
  type MlxBackendOptions,
  type VllmBackendOptions,
  getAllLlamaCppFieldKeys,
  getAllMlxFieldKeys,
  getAllVllmFieldKeys,
  getLlamaCppFieldType,
  getMlxFieldType,
  getVllmFieldType
 }
 // Infer the TypeScript types from the schemas.
 // Each alias is derived from its schema with `z.infer`, so the static type is
 // not declared separately from the runtime validator.
 export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>
 export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>
 export type BackendOptions = z.infer<typeof BackendOptionsSchema>
 export type CreateInstanceOptions = z.infer<typeof CreateInstanceOptionsSchema>
@@ -228,16 +63,6 @@ export function getAllFieldKeys(): (keyof CreateInstanceOptions)[] {
  return Object.keys(CreateInstanceOptionsSchema.shape) as (keyof CreateInstanceOptions)[]
 }
 // Lists the name of every option declared on the LlamaCpp backend schema.
 // Returns the keys of `LlamaCppBackendOptionsSchema.shape` as reported by
 // `Object.keys`, typed as keys of `LlamaCppBackendOptions`.
 export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
  const { shape } = LlamaCppBackendOptionsSchema
  const fieldNames = Object.keys(shape)
  return fieldNames as (keyof LlamaCppBackendOptions)[]
 }
 // Lists the name of every option declared on the MLX backend schema.
 // Returns the keys of `MlxBackendOptionsSchema.shape` as reported by
 // `Object.keys`, typed as keys of `MlxBackendOptions`.
 export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
  const { shape } = MlxBackendOptionsSchema
  const fieldNames = Object.keys(shape)
  return fieldNames as (keyof MlxBackendOptions)[]
 }
 // Get field type from Zod schema
 export function getFieldType(key: keyof CreateInstanceOptions): 'text' | 'number' | 'boolean' | 'array' | 'object' {
  const fieldSchema = CreateInstanceOptionsSchema.shape[key]
@@ -252,32 +77,3 @@ export function getFieldType(key: keyof CreateInstanceOptions): 'text' | 'number
  if (innerSchema instanceof z.ZodObject) return 'object'
  return 'text' // ZodString and others default to text
 }
 // Maps a LlamaCpp backend option to the kind of form input it needs.
 // Looks the key up in `LlamaCppBackendOptionsSchema.shape`, strips a
 // ZodOptional wrapper if present, and classifies the remaining schema.
 // Unknown keys and every schema that is not boolean/number/array map to 'text'.
 export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const declared = LlamaCppBackendOptionsSchema.shape[key]
  if (!declared) {
   return 'text'
  }
  // Optional fields wrap the real schema; classify what is inside the wrapper
  const unwrapped = declared instanceof z.ZodOptional ? declared.unwrap() : declared
  switch (true) {
   case unwrapped instanceof z.ZodBoolean:
    return 'boolean'
   case unwrapped instanceof z.ZodNumber:
    return 'number'
   case unwrapped instanceof z.ZodArray:
    return 'array'
   default:
    // ZodString and others default to text
    return 'text'
  }
 }
 // Maps an MLX backend option to the kind of form input it needs.
 // Looks the key up in `MlxBackendOptionsSchema.shape`, strips a ZodOptional
 // wrapper if present, and classifies the remaining schema.
 // Unknown keys, enums and every schema that is not boolean/number/array map to 'text'.
 export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
  const declared = MlxBackendOptionsSchema.shape[key]
  if (!declared) {
   return 'text'
  }
  // Optional fields wrap the real schema; classify what is inside the wrapper
  const unwrapped = declared instanceof z.ZodOptional ? declared.unwrap() : declared
  switch (true) {
   case unwrapped instanceof z.ZodBoolean:
    return 'boolean'
   case unwrapped instanceof z.ZodNumber:
    return 'number'
   case unwrapped instanceof z.ZodArray:
    return 'array'
   case unwrapped instanceof z.ZodEnum: // Enum treated as text/select
   default:
    // ZodString and others default to text
    return 'text'
  }
 }
--- a/webui/src/types/instance.ts
+++ b/webui/src/types/instance.ts
@@ -5,6 +5,7 @@ export { type CreateInstanceOptions } from '@/schemas/instanceOptions'
 // Identifiers of the supported backend types, keyed by a constant-style name.
 // `as const` keeps each value as its string literal type instead of widening
 // it to `string`.
 export const BackendType = {
  LLAMA_CPP: 'llama_cpp',
  MLX_LM: 'mlx_lm',
  VLLM: 'vllm',
  // MLX_VLM: 'mlx_vlm',  // Future expansion
 } as const