Add vLLM backend support to webui

2025-12-25 02:24:22 +00:00 · 2025-09-21 20:58:43 +02:00
parent 7eb59aa7e0
commit b665194307
10 changed files with 545 additions and 258 deletions
--- a/webui/src/schemas/backends/index.ts
+++ b/webui/src/schemas/backends/index.ts
@@ -0,0 +1,4 @@
+// Barrel file: re-export everything each backend module exports (schema, inferred type, field helpers)
+export * from './llamacpp'
+export * from './mlx'
+export * from './vllm'
--- a/webui/src/schemas/backends/llamacpp.ts
+++ b/webui/src/schemas/backends/llamacpp.ts
@@ -0,0 +1,192 @@
+import { z } from 'zod'
+
+// Zod schema for the LlamaCpp backend's options. Every field is optional, so an empty object validates.
+export const LlamaCppBackendOptionsSchema = z.object({
+  // Common params
+  verbose_prompt: z.boolean().optional(),
+  threads: z.number().optional(),
+  threads_batch: z.number().optional(),
+  cpu_mask: z.string().optional(),
+  cpu_range: z.string().optional(),
+  cpu_strict: z.number().optional(),
+  prio: z.number().optional(),
+  poll: z.number().optional(),
+  cpu_mask_batch: z.string().optional(),
+  cpu_range_batch: z.string().optional(),
+  cpu_strict_batch: z.number().optional(),
+  prio_batch: z.number().optional(),
+  poll_batch: z.number().optional(),
+  ctx_size: z.number().optional(),
+  predict: z.number().optional(),
+  batch_size: z.number().optional(),
+  ubatch_size: z.number().optional(),
+  keep: z.number().optional(),
+  flash_attn: z.boolean().optional(),
+  no_perf: z.boolean().optional(),
+  escape: z.boolean().optional(),
+  no_escape: z.boolean().optional(),
+  rope_scaling: z.string().optional(),
+  rope_scale: z.number().optional(),
+  rope_freq_base: z.number().optional(),
+  rope_freq_scale: z.number().optional(),
+  yarn_orig_ctx: z.number().optional(),
+  yarn_ext_factor: z.number().optional(),
+  yarn_attn_factor: z.number().optional(),
+  yarn_beta_slow: z.number().optional(),
+  yarn_beta_fast: z.number().optional(),
+  dump_kv_cache: z.boolean().optional(),
+  no_kv_offload: z.boolean().optional(),
+  cache_type_k: z.string().optional(),
+  cache_type_v: z.string().optional(),
+  defrag_thold: z.number().optional(),
+  parallel: z.number().optional(),
+  mlock: z.boolean().optional(),
+  no_mmap: z.boolean().optional(),
+  numa: z.string().optional(),
+  device: z.string().optional(),
+  override_tensor: z.array(z.string()).optional(),
+  gpu_layers: z.number().optional(),
+  split_mode: z.string().optional(),
+  tensor_split: z.string().optional(),
+  main_gpu: z.number().optional(),
+  check_tensors: z.boolean().optional(),
+  override_kv: z.array(z.string()).optional(),
+  lora: z.array(z.string()).optional(),
+  lora_scaled: z.array(z.string()).optional(),
+  control_vector: z.array(z.string()).optional(),
+  control_vector_scaled: z.array(z.string()).optional(),
+  control_vector_layer_range: z.string().optional(),
+  model: z.string().optional(),
+  model_url: z.string().optional(),
+  hf_repo: z.string().optional(),
+  hf_repo_draft: z.string().optional(),
+  hf_file: z.string().optional(),
+  hf_repo_v: z.string().optional(),
+  hf_file_v: z.string().optional(),
+  hf_token: z.string().optional(),
+  log_disable: z.boolean().optional(),
+  log_file: z.string().optional(),
+  log_colors: z.boolean().optional(),
+  verbose: z.boolean().optional(),
+  verbosity: z.number().optional(),
+  log_prefix: z.boolean().optional(),
+  log_timestamps: z.boolean().optional(),
+
+  // Sampling params
+  samplers: z.string().optional(),
+  seed: z.number().optional(),
+  sampling_seq: z.string().optional(),
+  ignore_eos: z.boolean().optional(),
+  temp: z.number().optional(),
+  top_k: z.number().optional(),
+  top_p: z.number().optional(),
+  min_p: z.number().optional(),
+  xtc_probability: z.number().optional(),
+  xtc_threshold: z.number().optional(),
+  typical: z.number().optional(),
+  repeat_last_n: z.number().optional(),
+  repeat_penalty: z.number().optional(),
+  presence_penalty: z.number().optional(),
+  frequency_penalty: z.number().optional(),
+  dry_multiplier: z.number().optional(),
+  dry_base: z.number().optional(),
+  dry_allowed_length: z.number().optional(),
+  dry_penalty_last_n: z.number().optional(),
+  dry_sequence_breaker: z.array(z.string()).optional(),
+  dynatemp_range: z.number().optional(),
+  dynatemp_exp: z.number().optional(),
+  mirostat: z.number().optional(),
+  mirostat_lr: z.number().optional(),
+  mirostat_ent: z.number().optional(),
+  logit_bias: z.array(z.string()).optional(),
+  grammar: z.string().optional(),
+  grammar_file: z.string().optional(),
+  json_schema: z.string().optional(),
+  json_schema_file: z.string().optional(),
+
+  // Example-specific params (this group holds the host/port, mmproj, chat-template, and draft-model fields)
+  no_context_shift: z.boolean().optional(),
+  special: z.boolean().optional(),
+  no_warmup: z.boolean().optional(),
+  spm_infill: z.boolean().optional(),
+  pooling: z.string().optional(),
+  cont_batching: z.boolean().optional(),
+  no_cont_batching: z.boolean().optional(),
+  mmproj: z.string().optional(),
+  mmproj_url: z.string().optional(),
+  no_mmproj: z.boolean().optional(),
+  no_mmproj_offload: z.boolean().optional(),
+  alias: z.string().optional(),
+  host: z.string().optional(),
+  port: z.number().optional(),
+  path: z.string().optional(),
+  no_webui: z.boolean().optional(),
+  embedding: z.boolean().optional(),
+  reranking: z.boolean().optional(),
+  api_key: z.string().optional(),
+  api_key_file: z.string().optional(),
+  ssl_key_file: z.string().optional(),
+  ssl_cert_file: z.string().optional(),
+  chat_template_kwargs: z.string().optional(),
+  timeout: z.number().optional(),
+  threads_http: z.number().optional(),
+  cache_reuse: z.number().optional(),
+  metrics: z.boolean().optional(),
+  slots: z.boolean().optional(),
+  props: z.boolean().optional(),
+  no_slots: z.boolean().optional(),
+  slot_save_path: z.string().optional(),
+  jinja: z.boolean().optional(),
+  reasoning_format: z.string().optional(),
+  reasoning_budget: z.number().optional(),
+  chat_template: z.string().optional(),
+  chat_template_file: z.string().optional(),
+  no_prefill_assistant: z.boolean().optional(),
+  slot_prompt_similarity: z.number().optional(),
+  lora_init_without_apply: z.boolean().optional(),
+  draft_max: z.number().optional(),
+  draft_min: z.number().optional(),
+  draft_p_min: z.number().optional(),
+  ctx_size_draft: z.number().optional(),
+  device_draft: z.string().optional(),
+  gpu_layers_draft: z.number().optional(),
+  model_draft: z.string().optional(),
+  cache_type_k_draft: z.string().optional(),
+  cache_type_v_draft: z.string().optional(),
+
+  // Audio/TTS params
+  model_vocoder: z.string().optional(),
+  tts_use_guide_tokens: z.boolean().optional(),
+
+  // Default model params (every field in this group is a boolean switch)
+  embd_bge_small_en_default: z.boolean().optional(),
+  embd_e5_small_en_default: z.boolean().optional(),
+  embd_gte_small_default: z.boolean().optional(),
+  fim_qwen_1_5b_default: z.boolean().optional(),
+  fim_qwen_3b_default: z.boolean().optional(),
+  fim_qwen_7b_default: z.boolean().optional(),
+  fim_qwen_7b_spec: z.boolean().optional(),
+  fim_qwen_14b_spec: z.boolean().optional(),
+})
+
+// TypeScript type inferred from LlamaCppBackendOptionsSchema with z.infer (no separately maintained interface)
+export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>
+
+// Returns every option name declared on LlamaCppBackendOptionsSchema, read from the schema's shape object
+export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
+  return Object.keys(LlamaCppBackendOptionsSchema.shape) as (keyof LlamaCppBackendOptions)[] // Object.keys yields string[]; the cast narrows it to the schema's key union
+}
+
+// Classifies a LlamaCpp option as 'text' | 'number' | 'boolean' | 'array' by inspecting the Zod class of its field schema
+export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
+  const fieldSchema = LlamaCppBackendOptionsSchema.shape[key]
+  if (!fieldSchema) return 'text' // key not present in the shape at runtime: fall back to 'text'
+
+  // Strip the ZodOptional wrapper (if any) so the instanceof checks below see the underlying schema
+  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
+
+  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
+  if (innerSchema instanceof z.ZodNumber) return 'number'
+  if (innerSchema instanceof z.ZodArray) return 'array'
+  return 'text' // ZodString and any other schema class fall back to 'text'
+}
--- a/webui/src/schemas/backends/mlx.ts
+++ b/webui/src/schemas/backends/mlx.ts
@@ -0,0 +1,51 @@
+import { z } from 'zod'
+
+// Zod schema for the MLX backend's options. Every field is optional, so an empty object validates.
+export const MlxBackendOptionsSchema = z.object({
+  // Basic connection options
+  model: z.string().optional(),
+  host: z.string().optional(),
+  port: z.number().optional(),
+
+  // Model and adapter options
+  adapter_path: z.string().optional(),
+  draft_model: z.string().optional(),
+  num_draft_tokens: z.number().optional(),
+  trust_remote_code: z.boolean().optional(),
+
+  // Logging and templates
+  log_level: z.enum(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']).optional(), // the only enum field; getMlxFieldType reports it as 'text'
+  chat_template: z.string().optional(),
+  use_default_chat_template: z.boolean().optional(),
+  chat_template_args: z.string().optional(), // JSON string
+
+  // Sampling defaults
+  temp: z.number().optional(),     // Note: MLX uses "temp" not "temperature"
+  top_p: z.number().optional(),
+  top_k: z.number().optional(),
+  min_p: z.number().optional(),
+  max_tokens: z.number().optional(),
+})
+
+// TypeScript type inferred from MlxBackendOptionsSchema with z.infer (no separately maintained interface)
+export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>
+
+// Returns every option name declared on MlxBackendOptionsSchema, read from the schema's shape object
+export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
+  return Object.keys(MlxBackendOptionsSchema.shape) as (keyof MlxBackendOptions)[] // Object.keys yields string[]; the cast narrows it to the schema's key union
+}
+
+// Classifies an MLX option as 'text' | 'number' | 'boolean' | 'array' by inspecting the Zod class of its field schema
+export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
+  const fieldSchema = MlxBackendOptionsSchema.shape[key]
+  if (!fieldSchema) return 'text' // key not present in the shape at runtime: fall back to 'text'
+
+  // Strip the ZodOptional wrapper (if any) so the instanceof checks below see the underlying schema
+  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
+
+  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
+  if (innerSchema instanceof z.ZodNumber) return 'number'
+  if (innerSchema instanceof z.ZodArray) return 'array'
+  if (innerSchema instanceof z.ZodEnum) return 'text' // Enum treated as text/select; same result as the fallback below, kept explicit
+  return 'text' // ZodString and any other schema class fall back to 'text'
+}
--- a/webui/src/schemas/backends/vllm.ts
+++ b/webui/src/schemas/backends/vllm.ts
@@ -0,0 +1,150 @@
+import { z } from 'zod'
+
+// Zod schema for the vLLM backend's options. Every field is optional, so an empty object validates.
+export const VllmBackendOptionsSchema = z.object({
+  // Basic connection options (auto-assigned by llamactl)
+  host: z.string().optional(),
+  port: z.number().optional(),
+
+  // Model and engine configuration
+  model: z.string().optional(),
+  tokenizer: z.string().optional(),
+  skip_tokenizer_init: z.boolean().optional(),
+  revision: z.string().optional(),
+  code_revision: z.string().optional(),
+  tokenizer_revision: z.string().optional(),
+  tokenizer_mode: z.string().optional(),
+  trust_remote_code: z.boolean().optional(),
+  download_dir: z.string().optional(),
+  load_format: z.string().optional(),
+  config_format: z.string().optional(),
+  dtype: z.string().optional(),
+  kv_cache_dtype: z.string().optional(),
+  quantization_param_path: z.string().optional(),
+  seed: z.number().optional(),
+  max_model_len: z.number().optional(),
+  guided_decoding_backend: z.string().optional(),
+  distributed_executor_backend: z.string().optional(),
+  worker_use_ray: z.boolean().optional(),
+  ray_workers_use_nsight: z.boolean().optional(),
+
+  // Performance and serving configuration
+  block_size: z.number().optional(),
+  enable_prefix_caching: z.boolean().optional(),
+  disable_sliding_window: z.boolean().optional(),
+  use_v2_block_manager: z.boolean().optional(),
+  num_lookahead_slots: z.number().optional(),
+  swap_space: z.number().optional(),
+  cpu_offload_gb: z.number().optional(),
+  gpu_memory_utilization: z.number().optional(),
+  num_gpu_blocks_override: z.number().optional(),
+  max_num_batched_tokens: z.number().optional(),
+  max_num_seqs: z.number().optional(),
+  max_logprobs: z.number().optional(),
+  disable_log_stats: z.boolean().optional(),
+  quantization: z.string().optional(),
+  rope_scaling: z.string().optional(),
+  rope_theta: z.number().optional(),
+  enforce_eager: z.boolean().optional(),
+  max_context_len_to_capture: z.number().optional(),
+  max_seq_len_to_capture: z.number().optional(),
+  disable_custom_all_reduce: z.boolean().optional(),
+  tokenizer_pool_size: z.number().optional(),
+  tokenizer_pool_type: z.string().optional(),
+  tokenizer_pool_extra_config: z.string().optional(),
+  enable_lora_bias: z.boolean().optional(),
+  lora_extra_vocab_size: z.number().optional(),
+  lora_rank: z.number().optional(),
+  prompt_lookback_distance: z.number().optional(),
+  preemption_mode: z.string().optional(),
+
+  // Distributed and parallel processing (this group also holds the LoRA, prompt-adapter, and speculative_* fields)
+  tensor_parallel_size: z.number().optional(),
+  pipeline_parallel_size: z.number().optional(),
+  max_parallel_loading_workers: z.number().optional(),
+  disable_async_output_proc: z.boolean().optional(),
+  worker_class: z.string().optional(),
+  enabled_lora_modules: z.string().optional(),
+  max_lora_rank: z.number().optional(),
+  fully_sharded_loras: z.boolean().optional(),
+  lora_modules: z.string().optional(),
+  prompt_adapters: z.string().optional(),
+  max_prompt_adapter_token: z.number().optional(),
+  device: z.string().optional(),
+  scheduler_delay: z.number().optional(),
+  enable_chunked_prefill: z.boolean().optional(),
+  speculative_model: z.string().optional(),
+  speculative_model_quantization: z.string().optional(),
+  speculative_revision: z.string().optional(),
+  speculative_max_model_len: z.number().optional(),
+  speculative_disable_by_batch_size: z.number().optional(),
+  ngpt_speculative_length: z.number().optional(), // NOTE(review): unusual name — confirm it matches an option the vLLM backend actually accepts
+  speculative_disable_mqa: z.boolean().optional(),
+  model_loader_extra_config: z.string().optional(),
+  ignore_patterns: z.string().optional(),
+  preloaded_lora_modules: z.string().optional(),
+
+  // OpenAI server specific options
+  uds: z.string().optional(),
+  uvicorn_log_level: z.string().optional(),
+  response_role: z.string().optional(),
+  ssl_keyfile: z.string().optional(),
+  ssl_certfile: z.string().optional(),
+  ssl_ca_certs: z.string().optional(),
+  ssl_cert_reqs: z.number().optional(),
+  root_path: z.string().optional(),
+  middleware: z.array(z.string()).optional(),
+  return_tokens_as_token_ids: z.boolean().optional(),
+  disable_frontend_multiprocessing: z.boolean().optional(),
+  enable_auto_tool_choice: z.boolean().optional(),
+  tool_call_parser: z.string().optional(),
+  tool_server: z.string().optional(),
+  chat_template: z.string().optional(),
+  chat_template_content_format: z.string().optional(),
+  allow_credentials: z.boolean().optional(),
+  allowed_origins: z.array(z.string()).optional(),
+  allowed_methods: z.array(z.string()).optional(),
+  allowed_headers: z.array(z.string()).optional(),
+  api_key: z.array(z.string()).optional(), // array of strings here; the LlamaCpp schema declares api_key as a single string
+  enable_log_outputs: z.boolean().optional(),
+  enable_token_usage: z.boolean().optional(),
+  enable_async_engine_debug: z.boolean().optional(),
+  engine_use_ray: z.boolean().optional(),
+  disable_log_requests: z.boolean().optional(),
+  max_log_len: z.number().optional(),
+
+  // Additional engine configuration
+  task: z.string().optional(),
+  multi_modal_config: z.string().optional(),
+  limit_mm_per_prompt: z.string().optional(),
+  enable_sleep_mode: z.boolean().optional(),
+  enable_chunking_request: z.boolean().optional(),
+  compilation_config: z.string().optional(),
+  disable_sliding_window_mask: z.boolean().optional(),
+  enable_trtllm_engine_latency: z.boolean().optional(),
+  override_pooling_config: z.string().optional(),
+  override_neuron_config: z.string().optional(),
+  override_kv_cache_align_size: z.number().optional(),
+})
+
+// TypeScript type inferred from VllmBackendOptionsSchema with z.infer (no separately maintained interface)
+export type VllmBackendOptions = z.infer<typeof VllmBackendOptionsSchema>
+
+// Returns every option name declared on VllmBackendOptionsSchema, read from the schema's shape object
+export function getAllVllmFieldKeys(): (keyof VllmBackendOptions)[] {
+  return Object.keys(VllmBackendOptionsSchema.shape) as (keyof VllmBackendOptions)[] // Object.keys yields string[]; the cast narrows it to the schema's key union
+}
+
+// Classifies a vLLM option as 'text' | 'number' | 'boolean' | 'array' by inspecting the Zod class of its field schema
+export function getVllmFieldType(key: keyof VllmBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
+  const fieldSchema = VllmBackendOptionsSchema.shape[key]
+  if (!fieldSchema) return 'text' // key not present in the shape at runtime: fall back to 'text'
+
+  // Strip the ZodOptional wrapper (if any) so the instanceof checks below see the underlying schema
+  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
+
+  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
+  if (innerSchema instanceof z.ZodNumber) return 'number'
+  if (innerSchema instanceof z.ZodArray) return 'array'
+  return 'text' // ZodString and any other schema class fall back to 'text'
+}
--- a/webui/src/schemas/instanceOptions.ts
+++ b/webui/src/schemas/instanceOptions.ts
@@ -1,206 +1,27 @@
 import { BackendType } from '@/types/instance'
 import { z } from 'zod'

-// Define the LlamaCpp backend options schema
-export const LlamaCppBackendOptionsSchema = z.object({
-  // Common params
-  verbose_prompt: z.boolean().optional(),
-  threads: z.number().optional(),
-  threads_batch: z.number().optional(),
-  cpu_mask: z.string().optional(),
-  cpu_range: z.string().optional(),
-  cpu_strict: z.number().optional(),
-  prio: z.number().optional(),
-  poll: z.number().optional(),
-  cpu_mask_batch: z.string().optional(),
-  cpu_range_batch: z.string().optional(),
-  cpu_strict_batch: z.number().optional(),
-  prio_batch: z.number().optional(),
-  poll_batch: z.number().optional(),
-  ctx_size: z.number().optional(),
-  predict: z.number().optional(),
-  batch_size: z.number().optional(),
-  ubatch_size: z.number().optional(),
-  keep: z.number().optional(),
-  flash_attn: z.boolean().optional(),
-  no_perf: z.boolean().optional(),
-  escape: z.boolean().optional(),
-  no_escape: z.boolean().optional(),
-  rope_scaling: z.string().optional(),
-  rope_scale: z.number().optional(),
-  rope_freq_base: z.number().optional(),
-  rope_freq_scale: z.number().optional(),
-  yarn_orig_ctx: z.number().optional(),
-  yarn_ext_factor: z.number().optional(),
-  yarn_attn_factor: z.number().optional(),
-  yarn_beta_slow: z.number().optional(),
-  yarn_beta_fast: z.number().optional(),
-  dump_kv_cache: z.boolean().optional(),
-  no_kv_offload: z.boolean().optional(),
-  cache_type_k: z.string().optional(),
-  cache_type_v: z.string().optional(),
-  defrag_thold: z.number().optional(),
-  parallel: z.number().optional(),
-  mlock: z.boolean().optional(),
-  no_mmap: z.boolean().optional(),
-  numa: z.string().optional(),
-  device: z.string().optional(),
-  override_tensor: z.array(z.string()).optional(),
-  gpu_layers: z.number().optional(),
-  split_mode: z.string().optional(),
-  tensor_split: z.string().optional(),
-  main_gpu: z.number().optional(),
-  check_tensors: z.boolean().optional(),
-  override_kv: z.array(z.string()).optional(),
-  lora: z.array(z.string()).optional(),
-  lora_scaled: z.array(z.string()).optional(),
-  control_vector: z.array(z.string()).optional(),
-  control_vector_scaled: z.array(z.string()).optional(),
-  control_vector_layer_range: z.string().optional(),
-  model: z.string().optional(),
-  model_url: z.string().optional(),
-  hf_repo: z.string().optional(),
-  hf_repo_draft: z.string().optional(),
-  hf_file: z.string().optional(),
-  hf_repo_v: z.string().optional(),
-  hf_file_v: z.string().optional(),
-  hf_token: z.string().optional(),
-  log_disable: z.boolean().optional(),
-  log_file: z.string().optional(),
-  log_colors: z.boolean().optional(),
-  verbose: z.boolean().optional(),
-  verbosity: z.number().optional(),
-  log_prefix: z.boolean().optional(),
-  log_timestamps: z.boolean().optional(),
-
-  // Sampling params
-  samplers: z.string().optional(),
-  seed: z.number().optional(),
-  sampling_seq: z.string().optional(),
-  ignore_eos: z.boolean().optional(),
-  temp: z.number().optional(),
-  top_k: z.number().optional(),
-  top_p: z.number().optional(),
-  min_p: z.number().optional(),
-  xtc_probability: z.number().optional(),
-  xtc_threshold: z.number().optional(),
-  typical: z.number().optional(),
-  repeat_last_n: z.number().optional(),
-  repeat_penalty: z.number().optional(),
-  presence_penalty: z.number().optional(),
-  frequency_penalty: z.number().optional(),
-  dry_multiplier: z.number().optional(),
-  dry_base: z.number().optional(),
-  dry_allowed_length: z.number().optional(),
-  dry_penalty_last_n: z.number().optional(),
-  dry_sequence_breaker: z.array(z.string()).optional(),
-  dynatemp_range: z.number().optional(),
-  dynatemp_exp: z.number().optional(),
-  mirostat: z.number().optional(),
-  mirostat_lr: z.number().optional(),
-  mirostat_ent: z.number().optional(),
-  logit_bias: z.array(z.string()).optional(),
-  grammar: z.string().optional(),
-  grammar_file: z.string().optional(),
-  json_schema: z.string().optional(),
-  json_schema_file: z.string().optional(),
-
-  // Example-specific params
-  no_context_shift: z.boolean().optional(),
-  special: z.boolean().optional(),
-  no_warmup: z.boolean().optional(),
-  spm_infill: z.boolean().optional(),
-  pooling: z.string().optional(),
-  cont_batching: z.boolean().optional(),
-  no_cont_batching: z.boolean().optional(),
-  mmproj: z.string().optional(),
-  mmproj_url: z.string().optional(),
-  no_mmproj: z.boolean().optional(),
-  no_mmproj_offload: z.boolean().optional(),
-  alias: z.string().optional(),
-  host: z.string().optional(),
-  port: z.number().optional(),
-  path: z.string().optional(),
-  no_webui: z.boolean().optional(),
-  embedding: z.boolean().optional(),
-  reranking: z.boolean().optional(),
-  api_key: z.string().optional(),
-  api_key_file: z.string().optional(),
-  ssl_key_file: z.string().optional(),
-  ssl_cert_file: z.string().optional(),
-  chat_template_kwargs: z.string().optional(),
-  timeout: z.number().optional(),
-  threads_http: z.number().optional(),
-  cache_reuse: z.number().optional(),
-  metrics: z.boolean().optional(),
-  slots: z.boolean().optional(),
-  props: z.boolean().optional(),
-  no_slots: z.boolean().optional(),
-  slot_save_path: z.string().optional(),
-  jinja: z.boolean().optional(),
-  reasoning_format: z.string().optional(),
-  reasoning_budget: z.number().optional(),
-  chat_template: z.string().optional(),
-  chat_template_file: z.string().optional(),
-  no_prefill_assistant: z.boolean().optional(),
-  slot_prompt_similarity: z.number().optional(),
-  lora_init_without_apply: z.boolean().optional(),
-  draft_max: z.number().optional(),
-  draft_min: z.number().optional(),
-  draft_p_min: z.number().optional(),
-  ctx_size_draft: z.number().optional(),
-  device_draft: z.string().optional(),
-  gpu_layers_draft: z.number().optional(),
-  model_draft: z.string().optional(),
-  cache_type_k_draft: z.string().optional(),
-  cache_type_v_draft: z.string().optional(),
-
-  // Audio/TTS params
-  model_vocoder: z.string().optional(),
-  tts_use_guide_tokens: z.boolean().optional(),
-
-  // Default model params
-  embd_bge_small_en_default: z.boolean().optional(),
-  embd_e5_small_en_default: z.boolean().optional(),
-  embd_gte_small_default: z.boolean().optional(),
-  fim_qwen_1_5b_default: z.boolean().optional(),
-  fim_qwen_3b_default: z.boolean().optional(),
-  fim_qwen_7b_default: z.boolean().optional(),
-  fim_qwen_7b_spec: z.boolean().optional(),
-  fim_qwen_14b_spec: z.boolean().optional(),
-})
-
-// Define the MLX backend options schema
-export const MlxBackendOptionsSchema = z.object({
-  // Basic connection options
-  model: z.string().optional(),
-  host: z.string().optional(),
-  port: z.number().optional(),
-  
-  // Model and adapter options
-  adapter_path: z.string().optional(),
-  draft_model: z.string().optional(),
-  num_draft_tokens: z.number().optional(),
-  trust_remote_code: z.boolean().optional(),
-  
-  // Logging and templates
-  log_level: z.enum(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']).optional(),
-  chat_template: z.string().optional(),
-  use_default_chat_template: z.boolean().optional(),
-  chat_template_args: z.string().optional(), // JSON string
-  
-  // Sampling defaults
-  temp: z.number().optional(),     // Note: MLX uses "temp" not "temperature"
-  top_p: z.number().optional(),
-  top_k: z.number().optional(),
-  min_p: z.number().optional(),
-  max_tokens: z.number().optional(),
-})
+// Import each backend's schema, inferred type, and field helpers from the per-backend modules
+import {
+  LlamaCppBackendOptionsSchema,
+  type LlamaCppBackendOptions,
+  getAllLlamaCppFieldKeys,
+  getLlamaCppFieldType,
+  MlxBackendOptionsSchema,
+  type MlxBackendOptions,
+  getAllMlxFieldKeys,
+  getMlxFieldType,
+  VllmBackendOptionsSchema,
+  type VllmBackendOptions,
+  getAllVllmFieldKeys,
+  getVllmFieldType
+} from './backends'

 // Backend options union
 export const BackendOptionsSchema = z.union([
  LlamaCppBackendOptionsSchema,
  MlxBackendOptionsSchema,
+  VllmBackendOptionsSchema, // NOTE(review): every member is an all-optional object and z.union keeps the first member that validates — confirm vLLM-only keys are not dropped when parsing
 ])

 // Define the main create instance options schema
@@ -213,13 +34,27 @@ export const CreateInstanceOptionsSchema = z.object({
  on_demand_start: z.boolean().optional(),

  // Backend configuration
-  backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM]).optional(),
+  backend_type: z.enum([BackendType.LLAMA_CPP, BackendType.MLX_LM, BackendType.VLLM]).optional(),
  backend_options: BackendOptionsSchema.optional(),
 })

+// Re-export the backend schemas, types, and field helpers imported above; the LlamaCpp/MLX ones used to be defined in this file
+export {
+  LlamaCppBackendOptionsSchema,
+  MlxBackendOptionsSchema,
+  VllmBackendOptionsSchema,
+  type LlamaCppBackendOptions,
+  type MlxBackendOptions,
+  type VllmBackendOptions,
+  getAllLlamaCppFieldKeys,
+  getAllMlxFieldKeys,
+  getAllVllmFieldKeys,
+  getLlamaCppFieldType,
+  getMlxFieldType,
+  getVllmFieldType
+}
+
 // Infer the TypeScript types from the schemas
-export type LlamaCppBackendOptions = z.infer<typeof LlamaCppBackendOptionsSchema>
-export type MlxBackendOptions = z.infer<typeof MlxBackendOptionsSchema>
 export type BackendOptions = z.infer<typeof BackendOptionsSchema>
 export type CreateInstanceOptions = z.infer<typeof CreateInstanceOptionsSchema>

@@ -228,56 +63,17 @@ export function getAllFieldKeys(): (keyof CreateInstanceOptions)[] {
  return Object.keys(CreateInstanceOptionsSchema.shape) as (keyof CreateInstanceOptions)[]
 }

-// Helper to get all LlamaCpp backend option field keys
-export function getAllLlamaCppFieldKeys(): (keyof LlamaCppBackendOptions)[] {
-  return Object.keys(LlamaCppBackendOptionsSchema.shape) as (keyof LlamaCppBackendOptions)[]
-}
-
-// Helper to get all MLX backend option field keys
-export function getAllMlxFieldKeys(): (keyof MlxBackendOptions)[] {
-  return Object.keys(MlxBackendOptionsSchema.shape) as (keyof MlxBackendOptions)[]
-}
-
 // Get field type from Zod schema
 export function getFieldType(key: keyof CreateInstanceOptions): 'text' | 'number' | 'boolean' | 'array' | 'object' {
  const fieldSchema = CreateInstanceOptionsSchema.shape[key]
  if (!fieldSchema) return 'text'
-  
+
  // Handle ZodOptional wrapper
  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
-  
+
  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
  if (innerSchema instanceof z.ZodNumber) return 'number'
  if (innerSchema instanceof z.ZodArray) return 'array'
  if (innerSchema instanceof z.ZodObject) return 'object'
  return 'text' // ZodString and others default to text
-}
-
-// Get field type for LlamaCpp backend options
-export function getLlamaCppFieldType(key: keyof LlamaCppBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
-  const fieldSchema = LlamaCppBackendOptionsSchema.shape[key]
-  if (!fieldSchema) return 'text'
-  
-  // Handle ZodOptional wrapper
-  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
-  
-  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
-  if (innerSchema instanceof z.ZodNumber) return 'number'
-  if (innerSchema instanceof z.ZodArray) return 'array'
-  return 'text' // ZodString and others default to text
-}
-
-// Get field type for MLX backend options
-export function getMlxFieldType(key: keyof MlxBackendOptions): 'text' | 'number' | 'boolean' | 'array' {
-  const fieldSchema = MlxBackendOptionsSchema.shape[key]
-  if (!fieldSchema) return 'text'
-  
-  // Handle ZodOptional wrapper
-  const innerSchema = fieldSchema instanceof z.ZodOptional ? fieldSchema.unwrap() : fieldSchema
-  
-  if (innerSchema instanceof z.ZodBoolean) return 'boolean'
-  if (innerSchema instanceof z.ZodNumber) return 'number'
-  if (innerSchema instanceof z.ZodArray) return 'array'
-  if (innerSchema instanceof z.ZodEnum) return 'text' // Enum treated as text/select
-  return 'text' // ZodString and others default to text
 }