instances:
  port_range: [8000, 9000]  # Port range for instances (default: [8000, 9000])
@@ -983,7 +986,7 @@
- September 18, 2025
+ September 21, 2025
diff --git a/dev/getting-started/installation/index.html b/dev/getting-started/installation/index.html
index 0cced7b..4e5053e 100644
--- a/dev/getting-started/installation/index.html
+++ b/dev/getting-started/installation/index.html
@@ -825,18 +825,30 @@
pip install mlx-lm
Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)
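If you want to sanity-check the MLX-LM installation before configuring a backend, a quick import test is enough (a minimal sketch; it assumes the pip package mlx-lm exposes the mlx_lm module, and it only works on Apple Silicon macOS):
# Verify MLX-LM is importable
python -c "import mlx_lm; print('mlx-lm installed')"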
+
For vLLM backend:
+
vLLM provides high-throughput distributed serving for LLMs. Install vLLM:
+
# Install via pip (requires Python 3.8+, GPU required)
+pip install vllm
+
+# Or in a virtual environment (recommended)
+python -m venv vllm-env
+source vllm-env/bin/activate
+pip install vllm
+
+# For production deployments, consider container-based installation
+
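As a quick check that the vLLM install succeeded, you can import the package (a minimal sketch, assuming a CUDA-capable GPU environment and that the package exposes a version attribute):
# Verify the vLLM package is importable
python -c "import vllm; print(vllm.__version__)"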
# Linux/macOS - Get latest version and download
-LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
-curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
-sudo mv llamactl /usr/local/bin/
-
-# Or download manually from:
-# https://github.com/lordmathis/llamactl/releases/latest
-
-# Windows - Download from releases page
+
# Linux/macOS - Get latest version and download
+LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
+curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
+sudo mv llamactl /usr/local/bin/
+
+# Or download manually from:
+# https://github.com/lordmathis/llamactl/releases/latest
+
+# Windows - Download from releases page
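After installing the binary, you can confirm it is on your PATH; since llamactl --help is documented in the configuration section, it makes a reasonable smoke test (a sketch, assuming /usr/local/bin is on your PATH):
# Confirm the binary is reachable and prints its usage
llamactl --help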
Requirements:
@@ -844,19 +856,19 @@
- Node.js 22 or later
- Git
If you prefer to build from source:
-
# Clone the repository
-git clone https://github.com/lordmathis/llamactl.git
-cd llamactl
-
-# Build the web UI
-cd webui && npm ci && npm run build && cd ..
-
-# Build the application
-go build -o llamactl ./cmd/server
+
# Clone the repository
+git clone https://github.com/lordmathis/llamactl.git
+cd llamactl
+
+# Build the web UI
+cd webui && npm ci && npm run build && cd ..
+
+# Build the application
+go build -o llamactl ./cmd/server
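The build places a llamactl binary in the repository root; a quick way to confirm the build worked (a sketch, assuming you run it from that directory) is:
# Run the freshly built binary and print its usage
./llamactl --help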
You can also use the official OpenAI Python client:
-
from openai import OpenAI
-
-# Point the client to your Llamactl server
-client = OpenAI(
-    base_url="http://localhost:8080/v1",
-    api_key="not-needed"  # Llamactl doesn't require API keys by default
-)
-
-# Create a chat completion
-response = client.chat.completions.create(
-    model="my-model",  # Use the name of your instance
-    messages=[
-        {"role": "user", "content": "Explain quantum computing in simple terms"}
-    ],
-    max_tokens=200,
-    temperature=0.7
-)
-
-print(response.choices[0].message.content)
+
from openai import OpenAI
+
+# Point the client to your Llamactl server
+client = OpenAI(
+    base_url="http://localhost:8080/v1",
+    api_key="not-needed"  # Llamactl doesn't require API keys by default
+)
+
+# Create a chat completion
+response = client.chat.completions.create(
+    model="my-model",  # Use the name of your instance
+    messages=[
+        {"role": "user", "content": "Explain quantum computing in simple terms"}
+    ],
+    max_tokens=200,
+    temperature=0.7
+)
+
+print(response.choices[0].message.content)
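The quick start continues with listing available models; because Llamactl exposes an OpenAI-compatible API, the standard models endpoint should report running instances. A minimal sketch, assuming the endpoint is served at /v1/models:
# List running instances in OpenAI-compatible format
curl http://localhost:8080/v1/models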
@@ -992,7 +1020,7 @@
- September 3, 2025
+ September 21, 2025
diff --git a/dev/search/search_index.json b/dev/search/search_index.json
index 4b42522..33d6a9b 100644
--- a/dev/search/search_index.json
+++ b/dev/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"
Welcome to the Llamactl documentation! Management server and proxy for multiple llama.cpp and MLX instances with OpenAI-compatible API routing.
"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"
Llamactl is designed to simplify the deployment and management of llama-server and MLX instances. It provides a modern solution for running multiple large language models with centralized management and multi-backend support.
\ud83d\ude80 Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality) \ud83d\udd17 OpenAI API Compatible: Drop-in replacement - route requests by model name \ud83c\udf4e Multi-Backend Support: Native support for both llama.cpp and MLX (Apple Silicon optimized) \ud83c\udf10 Web Dashboard: Modern React UI for visual management (unlike CLI-only tools) \ud83d\udd10 API Key Authentication: Separate keys for management vs inference access \ud83d\udcca Instance Monitoring: Health checks, auto-restart, log management \u26a1 Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits \ud83d\udca1 On-Demand Instance Start: Automatically launch instances upon receiving OpenAI-compatible API requests \ud83d\udcbe State Persistence: Ensure instances remain intact across server restarts
server:\n host: \"0.0.0.0\" # Server host to bind to (default: \"0.0.0.0\")\n port: 8080 # Server port to bind to (default: 8080)\n allowed_origins: [\"*\"] # CORS allowed origins (default: [\"*\"])\n enable_swagger: false # Enable Swagger UI (default: false)\n
Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)
auth:\n require_inference_auth: true # Require API key for OpenAI endpoints (default: true)\n inference_keys: [] # List of valid inference API keys\n require_management_auth: true # Require API key for management endpoints (default: true)\n management_keys: [] # List of valid management API keys\n
Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys
"},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"
View all available command line options:
llamactl --help\n
You can also override configuration using command line flags when starting llamactl.
Download the latest release from the GitHub releases page:
# Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
"},{"location":"getting-started/installation/#option-2-build-from-source","title":"Option 2: Build from Source","text":"
Requirements: - Go 1.24 or later - Node.js 22 or later - Git
If you prefer to build from source:
# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
"},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"
You can also manage instances via the REST API:
# List all instances\ncurl http://localhost:8080/api/instances\n\n# Create a new instance\ncurl -X POST http://localhost:8080/api/instances \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"name\": \"my-model\",\n \"model_path\": \"/path/to/model.gguf\",\n }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/instances/my-model/start\n
Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:
curl -X POST http://localhost:8080/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"my-model\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello! Can you help me write a Python function?\"\n }\n ],\n \"max_tokens\": 150,\n \"temperature\": 0.7\n }'\n
"},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"
You can also use the official OpenAI Python client:
from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n base_url=\"http://localhost:8080/v1\",\n api_key=\"not-needed\" # Llamactl doesn't require API keys by default\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n model=\"my-model\", # Use the name of your instance\n messages=[\n {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n ],\n max_tokens=200,\n temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
"},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"
Get a list of running instances (models) in OpenAI-compatible format:
The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints
"},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"
Get help text for the llama-server command.
GET /api/v1/server/help\n
Response: Plain text help output from llama-server --help
"},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"
Get version information of the llama-server binary.
GET /api/v1/server/version\n
Response: Plain text version output from llama-server --version
"},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"
List available devices for llama-server.
GET /api/v1/server/devices\n
Response: Plain text device list from llama-server --list-devices
"},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"
"},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"
Proxy HTTP requests directly to the llama-server instance.
GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n
This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.
All OpenAI-compatible inference endpoints are available:
POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n
Request Body: Standard OpenAI format with model field specifying the instance name
Example:
{\n \"model\": \"llama2-7b\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello, how are you?\"\n }\n ]\n}\n
The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.
Error Responses: - 400 Bad Request: Invalid request body or missing model name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit
"},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"
Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed
Health status badge (unknown, ready, error, failed)
Action buttons (start, stop, edit, logs, delete)
"},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
Click the \"Create Instance\" button on the dashboard
Enter a unique Name for your instance (only required field)
Choose Backend Type:
llama.cpp: For GGUF models using llama-server
MLX: For MLX-optimized models (macOS only)
Configure model source:
For llama.cpp: GGUF model path or HuggingFace repo
For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
Configure optional instance management settings:
Auto Restart: Automatically restart instance on failure
Max Restarts: Maximum number of restart attempts
Restart Delay: Delay in seconds between restart attempts
On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
Configure backend-specific options:
llama.cpp: Threads, context size, GPU layers, port, etc.
MLX: Temperature, top-p, adapter path, Python environment, etc.
"},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
curl -X POST http://localhost:8080/api/instances/{name}/start\n
"},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
curl -X POST http://localhost:8080/api/instances/{name}/stop\n
"},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
Configuration changes require restarting the instance to take effect.
"},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
# Get instance details\ncurl http://localhost:8080/api/instances/{name}/logs\n
"},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
Llamactl proxies all requests to the underlying backend instances (llama-server or MLX).
# Get instance details\ncurl http://localhost:8080/api/instances/{name}/proxy/\n
Both backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs
"},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
The health status badge is displayed on each instance card
Problem: Instance fails to start with model loading errors
Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files
# Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n
This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.
"},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"
Problem: Web UI shows CORS errors in browser console
"},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
# Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"
Welcome to the Llamactl documentation! Management server and proxy for multiple llama.cpp and MLX instances with OpenAI-compatible API routing.
"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"
Llamactl is designed to simplify the deployment and management of llama-server and MLX instances. It provides a modern solution for running multiple large language models with centralized management and multi-backend support.
\ud83d\ude80 Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality) \ud83d\udd17 OpenAI API Compatible: Drop-in replacement - route requests by model name \ud83c\udf4e Multi-Backend Support: Native support for both llama.cpp and MLX (Apple Silicon optimized) \ud83c\udf10 Web Dashboard: Modern React UI for visual management (unlike CLI-only tools) \ud83d\udd10 API Key Authentication: Separate keys for management vs inference access \ud83d\udcca Instance Monitoring: Health checks, auto-restart, log management \u26a1 Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits \ud83d\udca1 On-Demand Instance Start: Automatically launch instances upon receiving OpenAI-compatible API requests \ud83d\udcbe State Persistence: Ensure instances remain intact across server restarts
server:\n host: \"0.0.0.0\" # Server host to bind to (default: \"0.0.0.0\")\n port: 8080 # Server port to bind to (default: 8080)\n allowed_origins: [\"*\"] # CORS allowed origins (default: [\"*\"])\n enable_swagger: false # Enable Swagger UI (default: false)\n
Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)
auth:\n require_inference_auth: true # Require API key for OpenAI endpoints (default: true)\n inference_keys: [] # List of valid inference API keys\n require_management_auth: true # Require API key for management endpoints (default: true)\n management_keys: [] # List of valid management API keys\n
Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys
"},{"location":"getting-started/configuration/#command-line-options","title":"Command Line Options","text":"
View all available command line options:
llamactl --help\n
You can also override configuration using command line flags when starting llamactl.
MLX provides optimized inference on Apple Silicon. Install MLX-LM:
# Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n
Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)
For vLLM backend:
vLLM provides high-throughput distributed serving for LLMs. Install vLLM:
# Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
Download the latest release from the GitHub releases page:
# Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
"},{"location":"getting-started/installation/#option-2-build-from-source","title":"Option 2: Build from Source","text":"
Requirements: - Go 1.24 or later - Node.js 22 or later - Git
If you prefer to build from source:
# Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
"},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"
You can also manage instances via the REST API:
# List all instances\ncurl http://localhost:8080/api/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/instances/my-model \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"backend_type\": \"llama_cpp\",\n \"backend_options\": {\n \"model\": \"/path/to/model.gguf\"\n }\n }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/instances/my-model/start\n
Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:
curl -X POST http://localhost:8080/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"my-model\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello! Can you help me write a Python function?\"\n }\n ],\n \"max_tokens\": 150,\n \"temperature\": 0.7\n }'\n
"},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"
You can also use the official OpenAI Python client:
from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n base_url=\"http://localhost:8080/v1\",\n api_key=\"not-needed\" # Llamactl doesn't require API keys by default\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n model=\"my-model\", # Use the name of your instance\n messages=[\n {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n ],\n max_tokens=200,\n temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
"},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"
Get a list of running instances (models) in OpenAI-compatible format:
The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints
"},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"
Get help text for the llama-server command.
GET /api/v1/server/help\n
Response: Plain text help output from llama-server --help
"},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"
Get version information of the llama-server binary.
GET /api/v1/server/version\n
Response: Plain text version output from llama-server --version
"},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"
List available devices for llama-server.
GET /api/v1/server/devices\n
Response: Plain text device list from llama-server --list-devices
"},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"
"},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"
Proxy HTTP requests directly to the llama-server instance.
GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n
This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.
All OpenAI-compatible inference endpoints are available:
POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n
Request Body: Standard OpenAI format with model field specifying the instance name
Example:
{\n \"model\": \"llama2-7b\",\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": \"Hello, how are you?\"\n }\n ]\n}\n
The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.
Error Responses: - 400 Bad Request: Invalid request body or missing model name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit
"},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"
Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed
Health status badge (unknown, ready, error, failed)
Action buttons (start, stop, edit, logs, delete)
"},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
Click the \"Create Instance\" button on the dashboard
Enter a unique Name for your instance (only required field)
Choose Backend Type:
llama.cpp: For GGUF models using llama-server
MLX: For MLX-optimized models (macOS only)
vLLM: For distributed serving and high-throughput inference
Configure model source:
For llama.cpp: GGUF model path or HuggingFace repo
For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
For vLLM: HuggingFace model identifier (e.g., microsoft/DialoGPT-medium)
Configure optional instance management settings:
Auto Restart: Automatically restart instance on failure
Max Restarts: Maximum number of restart attempts
Restart Delay: Delay in seconds between restart attempts
On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
Configure backend-specific options:
llama.cpp: Threads, context size, GPU layers, port, etc.
MLX: Temperature, top-p, adapter path, Python environment, etc.
vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
"},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
curl -X POST http://localhost:8080/api/instances/{name}/start\n
"},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
curl -X POST http://localhost:8080/api/instances/{name}/stop\n
"},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
Configuration changes require restarting the instance to take effect.
"},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
# Get instance details\ncurl http://localhost:8080/api/instances/{name}/logs\n
"},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).
# Get instance details\ncurl http://localhost:8080/api/instances/{name}/proxy/\n
All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs
"},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
The health status badge is displayed on each instance card
Problem: Instance fails to start with model loading errors
Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files
# Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n
This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.
"},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"
Problem: Web UI shows CORS errors in browser console
"},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
# Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
@@ -1443,9 +1575,9 @@
- 503 Service Unavailable: Instance is not running and on-demand start is disabled
- 409 Conflict: Cannot start instance due to maximum instances limit
Instances can have the following status values:
-- stopped: Instance is not running
-- running: Instance is running and ready to accept requests
+
Instances can have the following status values:
+- stopped: Instance is not running
+- running: Instance is running and ready to accept requests
- failed: Instance failed to start or crashed
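To see which of these status values an instance currently reports, you can list instances via the management API and inspect the status field; a sketch, assuming the list endpoint follows the /api/v1/instances pattern used elsewhere in this reference:
# List instances and check each one's status field
curl http://localhost:8080/api/v1/instances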