From 43ceed2d714dcc0425778593464486e7bdc165c8 Mon Sep 17 00:00:00 2001
From: lordmathis
Date: Thu, 9 Oct 2025 21:28:27 +0000
Subject: [PATCH] Deployed cf20f30 to dev with MkDocs 1.5.3 and mike 2.0.0

---
 .../fix_line_endings.cpython-311.pyc          | Bin 0 -> 2109 bytes
 dev/__pycache__/readme_sync.cpython-311.pyc   | Bin 3201 -> 3201 bytes
 dev/fix_line_endings.py                       |  60 ++++++
 dev/getting-started/configuration/index.html  | 118 +++++------
 dev/getting-started/installation/index.html   |  68 +++---
 dev/getting-started/quick-start/index.html    |  62 +++---
 dev/index.html                                |  24 +--
 dev/search/search_index.json                  |   2 +-
 dev/sitemap.xml.gz                            | Bin 291 -> 291 bytes
 dev/user-guide/api-reference/index.html       | 200 +++++++++---------
 dev/user-guide/managing-instances/index.html  | 126 +++++------
 dev/user-guide/troubleshooting/index.html     |  64 +++---
 12 files changed, 392 insertions(+), 332 deletions(-)
 create mode 100644 dev/__pycache__/fix_line_endings.cpython-311.pyc
 create mode 100644 dev/fix_line_endings.py

diff --git a/dev/__pycache__/fix_line_endings.cpython-311.pyc b/dev/__pycache__/fix_line_endings.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bfd59fbada0b13e9720ec79ceea6e8dc291f9137
GIT binary patch
literal 2109
[base85-encoded binary data omitted]

diff --git a/dev/fix_line_endings.py b/dev/fix_line_endings.py
new file mode 100644
[... earlier lines of the new file omitted ...]
+        # - Table rows (|)
+        # - Lines already ending with two spaces
+        # - YAML front matter and HTML tags
+        # - Standalone punctuation lines
+        if (stripped.startswith('#') or
+            stripped.startswith('>') or
+            '|' in stripped or
+            line.endswith('  ') or
+            stripped.startswith('---') or
+            stripped.startswith('<') or
+            stripped.endswith('>') or
+            stripped in ('.', '!', '?', ':', ';', '```', '---', ',')):
+            processed_lines.append(line)
+            continue
+
+        # Add two spaces to lines that end with regular text or most punctuation
+        if stripped and not in_code_block:
+            processed_lines.append(line.rstrip() + '  ')
+        else:
+            processed_lines.append(line)
+
+    return '\n'.join(processed_lines)
\ No newline at end of file
diff --git a/dev/getting-started/configuration/index.html b/dev/getting-started/configuration/index.html
index 94adbb7..2e50cce 100644
--- a/dev/getting-started/configuration/index.html
+++ b/dev/getting-started/configuration/index.html
@@ -838,12 +838,12 @@

Configuration

-

llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

+

llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

Defaults < Configuration file < Environment variables
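
For example, a value set in the config file is still overridden by the corresponding environment variable at startup (a minimal sketch using the documented LLAMACTL_PORT variable; the port values are illustrative):

LLAMACTL_PORT=9090 llamactl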
 
-

llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

+

llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

Default Configuration

-

Here's the default configuration with all available options:

+

Here's the default configuration with all available options:

server:
   host: "0.0.0.0"                # Server host to bind to
   port: 8080                     # Server port to bind to
@@ -908,7 +908,7 @@
 

Configuration Files

Configuration File Locations

-

Configuration files are searched in the following locations (in order of precedence):

+

Configuration files are searched in the following locations (in order of precedence):

Linux:
- ./llamactl.yaml or ./config.yaml (current directory)
- $HOME/.config/llamactl/config.yaml
@@ -922,7 +922,7 @@
- %APPDATA%\llamactl\config.yaml
- %USERPROFILE%\llamactl\config.yaml
- %PROGRAMDATA%\llamactl\config.yaml

-

You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.

+

You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.
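
For example (the file location below is illustrative):

export LLAMACTL_CONFIG_PATH=/opt/llamactl/config.yaml
llamactl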

Configuration Options

Server Configuration

server:
@@ -932,11 +932,11 @@
   allowed_headers: ["*"]  # CORS allowed headers (default: ["*"])
   enable_swagger: false   # Enable Swagger UI (default: false)
 
-

Environment Variables: -- LLAMACTL_HOST - Server host -- LLAMACTL_PORT - Server port -- LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins -- LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

+

Environment Variables:
+- LLAMACTL_HOST - Server host
+- LLAMACTL_PORT - Server port
+- LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins
+- LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

Backend Configuration

backends:
   llama-cpp:
@@ -968,43 +968,43 @@
     # MLX does not support Docker
     response_headers: {}         # Additional response headers to send with responses
 
-

Backend Configuration Fields: -- command: Executable name/path for the backend -- args: Default arguments prepended to all instances -- environment: Environment variables for the backend process (optional) -- response_headers: Additional response headers to send with responses (optional) -- docker: Docker-specific configuration (optional) - - enabled: Boolean flag to enable Docker runtime - - image: Docker image to use - - args: Additional arguments passed to docker run - - environment: Environment variables for the container (optional)

+

Backend Configuration Fields:
+- command: Executable name/path for the backend
+- args: Default arguments prepended to all instances
+- environment: Environment variables for the backend process (optional)
+- response_headers: Additional response headers to send with responses (optional)
+- docker: Docker-specific configuration (optional)
+ - enabled: Boolean flag to enable Docker runtime
+ - image: Docker image to use
+ - args: Additional arguments passed to docker run
+ - environment: Environment variables for the container (optional)

If llamactl is behind an NGINX proxy, X-Accel-Buffering: no response header may be required for NGINX to properly stream the responses without buffering.
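
As a sketch, that header can be supplied through the response-header environment variables documented below (shown here for the llama.cpp backend):

export LLAMACTL_LLAMACPP_RESPONSE_HEADERS="X-Accel-Buffering=no"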

-

Environment Variables:

-

LlamaCpp Backend: -- LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command -- LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments -- LLAMACTL_LLAMACPP_ENV - Environment variables in format "KEY1=value1,KEY2=value2" -- LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false) -- LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use -- LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments -- LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2" -- LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"

-

VLLM Backend: -- LLAMACTL_VLLM_COMMAND - VLLM executable command -- LLAMACTL_VLLM_ARGS - Space-separated default arguments -- LLAMACTL_VLLM_ENV - Environment variables in format "KEY1=value1,KEY2=value2" -- LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false) -- LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use -- LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments -- LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2" -- LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"

-

MLX Backend: -- LLAMACTL_MLX_COMMAND - MLX executable command -- LLAMACTL_MLX_ARGS - Space-separated default arguments -- LLAMACTL_MLX_ENV - Environment variables in format "KEY1=value1,KEY2=value2" -- LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"

+

Environment Variables:

+

LlamaCpp Backend:
+- LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command
+- LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments
+- LLAMACTL_LLAMACPP_ENV - Environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false)
+- LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use
+- LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments
+- LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"

+

VLLM Backend:
+- LLAMACTL_VLLM_COMMAND - VLLM executable command
+- LLAMACTL_VLLM_ARGS - Space-separated default arguments
+- LLAMACTL_VLLM_ENV - Environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false)
+- LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use
+- LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments
+- LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"

+

MLX Backend:
+- LLAMACTL_MLX_COMMAND - MLX executable command
+- LLAMACTL_MLX_ARGS - Space-separated default arguments
+- LLAMACTL_MLX_ENV - Environment variables in format "KEY1=value1,KEY2=value2"
+- LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format "KEY1=value1;KEY2=value2"
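
As a sketch, the same backend settings can also be supplied via these environment variables instead of the YAML file (the values below are illustrative):

export LLAMACTL_LLAMACPP_ENV="CUDA_VISIBLE_DEVICES=0"
export LLAMACTL_VLLM_DOCKER_ENABLED=true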

Instance Configuration

instances:
   port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])
@@ -1029,8 +1029,8 @@
 - LLAMACTL_LOGS_DIR - Log directory path
- LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false)
- LLAMACTL_MAX_INSTANCES - Maximum number of instances
-- LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances
-- LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances
+- LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances
+- LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances
- LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false)
- LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts
- LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds
@@ -1044,13 +1044,13 @@
  require_management_auth: true          # Require API key for management endpoints (default: true)
  management_keys: []                    # List of valid management API keys
-

Environment Variables: -- LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) -- LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys -- LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) -- LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys

+

Environment Variables:
+- LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false)
+- LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys
+- LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false)
+- LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys
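
For example, API keys can be provided at startup instead of in the config file (the key values below are placeholders):

export LLAMACTL_MANAGEMENT_KEYS="my-management-key"
export LLAMACTL_INFERENCE_KEYS="my-inference-key-1,my-inference-key-2"
llamactl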

Remote Node Configuration

-

llamactl supports remote node deployments. Configure remote nodes to deploy instances on remote hosts and manage them centrally.

+

llamactl supports remote node deployments. Configure remote nodes to deploy instances on remote hosts and manage them centrally.

local_node: "main"               # Name of the local node (default: "main")
 nodes:                           # Node configuration map
   main:                          # Local node (empty address means local)
@@ -1060,13 +1060,13 @@
     address: "http://192.168.1.10:8080"
     api_key: "worker1-api-key"   # Management API key for authentication
 
-

Node Configuration Fields: -- local_node: Specifies which node in the nodes map represents the local node -- nodes: Map of node configurations - - address: HTTP/HTTPS URL of the remote node (empty for local node) - - api_key: Management API key for authenticating with the remote node

-

Environment Variables: -- LLAMACTL_LOCAL_NODE - Name of the local node

+

Node Configuration Fields:
+- local_node: Specifies which node in the nodes map represents the local node
+- nodes: Map of node configurations
+ - address: HTTP/HTTPS URL of the remote node (empty for local node)
+ - api_key: Management API key for authenticating with the remote node

+

Environment Variables:
+- LLAMACTL_LOCAL_NODE - Name of the local node
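
To confirm that a remote worker is reachable from the main node, one option is to query its management API directly, reusing the address and api_key from the example above (sketch only):

curl -H "Authorization: Bearer worker1-api-key" \
  http://192.168.1.10:8080/api/v1/instances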

diff --git a/dev/getting-started/installation/index.html b/dev/getting-started/installation/index.html
index 4c0c80e..4c03c7c 100644
--- a/dev/getting-started/installation/index.html
+++ b/dev/getting-started/installation/index.html
@@ -886,20 +886,20 @@

Installation

-

This guide will walk you through installing Llamactl on your system.

+

This guide will walk you through installing Llamactl on your system.

Prerequisites

Backend Dependencies

-

llamactl supports multiple backends. Install at least one:

-

For llama.cpp backend (all platforms):

-

You need llama-server from llama.cpp installed:

+

llamactl supports multiple backends. Install at least one:

+

For llama.cpp backend (all platforms):

+

You need llama-server from llama.cpp installed:

# Homebrew (macOS/Linux)
 brew install llama.cpp
 # Winget (Windows)
 winget install llama.cpp
 
-

Or build from source - see llama.cpp docs

-

For MLX backend (macOS only):

-

MLX provides optimized inference on Apple Silicon. Install MLX-LM:

+

Or build from source - see llama.cpp docs

+

For MLX backend (macOS only):

+

MLX provides optimized inference on Apple Silicon. Install MLX-LM:

# Install via pip (requires Python 3.8+)
 pip install mlx-lm
 
@@ -908,9 +908,9 @@
 source mlx-env/bin/activate
 pip install mlx-lm
 
-

Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

-

For vLLM backend:

-

vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

+

Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

+

For vLLM backend:

+

vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

# Install via pip (requires Python 3.8+, GPU required)
 pip install vllm
 
@@ -923,7 +923,7 @@
 

Installation Methods

-

Download the latest release from the GitHub releases page:

+

Download the latest release from the GitHub releases page:

# Linux/macOS - Get latest version and download
 LATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
 curl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz
@@ -935,12 +935,12 @@
 # Windows - Download from releases page
 

Option 2: Docker

-

llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.

-

Available Dockerfiles (CUDA): -- llamactl with llama.cpp CUDA: docker/Dockerfile.llamacpp (based on ghcr.io/ggml-org/llama.cpp:server-cuda) -- llamactl with vLLM CUDA: docker/Dockerfile.vllm (based on vllm/vllm-openai:latest) -- llamactl built from source: docker/Dockerfile.source (multi-stage build with webui)

-

Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.

+

llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.

+

Available Dockerfiles (CUDA):
+- llamactl with llama.cpp CUDA: docker/Dockerfile.llamacpp (based on ghcr.io/ggml-org/llama.cpp:server-cuda)
+- llamactl with vLLM CUDA: docker/Dockerfile.vllm (based on vllm/vllm-openai:latest)
+- llamactl built from source: docker/Dockerfile.source (multi-stage build with webui)

+

Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.

Using Docker Compose

# Clone the repository
 git clone https://github.com/lordmathis/llamactl.git
@@ -955,11 +955,11 @@
 # Or start llamactl with vLLM backend
 docker-compose -f docker/docker-compose.yml up llamactl-vllm -d
 
-

Access the dashboard at: -- llamactl with llama.cpp: http://localhost:8080 -- llamactl with vLLM: http://localhost:8081

+

Access the dashboard at:
+- llamactl with llama.cpp: http://localhost:8080
+- llamactl with vLLM: http://localhost:8081

Using Docker Build and Run

-

llamactl with llama.cpp CUDA:

+

llamactl with llama.cpp CUDA:

docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .
 docker run -d \
   --name llamactl-llamacpp \
@@ -968,7 +968,7 @@
   -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \
   llamactl:llamacpp-cuda
 

-

llamactl with vLLM CUDA:

+

llamactl with vLLM CUDA:

docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .
 docker run -d \
   --name llamactl-vllm \
@@ -977,7 +977,7 @@
   -v ~/.cache/huggingface:/root/.cache/huggingface \
   llamactl:vllm-cuda
 

-

llamactl built from source:

+

llamactl built from source:

docker build -f docker/Dockerfile.source -t llamactl:source .
 docker run -d \
   --name llamactl \
@@ -985,11 +985,11 @@
   llamactl:source
 

Option 3: Build from Source

-

Requirements: -- Go 1.24 or later -- Node.js 22 or later -- Git

-

If you prefer to build from source:

+

Requirements:
+- Go 1.24 or later
+- Node.js 22 or later
+- Git

+

If you prefer to build from source:

# Clone the repository
 git clone https://github.com/lordmathis/llamactl.git
 cd llamactl
@@ -1001,16 +1001,16 @@
 go build -o llamactl ./cmd/server
 

Remote Node Installation

-

For deployments with remote nodes: -- Install llamactl on each node using any of the methods above -- Configure API keys for authentication between nodes

+

For deployments with remote nodes:
+- Install llamactl on each node using any of the methods above
+- Configure API keys for authentication between nodes

Verification

-

Verify your installation by checking the version:

+

Verify your installation by checking the version:

llamactl --version
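# Optionally confirm the backend binary is available as well, e.g. for llama.cpp
# (the same llama-server --version output is referenced in the API reference):
llama-server --version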
 

Next Steps

-

Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

-

For remote node deployments, see the Configuration Guide for node setup instructions.

+

Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

+

For remote node deployments, see the Configuration Guide for node setup instructions.

diff --git a/dev/getting-started/quick-start/index.html b/dev/getting-started/quick-start/index.html
index 8c4f988..5b90891 100644
--- a/dev/getting-started/quick-start/index.html
+++ b/dev/getting-started/quick-start/index.html
@@ -880,43 +880,43 @@

Quick Start

-

This guide will help you get Llamactl up and running in just a few minutes.

+

This guide will help you get Llamactl up and running in just a few minutes.

Step 1: Start Llamactl

-

Start the Llamactl server:

+

Start the Llamactl server:

llamactl
 
-

By default, Llamactl will start on http://localhost:8080.

+

By default, Llamactl will start on http://localhost:8080.

Step 2: Access the Web UI

-

Open your web browser and navigate to:

+

Open your web browser and navigate to:

http://localhost:8080
 
-

Login with the management API key. By default it is generated during server startup. Copy it from the terminal output.

-

You should see the Llamactl web interface.

+

Login with the management API key. By default it is generated during server startup. Copy it from the terminal output.

+

You should see the Llamactl web interface.

Step 3: Create Your First Instance

    -
  1. Click the "Add Instance" button
  2. Fill in the instance configuration:
  3. Name: Give your instance a descriptive name
  4. Backend Type: Choose from llama.cpp, MLX, or vLLM
  5. Model: Model path or identifier for your chosen backend
  +
  1. Click the "Add Instance" button
  2. Fill in the instance configuration:
  3. Name: Give your instance a descriptive name
  4. Backend Type: Choose from llama.cpp, MLX, or vLLM
  5. Model: Model path or identifier for your chosen backend
  -
  6. Additional Options: Backend-specific parameters
  +
  6. Additional Options: Backend-specific parameters
  -
  7. Click "Create Instance"
  +
  7. Click "Create Instance"
    Click "Create Instance"

Step 4: Start Your Instance

-

Once created, you can:

+

Once created, you can:

    -
  • Start the instance by clicking the start button
  • Monitor its status in real-time
  • View logs by clicking the logs button
  • Stop the instance when needed
  +
  • Start the instance by clicking the start button
  • Monitor its status in real-time
  • View logs by clicking the logs button
  • Stop the instance when needed

Example Configurations

-

Here are basic example configurations for each backend:

-

llama.cpp backend:

+

Here are basic example configurations for each backend:

+

llama.cpp backend:

{
   "name": "llama2-7b",
   "backend_type": "llama_cpp",
@@ -928,7 +928,7 @@
   }
 }
 

-

MLX backend (macOS only):

+

MLX backend (macOS only):

{
   "name": "mistral-mlx",
   "backend_type": "mlx_lm",
@@ -939,7 +939,7 @@
   }
 }
 

-

vLLM backend:

+

vLLM backend:

{
   "name": "dialogpt-vllm",
   "backend_type": "vllm",
@@ -951,7 +951,7 @@
 }
 

Docker Support

-

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:

+

Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:

backends:
   vllm:
     command: "vllm"
@@ -962,7 +962,7 @@
       args: ["run", "--rm", "--network", "host", "--gpus", "all", "--shm-size", "1g"]
 

Using the API

-

You can also manage instances via the REST API:

+

You can also manage instances via the REST API:

# List all instances
 curl http://localhost:8080/api/instances
 
@@ -980,9 +980,9 @@
 curl -X POST http://localhost:8080/api/instances/my-model/start
 

OpenAI Compatible API

-

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

+

Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

Chat Completions

-

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

+

Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

curl -X POST http://localhost:8080/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
@@ -998,7 +998,7 @@
   }'
 

Using with Python OpenAI Client

-

You can also use the official OpenAI Python client:

+

You can also use the official OpenAI Python client:

from openai import OpenAI
 
 # Point the client to your Llamactl server
@@ -1020,14 +1020,14 @@
 print(response.choices[0].message.content)
 

List Available Models

-

Get a list of running instances (models) in OpenAI-compatible format:

+

Get a list of running instances (models) in OpenAI-compatible format:

curl http://localhost:8080/v1/models
 

Next Steps

diff --git a/dev/index.html b/dev/index.html
index 6c1c1a6..8a0380b 100644
--- a/dev/index.html
+++ b/dev/index.html
@@ -843,9 +843,9 @@

Llamactl Documentation

Welcome to the Llamactl documentation!

-

Dashboard Screenshot

+

Dashboard Screenshot

What is Llamactl?

-

Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.

+

Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.

Features

🚀 Easy Model Management

    @@ -879,21 +879,21 @@

    Dashboard Screenshot

    Getting Help

    -

    If you need help or have questions:

    +

    If you need help or have questions:

    License

    -

    MIT License - see the LICENSE file.

    +

    MIT License - see the LICENSE file.

diff --git a/dev/search/search_index.json b/dev/search/search_index.json
index a65774a..4c2d3ce 100644
--- a/dev/search/search_index.json
+++ b/dev/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"

    Welcome to the Llamactl documentation!

    "},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"

    Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard.

    "},{"location":"#features","title":"Features","text":""},{"location":"#easy-model-management","title":"\ud83d\ude80 Easy Model Management","text":"
    • Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality)
    • On-Demand Instance Start: Automatically launch instances upon receiving API requests
    • State Persistence: Ensure instances remain intact across server restarts
    "},{"location":"#universal-compatibility","title":"\ud83d\udd17 Universal Compatibility","text":"
    • OpenAI API Compatible: Drop-in replacement - route requests by instance name
    • Multi-Backend Support: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
    • Docker Support: Run backends in containers
    "},{"location":"#user-friendly-interface","title":"\ud83c\udf10 User-Friendly Interface","text":"
    • Web Dashboard: Modern React UI for visual management (unlike CLI-only tools)
    • API Key Authentication: Separate keys for management vs inference access
    "},{"location":"#smart-operations","title":"\u26a1 Smart Operations","text":"
    • Instance Monitoring: Health checks, auto-restart, log management
    • Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits
    • Environment Variables: Set custom environment variables per instance for advanced configuration
    "},{"location":"#remote-instance-deployment","title":"\ud83d\udd17 Remote Instance Deployment","text":"
    • Remote Node Support: Deploy instances on remote hosts
    • Central Management: Manage remote instances from a single dashboard
    • Seamless Routing: Automatic request routing to remote instances
    "},{"location":"#quick-links","title":"Quick Links","text":"
    • Installation Guide - Get Llamactl up and running
    • Configuration Guide - Detailed configuration options
    • Quick Start - Your first steps with Llamactl
    • Managing Instances - Instance lifecycle management
    • API Reference - Complete API documentation
    "},{"location":"#getting-help","title":"Getting Help","text":"

    If you need help or have questions:

    • Check the Troubleshooting guide
    • Visit the GitHub repository
    • Review the Configuration Guide for advanced settings
    "},{"location":"#license","title":"License","text":"

    MIT License - see the LICENSE file.

    "},{"location":"getting-started/configuration/","title":"Configuration","text":"

    llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

    Defaults < Configuration file < Environment variables\n

    llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

    "},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"

    Here's the default configuration with all available options:

    server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  allowed_headers: [\"*\"]         # Allowed CORS headers (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    response_headers: {}         # Additional response headers to send with responses\n\ninstances:\n  port_range: [8000, 9000]       # Port range for instances\n  data_dir: ~/.local/share/llamactl         # Data directory (platform-specific, see below)\n  configs_dir: ~/.local/share/llamactl/instances  # Instance configs directory\n  logs_dir: ~/.local/share/llamactl/logs    # Logs directory\n  auto_create_dirs: true         # Auto-create data/config/logs dirs if missing\n  max_instances: -1              # Max instances (-1 = unlimited)\n  max_running_instances: -1      # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true      # Enable LRU eviction for idle instances\n  default_auto_restart: true     # Auto-restart new instances by default\n  default_max_restarts: 3        # Max restarts for new instances\n  default_restart_delay: 5       # Restart delay (seconds) for new instances\n  default_on_demand_start: true  # Default on-demand start setting\n  on_demand_start_timeout: 120   # Default on-demand start timeout in seconds\n  timeout_check_interval: 5      # Idle instance timeout check in minutes\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  inference_keys: []             # Keys for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n\nlocal_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration for multi-node deployment\n  main:                          # Default local node (empty config)\n
    "},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"

    Configuration files are searched in the following locations (in order of precedence):

    Linux: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/.config/llamactl/config.yaml - /etc/llamactl/config.yaml

    macOS: - ./llamactl.yaml or ./config.yaml (current directory) - $HOME/Library/Application Support/llamactl/config.yaml - /Library/Application Support/llamactl/config.yaml

    Windows: - ./llamactl.yaml or ./config.yaml (current directory) - %APPDATA%\\llamactl\\config.yaml - %USERPROFILE%\\llamactl\\config.yaml - %PROGRAMDATA%\\llamactl\\config.yaml

    You can specify the path to config file with LLAMACTL_CONFIG_PATH environment variable.

    "},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"
    server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  allowed_headers: [\"*\"]  # CORS allowed headers (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n

    Environment Variables: - LLAMACTL_HOST - Server host - LLAMACTL_PORT - Server port - LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins - LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

    "},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"
    backends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    # MLX does not support Docker\n    response_headers: {}         # Additional response headers to send with responses\n

    Backend Configuration Fields: - command: Executable name/path for the backend - args: Default arguments prepended to all instances - environment: Environment variables for the backend process (optional) - response_headers: Additional response headers to send with responses (optional) - docker: Docker-specific configuration (optional) - enabled: Boolean flag to enable Docker runtime - image: Docker image to use - args: Additional arguments passed to docker run - environment: Environment variables for the container (optional)

    If llamactl is behind an NGINX proxy, X-Accel-Buffering: no response header may be required for NGINX to properly stream the responses without buffering.

    Environment Variables:

    LlamaCpp Backend: - LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command - LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments - LLAMACTL_LLAMACPP_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use - LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

    VLLM Backend: - LLAMACTL_VLLM_COMMAND - VLLM executable command - LLAMACTL_VLLM_ARGS - Space-separated default arguments - LLAMACTL_VLLM_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false) - LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use - LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments - LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

    MLX Backend: - LLAMACTL_MLX_COMMAND - MLX executable command - LLAMACTL_MLX_ARGS - Space-separated default arguments - LLAMACTL_MLX_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\" - LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

    "},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"
    instances:\n  port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])\n  data_dir: \"~/.local/share/llamactl\"               # Directory for all llamactl data (default varies by OS)\n  configs_dir: \"~/.local/share/llamactl/instances\"  # Directory for instance configs (default: data_dir/instances)\n  logs_dir: \"~/.local/share/llamactl/logs\"          # Directory for instance logs (default: data_dir/logs)\n  auto_create_dirs: true                            # Automatically create data/config/logs directories (default: true)\n  max_instances: -1                                 # Maximum instances (-1 = unlimited)\n  max_running_instances: -1                         # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true                         # Enable LRU eviction for idle instances\n  default_auto_restart: true                        # Default auto-restart setting\n  default_max_restarts: 3                           # Default maximum restart attempts\n  default_restart_delay: 5                          # Default restart delay in seconds\n  default_on_demand_start: true                     # Default on-demand start setting\n  on_demand_start_timeout: 120                      # Default on-demand start timeout in seconds\n  timeout_check_interval: 5                         # Default instance timeout check interval in minutes\n

    Environment Variables: - LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\") - LLAMACTL_DATA_DIRECTORY - Data directory path - LLAMACTL_INSTANCES_DIR - Instance configs directory path - LLAMACTL_LOGS_DIR - Log directory path - LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false) - LLAMACTL_MAX_INSTANCES - Maximum number of instances - LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances - LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances - LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false) - LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts - LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds - LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false) - LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds - LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes

    "},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"
    auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  inference_keys: []                     # List of valid inference API keys\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n

    Environment Variables: - LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false) - LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys - LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false) - LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys

    "},{"location":"getting-started/configuration/#remote-node-configuration","title":"Remote Node Configuration","text":"

    llamactl supports remote node deployments. Configure remote nodes to deploy instances on remote hosts and manage them centrally.

    local_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration map\n  main:                          # Local node (empty address means local)\n    address: \"\"                  # Not used for local node\n    api_key: \"\"                  # Not used for local node\n  worker1:                       # Remote worker node\n    address: \"http://192.168.1.10:8080\"\n    api_key: \"worker1-api-key\"   # Management API key for authentication\n

    Node Configuration Fields: - local_node: Specifies which node in the nodes map represents the local node - nodes: Map of node configurations - address: HTTP/HTTPS URL of the remote node (empty for local node) - api_key: Management API key for authenticating with the remote node

    Environment Variables: - LLAMACTL_LOCAL_NODE - Name of the local node

    "},{"location":"getting-started/installation/","title":"Installation","text":"

    This guide will walk you through installing Llamactl on your system.

    "},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"

    llamactl supports multiple backends. Install at least one:

    For llama.cpp backend (all platforms):

    You need llama-server from llama.cpp installed:

    # Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n

    Or build from source - see llama.cpp docs

    For MLX backend (macOS only):

    MLX provides optimized inference on Apple Silicon. Install MLX-LM:

    # Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n

    Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

    For vLLM backend:

    vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

    # Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
    "},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"

    Download the latest release from the GitHub releases page:

    # Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
    "},{"location":"getting-started/installation/#option-2-docker","title":"Option 2: Docker","text":"

    llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.

    Available Dockerfiles (CUDA): - llamactl with llama.cpp CUDA: docker/Dockerfile.llamacpp (based on ghcr.io/ggml-org/llama.cpp:server-cuda) - llamactl with vLLM CUDA: docker/Dockerfile.vllm (based on vllm/vllm-openai:latest) - llamactl built from source: docker/Dockerfile.source (multi-stage build with webui)

    Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.

    "},{"location":"getting-started/installation/#using-docker-compose","title":"Using Docker Compose","text":"
    # Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Create directories for data and models\nmkdir -p data/llamacpp data/vllm models\n\n# Start llamactl with llama.cpp backend\ndocker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d\n\n# Or start llamactl with vLLM backend\ndocker-compose -f docker/docker-compose.yml up llamactl-vllm -d\n

    Access the dashboard at: - llamactl with llama.cpp: http://localhost:8080 - llamactl with vLLM: http://localhost:8081

    "},{"location":"getting-started/installation/#using-docker-build-and-run","title":"Using Docker Build and Run","text":"

    llamactl with llama.cpp CUDA:

    docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .\ndocker run -d \\\n  --name llamactl-llamacpp \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \\\n  llamactl:llamacpp-cuda\n

    llamactl with vLLM CUDA:

    docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .\ndocker run -d \\\n  --name llamactl-vllm \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/huggingface:/root/.cache/huggingface \\\n  llamactl:vllm-cuda\n

    llamactl built from source:

    docker build -f docker/Dockerfile.source -t llamactl:source .\ndocker run -d \\\n  --name llamactl \\\n  -p 8080:8080 \\\n  llamactl:source\n

    "},{"location":"getting-started/installation/#option-3-build-from-source","title":"Option 3: Build from Source","text":"

    Requirements: - Go 1.24 or later - Node.js 22 or later - Git

    If you prefer to build from source:

    # Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
    "},{"location":"getting-started/installation/#remote-node-installation","title":"Remote Node Installation","text":"

    For deployments with remote nodes: - Install llamactl on each node using any of the methods above - Configure API keys for authentication between nodes

    "},{"location":"getting-started/installation/#verification","title":"Verification","text":"

    Verify your installation by checking the version:

    llamactl --version\n
    "},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"

    Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

    For remote node deployments, see the Configuration Guide for node setup instructions.

    "},{"location":"getting-started/quick-start/","title":"Quick Start","text":"

    This guide will help you get Llamactl up and running in just a few minutes.

    "},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"

    Start the Llamactl server:

    llamactl\n

    By default, Llamactl will start on http://localhost:8080.

    "},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"

    Open your web browser and navigate to:

    http://localhost:8080\n

    Login with the management API key. By default it is generated during server startup. Copy it from the terminal output.

    You should see the Llamactl web interface.

    "},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":"
    1. Click the \"Add Instance\" button
    2. Fill in the instance configuration:
    3. Name: Give your instance a descriptive name
    4. Backend Type: Choose from llama.cpp, MLX, or vLLM
    5. Model: Model path or identifier for your chosen backend
    6. Additional Options: Backend-specific parameters

    7. Click \"Create Instance\"

    "},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"

    Once created, you can:

    • Start the instance by clicking the start button
    • Monitor its status in real-time
    • View logs by clicking the logs button
    • Stop the instance when needed
    "},{"location":"getting-started/quick-start/#example-configurations","title":"Example Configurations","text":"

    Here are basic example configurations for each backend:

    llama.cpp backend:

    {\n  \"name\": \"llama2-7b\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n    \"threads\": 4,\n    \"ctx_size\": 2048,\n    \"gpu_layers\": 32\n  }\n}\n

    MLX backend (macOS only):

    {\n  \"name\": \"mistral-mlx\",\n  \"backend_type\": \"mlx_lm\",\n  \"backend_options\": {\n    \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n    \"temp\": 0.7,\n    \"max_tokens\": 2048\n  }\n}\n

    vLLM backend:

    {\n  \"name\": \"dialogpt-vllm\",\n  \"backend_type\": \"vllm\",\n  \"backend_options\": {\n    \"model\": \"microsoft/DialoGPT-medium\",\n    \"tensor_parallel_size\": 2,\n    \"gpu_memory_utilization\": 0.9\n  }\n}\n

    "},{"location":"getting-started/quick-start/#docker-support","title":"Docker Support","text":"

    Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:

    backends:\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    docker:\n      enabled: true\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n
    "},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"

    You can also manage instances via the REST API:

    # List all instances\ncurl http://localhost:8080/api/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\"\n    }\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/instances/my-model/start\n
    "},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"

    Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

    "},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"

    Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

    curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n
    "},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"

    You can also use the official OpenAI Python client:

    from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"not-needed\"  # Llamactl doesn't require API keys by default\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
    "},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"

    Get a list of running instances (models) in OpenAI-compatible format:

    curl http://localhost:8080/v1/models\n
    "},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":"
    • Manage instances Managing Instances
    • Explore the API Reference
    • Configure advanced settings in the Configuration guide
    "},{"location":"user-guide/api-reference/","title":"API Reference","text":"

    Complete reference for the Llamactl REST API.

    "},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"

    All API endpoints are relative to the base URL:

    http://localhost:8080/api/v1\n
    "},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"

    Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

    curl -H \"Authorization: Bearer <your-api-key>\" \\\n  http://localhost:8080/api/v1/instances\n

    The server supports two types of API keys: - Management API Keys: Required for instance management operations (CRUD operations on instances) - Inference API Keys: Required for OpenAI-compatible inference endpoints

    "},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"

    Get the version information of the llamactl server.

    GET /api/v1/version\n

    Response:

    Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n

    "},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"

    Get help text for the llama-server command.

    GET /api/v1/server/help\n

    Response: Plain text help output from llama-server --help

    "},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"

    Get version information of the llama-server binary.

    GET /api/v1/server/version\n

    Response: Plain text version output from llama-server --version

    "},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"

    List available devices for llama-server.

    GET /api/v1/server/devices\n

    Response: Plain text device list from llama-server --list-devices

    "},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"

    Get a list of all instances.

    GET /api/v1/instances\n

    Response:

    [\n  {\n    \"name\": \"llama2-7b\",\n    \"status\": \"running\",\n    \"created\": 1705312200\n  }\n]\n

    "},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"

    Get detailed information about a specific instance.

    GET /api/v1/instances/{name}\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"

    Create and start a new instance.

    POST /api/v1/instances/{name}\n

    Request Body: JSON object with instance configuration. Common fields include:

    • backend_type: Backend type (llama_cpp, mlx_lm, or vllm)
    • backend_options: Backend-specific configuration
    • auto_restart: Enable automatic restart on failure
    • max_restarts: Maximum restart attempts
    • restart_delay: Delay between restarts in seconds
    • on_demand_start: Start instance when receiving requests
    • idle_timeout: Idle timeout in minutes
    • environment: Environment variables as key-value pairs
    • nodes: Array with single node name to deploy the instance to (for remote deployments)

    See Managing Instances for complete configuration options.
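
A minimal creation request might look like this (the model path and key are placeholders; the Authorization header is only needed when management auth is enabled):

curl -X POST http://localhost:8080/api/v1/instances/llama2-7b \
  -H "Authorization: Bearer <management-api-key>" \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {"model": "/path/to/llama-2-7b-chat.gguf"},
    "auto_restart": true,
    "max_restarts": 3
  }'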

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"

    Update an existing instance configuration. See Managing Instances for available configuration options.

    PUT /api/v1/instances/{name}\n

    Request Body: JSON object with configuration fields to update.

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"

    Stop and remove an instance.

    DELETE /api/v1/instances/{name}\n

    Response: 204 No Content

    "},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"

    Start a stopped instance.

    POST /api/v1/instances/{name}/start\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

Error Responses:

• 409 Conflict: Maximum number of running instances reached
• 500 Internal Server Error: Failed to start instance

    "},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"

    Stop a running instance.

    POST /api/v1/instances/{name}/stop\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"stopped\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"

    Restart an instance (stop then start).

    POST /api/v1/instances/{name}/restart\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"

    Retrieve instance logs.

    GET /api/v1/instances/{name}/logs\n

Query Parameters:

• lines: Number of lines to return (defaults to all lines; use -1 to explicitly request all)

    Response: Plain text log output

    Example:

    curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n

    "},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"

    Proxy HTTP requests directly to the llama-server instance.

    GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n

    This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

    Example - Check Instance Health:

    curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/proxy/health\n

    This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

    Error Responses: - 503 Service Unavailable: Instance is not running

    "},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"

    Llamactl provides OpenAI-compatible endpoints for inference operations.

    "},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"

    List all instances in OpenAI-compatible format.

    GET /v1/models\n

    Response:

    {\n  \"object\": \"list\",\n  \"data\": [\n    {\n      \"id\": \"llama2-7b\",\n      \"object\": \"model\",\n      \"created\": 1705312200,\n      \"owned_by\": \"llamactl\"\n    }\n  ]\n}\n

    "},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"

    All OpenAI-compatible inference endpoints are available:

    POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n

    Request Body: Standard OpenAI format with model field specifying the instance name

    Example:

    {\n  \"model\": \"llama2-7b\",\n  \"messages\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Hello, how are you?\"\n    }\n  ]\n}\n

    The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

Error Responses:

• 400 Bad Request: Invalid request body or missing instance name
• 503 Service Unavailable: Instance is not running and on-demand start is disabled
• 409 Conflict: Cannot start instance due to maximum instances limit

    "},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"

Instances can have the following status values:

• stopped: Instance is not running
• running: Instance is running and ready to accept requests
• failed: Instance failed to start or crashed

    "},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"

    All endpoints may return error responses in the following format:

    {\n  \"error\": \"Error message description\"\n}\n
    "},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":"
    • 200: Success
    • 201: Created
    • 204: No Content (successful deletion)
    • 400: Bad Request (invalid parameters or request body)
    • 401: Unauthorized (missing or invalid API key)
    • 403: Forbidden (insufficient permissions)
    • 404: Not Found (instance not found)
    • 409: Conflict (instance already exists, max instances reached)
    • 500: Internal Server Error
    • 503: Service Unavailable (instance not running)
    "},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"
    # Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-2-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0\",\n      \"OMP_NUM_THREADS\": \"8\"\n    }\n  }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello!\"}\n    ],\n    \"max_tokens\": 100\n  }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n
    "},{"location":"user-guide/api-reference/#remote-node-instance-example","title":"Remote Node Instance Example","text":"
    # Create instance on specific remote node\ncurl -X POST http://localhost:8080/api/v1/instances/remote-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-2-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"nodes\": [\"worker1\"]\n  }'\n\n# Check status of remote instance\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/remote-model\n\n# Use remote instance with OpenAI-compatible API\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"remote-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello from remote node!\"}\n    ]\n  }'\n
    "},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"

    You can also directly proxy requests to the llama-server instance:

    # Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"prompt\": \"Hello, world!\",\n    \"n_predict\": 50\n  }'\n
    "},{"location":"user-guide/api-reference/#backend-specific-endpoints","title":"Backend-Specific Endpoints","text":""},{"location":"user-guide/api-reference/#parse-commands","title":"Parse Commands","text":"

    Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

    "},{"location":"user-guide/api-reference/#parse-llamacpp-command","title":"Parse Llama.cpp Command","text":"

    Parse a llama-server command string into instance options.

    POST /api/v1/backends/llama-cpp/parse-command\n

    Request Body:

    {\n  \"command\": \"llama-server -m /path/to/model.gguf -c 2048 --port 8080\"\n}\n

    Response:

    {\n  \"backend_type\": \"llama_cpp\",\n  \"llama_server_options\": {\n    \"model\": \"/path/to/model.gguf\",\n    \"ctx_size\": 2048,\n    \"port\": 8080\n  }\n}\n

    "},{"location":"user-guide/api-reference/#parse-mlx-lm-command","title":"Parse MLX-LM Command","text":"

    Parse an MLX-LM server command string into instance options.

    POST /api/v1/backends/mlx/parse-command\n

    Request Body:

    {\n  \"command\": \"mlx_lm.server --model /path/to/model --port 8080\"\n}\n

    Response:

    {\n  \"backend_type\": \"mlx_lm\",\n  \"mlx_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

    "},{"location":"user-guide/api-reference/#parse-vllm-command","title":"Parse vLLM Command","text":"

    Parse a vLLM serve command string into instance options.

    POST /api/v1/backends/vllm/parse-command\n

    Request Body:

    {\n  \"command\": \"vllm serve /path/to/model --port 8080\"\n}\n

    Response:

    {\n  \"backend_type\": \"vllm\",\n  \"vllm_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

Error Responses for Parse Commands:

• 400 Bad Request: Invalid request body, empty command, or parse error
• 500 Internal Server Error: Encoding error

    "},{"location":"user-guide/api-reference/#auto-generated-documentation","title":"Auto-Generated Documentation","text":"

    The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

    1. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
    2. Generate docs: swag init -g cmd/server/main.go -o apidocs
    "},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"

    If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

    http://localhost:8080/swagger/\n

    This provides a complete interactive interface for testing all API endpoints.
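
To try it out, Swagger can be enabled via the server configuration or its environment variable before starting llamactl (a minimal sketch):

export LLAMACTL_ENABLE_SWAGGER=true\nllamactl\n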

    "},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"

    Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

    "},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"

    Llamactl provides two ways to manage instances:

    • Web UI: Accessible at http://localhost:8080 with an intuitive dashboard
    • REST API: Programmatic access for automation and integration

    "},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"

If authentication is enabled:

1. Navigate to the web UI
2. Enter your credentials
3. Bearer token is stored for the session

    "},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":"
    • Switch between light and dark themes
    • Setting is remembered across sessions
    "},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"

    Each instance is displayed as a card showing:

    • Instance name
    • Health status badge (unknown, ready, error, failed)
    • Action buttons (start, stop, edit, logs, delete)
    "},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
    1. Click the \"Create Instance\" button on the dashboard
    2. Enter a unique Name for your instance (only required field)
    3. Select Target Node: Choose which node to deploy the instance to from the dropdown
    4. Choose Backend Type:
      • llama.cpp: For GGUF models using llama-server
      • MLX: For MLX-optimized models (macOS only)
      • vLLM: For distributed serving and high-throughput inference
    5. Configure model source:
      • For llama.cpp: GGUF model path or HuggingFace repo
      • For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
      • For vLLM: HuggingFace model identifier (e.g., microsoft/DialoGPT-medium)
    6. Configure optional instance management settings:
      • Auto Restart: Automatically restart instance on failure
      • Max Restarts: Maximum number of restart attempts
      • Restart Delay: Delay in seconds between restart attempts
      • On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
      • Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
      • Environment Variables: Set custom environment variables for the instance process
    7. Configure backend-specific options:
      • llama.cpp: Threads, context size, GPU layers, port, etc.
      • MLX: Temperature, top-p, adapter path, Python environment, etc.
      • vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
    8. Click \"Create\" to save the instance
    "},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"
    # Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"top_p\": 0.9,\n      \"max_tokens\": 2048\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3\n  }'\n\n# Create vLLM instance\ncurl -X POST http://localhost:8080/api/instances/my-vllm-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"vllm\",\n    \"backend_options\": {\n      \"model\": \"microsoft/DialoGPT-medium\",\n      \"tensor_parallel_size\": 2,\n      \"gpu_memory_utilization\": 0.9\n    },\n    \"auto_restart\": true,\n    \"on_demand_start\": true,\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0,1\",\n      \"NCCL_DEBUG\": \"INFO\",\n      \"PYTHONPATH\": \"/custom/path\"\n    }\n  }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/instances/gemma-3-27b \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n      \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create instance on specific remote node\ncurl -X POST http://localhost:8080/api/instances/remote-llama \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"nodes\": [\"worker1\"]\n  }'\n
    "},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
    1. Click the \"Start\" button on an instance card
    2. Watch the status change to \"Unknown\"
    3. Monitor progress in the logs
    4. Instance status changes to \"Ready\" when ready
    "},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"
    curl -X POST http://localhost:8080/api/instances/{name}/start\n
    "},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
    1. Click the \"Stop\" button on an instance card
    2. Instance gracefully shuts down
    "},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"
    curl -X POST http://localhost:8080/api/instances/{name}/stop\n
    "},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
    1. Click the \"Edit\" button on an instance card
    2. Modify settings in the configuration dialog
    3. Changes require instance restart to take effect
    4. Click \"Update & Restart\" to apply changes
    "},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"

    Modify instance settings:

    curl -X PUT http://localhost:8080/api/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"context_size\": 4096\n    }\n  }'\n

    Note

    Configuration changes require restarting the instance to take effect.
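
A minimal sketch of that workflow (instance name and option value are illustrative, and the restart path is assumed to follow the same prefix as the start/stop examples above):

# Apply the configuration change\ncurl -X PUT http://localhost:8080/api/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"backend_options\": {\"threads\": 8}}'\n\n# Restart the instance so the change takes effect\ncurl -X POST http://localhost:8080/api/instances/my-model/restart\n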

    "},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
    1. Click the \"Logs\" button on any instance card
    2. Real-time log viewer opens
    "},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"

Retrieve instance logs via the API:

# Get instance logs\ncurl http://localhost:8080/api/instances/{name}/logs\n
    "},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
    1. Click the \"Delete\" button on an instance card
    2. Only stopped instances can be deleted
    3. Confirm deletion in the dialog
    "},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"
    curl -X DELETE http://localhost:8080/api/instances/{name}\n
    "},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"

    Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

# Proxy a request to the instance\ncurl http://localhost:8080/api/instances/{name}/proxy/\n

All backends provide OpenAI-compatible endpoints. Check the respective documentation:

• llama-server docs
• MLX-LM docs
• vLLM docs

    "},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
    1. The health status badge is displayed on each instance card
    "},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"

    Check the health status of your instances:

    curl http://localhost:8080/api/instances/{name}/proxy/health\n
    "},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"

    Issues specific to Llamactl deployment and operation.

    "},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"

    Problem: Invalid configuration preventing startup

Solutions:

1. Use minimal configuration:

    server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n

2. Check data directory permissions:
      # Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
    "},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"

    Problem: Instance fails to start with model loading errors

Common Solutions:

• llama-server not found: Ensure llama-server binary is in PATH
• Wrong model format: Ensure model is in GGUF format
• Insufficient memory: Use smaller model or reduce context size
• Path issues: Use absolute paths to model files

    "},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"

    Problem: Out of memory errors or system becomes unresponsive

Solutions:

1. Reduce context size:

    {\n  \"n_ctx\": 1024\n}\n

2. Use quantized models:
  • Try Q4_K_M instead of higher precision models
  • Use smaller model variants (7B instead of 13B)
    "},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"

    Problem: GPU not being used effectively

Solutions:

1. Configure GPU layers:

    {\n  \"n_gpu_layers\": 35\n}\n

    "},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"

    Problem: Complex model loading, performance, or compatibility issues

    Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

Resources:

• llama.cpp Documentation: https://github.com/ggml-org/llama.cpp
• llama.cpp Issues: https://github.com/ggml-org/llama.cpp/issues
• llama.cpp Discussions: https://github.com/ggml-org/llama.cpp/discussions

    Testing directly with llama-server:

    # Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n

    This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.

    "},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"

    Problem: Web UI shows CORS errors in browser console

Solutions:

1. Configure allowed origins:

    server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n

    "},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"

    Problem: API requests failing with authentication errors

Solutions:

1. Disable authentication temporarily:

    auth:\n  require_management_auth: false\n  require_inference_auth: false\n

2. Configure API keys:

      auth:\n  management_keys:\n    - \"your-management-key\"\n  inference_keys:\n    - \"your-inference-key\"\n

3. Use correct Authorization header:

      curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n

    "},{"location":"user-guide/troubleshooting/#remote-node-issues","title":"Remote Node Issues","text":""},{"location":"user-guide/troubleshooting/#node-configuration","title":"Node Configuration","text":"

    Problem: Remote instances not appearing or cannot be managed

Solutions:

1. Verify node configuration:

    local_node: \"main\"  # Must match a key in nodes map\nnodes:\n  main:\n    address: \"\"     # Empty for local node\n  worker1:\n    address: \"http://worker1.internal:8080\"\n    api_key: \"secure-key\"  # Must match worker1's management key\n

2. Test remote node connectivity:
      curl -H \"Authorization: Bearer remote-node-key\" \\\n  http://remote-node:8080/api/v1/instances\n
    "},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
    # Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
    "},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"
    export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
    "},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"

    When reporting issues, include:

    1. System information:

      llamactl --version\n

    2. Configuration file (remove sensitive keys)

    3. Relevant log output

    4. Steps to reproduce the issue

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"

    Welcome to the Llamactl documentation!

    "},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"

Unified management and routing for llama.cpp, MLX, and vLLM models, with a web dashboard.

    "},{"location":"#features","title":"Features","text":""},{"location":"#easy-model-management","title":"\ud83d\ude80 Easy Model Management","text":"
    • Multiple Model Serving: Run different models simultaneously (7B for speed, 70B for quality)
    • On-Demand Instance Start: Automatically launch instances upon receiving API requests
    • State Persistence: Ensure instances remain intact across server restarts
    "},{"location":"#universal-compatibility","title":"\ud83d\udd17 Universal Compatibility","text":"
    • OpenAI API Compatible: Drop-in replacement - route requests by instance name
    • Multi-Backend Support: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM
    • Docker Support: Run backends in containers
    "},{"location":"#user-friendly-interface","title":"\ud83c\udf10 User-Friendly Interface","text":"
    • Web Dashboard: Modern React UI for visual management (unlike CLI-only tools)
    • API Key Authentication: Separate keys for management vs inference access
    "},{"location":"#smart-operations","title":"\u26a1 Smart Operations","text":"
    • Instance Monitoring: Health checks, auto-restart, log management
    • Smart Resource Management: Idle timeout, LRU eviction, and configurable instance limits
    • Environment Variables: Set custom environment variables per instance for advanced configuration
    "},{"location":"#remote-instance-deployment","title":"\ud83d\udd17 Remote Instance Deployment","text":"
    • Remote Node Support: Deploy instances on remote hosts
    • Central Management: Manage remote instances from a single dashboard
    • Seamless Routing: Automatic request routing to remote instances
    "},{"location":"#quick-links","title":"Quick Links","text":"
    • Installation Guide - Get Llamactl up and running
    • Configuration Guide - Detailed configuration options
    • Quick Start - Your first steps with Llamactl
    • Managing Instances - Instance lifecycle management
    • API Reference - Complete API documentation
    "},{"location":"#getting-help","title":"Getting Help","text":"

    If you need help or have questions:

    • Check the Troubleshooting guide
    • Visit the GitHub repository
    • Review the Configuration Guide for advanced settings
    "},{"location":"#license","title":"License","text":"

    MIT License - see the LICENSE file.

    "},{"location":"getting-started/configuration/","title":"Configuration","text":"

    llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:

    Defaults < Configuration file < Environment variables\n

    llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.

    "},{"location":"getting-started/configuration/#default-configuration","title":"Default Configuration","text":"

    Here's the default configuration with all available options:

    server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  allowed_headers: [\"*\"]         # Allowed CORS headers (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    response_headers: {}         # Additional response headers to send with responses\n\ninstances:\n  port_range: [8000, 9000]       # Port range for instances\n  data_dir: ~/.local/share/llamactl         # Data directory (platform-specific, see below)\n  configs_dir: ~/.local/share/llamactl/instances  # Instance configs directory\n  logs_dir: ~/.local/share/llamactl/logs    # Logs directory\n  auto_create_dirs: true         # Auto-create data/config/logs dirs if missing\n  max_instances: -1              # Max instances (-1 = unlimited)\n  max_running_instances: -1      # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true      # Enable LRU eviction for idle instances\n  default_auto_restart: true     # Auto-restart new instances by default\n  default_max_restarts: 3        # Max restarts for new instances\n  default_restart_delay: 5       # Restart delay (seconds) for new instances\n  default_on_demand_start: true  # Default on-demand start setting\n  on_demand_start_timeout: 120   # Default on-demand start timeout in seconds\n  timeout_check_interval: 5      # Idle instance timeout check in minutes\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  inference_keys: []             # Keys for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n\nlocal_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration for multi-node deployment\n  main:                          # Default local node (empty config)\n
    "},{"location":"getting-started/configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"getting-started/configuration/#configuration-file-locations","title":"Configuration File Locations","text":"

    Configuration files are searched in the following locations (in order of precedence):

Linux:

• ./llamactl.yaml or ./config.yaml (current directory)
• $HOME/.config/llamactl/config.yaml
• /etc/llamactl/config.yaml

macOS:

• ./llamactl.yaml or ./config.yaml (current directory)
• $HOME/Library/Application Support/llamactl/config.yaml
• /Library/Application Support/llamactl/config.yaml

Windows:

• ./llamactl.yaml or ./config.yaml (current directory)
• %APPDATA%\\llamactl\\config.yaml
• %USERPROFILE%\\llamactl\\config.yaml
• %PROGRAMDATA%\\llamactl\\config.yaml

You can specify the path to the config file with the LLAMACTL_CONFIG_PATH environment variable.
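
For example (the path shown is one of the default search locations):

export LLAMACTL_CONFIG_PATH=/etc/llamactl/config.yaml\nllamactl\n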

    "},{"location":"getting-started/configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"getting-started/configuration/#server-configuration","title":"Server Configuration","text":"
    server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  allowed_headers: [\"*\"]  # CORS allowed headers (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n

Environment Variables:

• LLAMACTL_HOST - Server host
• LLAMACTL_PORT - Server port
• LLAMACTL_ALLOWED_ORIGINS - Comma-separated CORS origins
• LLAMACTL_ENABLE_SWAGGER - Enable Swagger UI (true/false)

    "},{"location":"getting-started/configuration/#backend-configuration","title":"Backend Configuration","text":"
    backends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    # MLX does not support Docker\n    response_headers: {}         # Additional response headers to send with responses\n

Backend Configuration Fields:

• command: Executable name/path for the backend
• args: Default arguments prepended to all instances
• environment: Environment variables for the backend process (optional)
• response_headers: Additional response headers to send with responses (optional)
• docker: Docker-specific configuration (optional)
  • enabled: Boolean flag to enable Docker runtime
  • image: Docker image to use
  • args: Additional arguments passed to docker run
  • environment: Environment variables for the container (optional)

If llamactl is behind an NGINX proxy, the X-Accel-Buffering: no response header may be required so that NGINX streams responses without buffering.
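
As a sketch, that header could be added for the llama.cpp backend through the response headers environment variable documented below:

export LLAMACTL_LLAMACPP_RESPONSE_HEADERS=\"X-Accel-Buffering=no\"\n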

    Environment Variables:

LlamaCpp Backend:

• LLAMACTL_LLAMACPP_COMMAND - LlamaCpp executable command
• LLAMACTL_LLAMACPP_ARGS - Space-separated default arguments
• LLAMACTL_LLAMACPP_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\"
• LLAMACTL_LLAMACPP_DOCKER_ENABLED - Enable Docker runtime (true/false)
• LLAMACTL_LLAMACPP_DOCKER_IMAGE - Docker image to use
• LLAMACTL_LLAMACPP_DOCKER_ARGS - Space-separated Docker arguments
• LLAMACTL_LLAMACPP_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\"
• LLAMACTL_LLAMACPP_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

VLLM Backend:

• LLAMACTL_VLLM_COMMAND - VLLM executable command
• LLAMACTL_VLLM_ARGS - Space-separated default arguments
• LLAMACTL_VLLM_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\"
• LLAMACTL_VLLM_DOCKER_ENABLED - Enable Docker runtime (true/false)
• LLAMACTL_VLLM_DOCKER_IMAGE - Docker image to use
• LLAMACTL_VLLM_DOCKER_ARGS - Space-separated Docker arguments
• LLAMACTL_VLLM_DOCKER_ENV - Docker environment variables in format \"KEY1=value1,KEY2=value2\"
• LLAMACTL_VLLM_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"

MLX Backend:

• LLAMACTL_MLX_COMMAND - MLX executable command
• LLAMACTL_MLX_ARGS - Space-separated default arguments
• LLAMACTL_MLX_ENV - Environment variables in format \"KEY1=value1,KEY2=value2\"
• LLAMACTL_MLX_RESPONSE_HEADERS - Response headers in format \"KEY1=value1;KEY2=value2\"
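
For illustration, the llama.cpp backend could be switched to its Docker runtime purely via environment variables (the image and values are examples only):

export LLAMACTL_LLAMACPP_DOCKER_ENABLED=true\nexport LLAMACTL_LLAMACPP_DOCKER_IMAGE=ghcr.io/ggml-org/llama.cpp:server\nexport LLAMACTL_LLAMACPP_ENV=\"CUDA_VISIBLE_DEVICES=0\"\nllamactl\n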

    "},{"location":"getting-started/configuration/#instance-configuration","title":"Instance Configuration","text":"
    instances:\n  port_range: [8000, 9000]                          # Port range for instances (default: [8000, 9000])\n  data_dir: \"~/.local/share/llamactl\"               # Directory for all llamactl data (default varies by OS)\n  configs_dir: \"~/.local/share/llamactl/instances\"  # Directory for instance configs (default: data_dir/instances)\n  logs_dir: \"~/.local/share/llamactl/logs\"          # Directory for instance logs (default: data_dir/logs)\n  auto_create_dirs: true                            # Automatically create data/config/logs directories (default: true)\n  max_instances: -1                                 # Maximum instances (-1 = unlimited)\n  max_running_instances: -1                         # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true                         # Enable LRU eviction for idle instances\n  default_auto_restart: true                        # Default auto-restart setting\n  default_max_restarts: 3                           # Default maximum restart attempts\n  default_restart_delay: 5                          # Default restart delay in seconds\n  default_on_demand_start: true                     # Default on-demand start setting\n  on_demand_start_timeout: 120                      # Default on-demand start timeout in seconds\n  timeout_check_interval: 5                         # Default instance timeout check interval in minutes\n

Environment Variables:

• LLAMACTL_INSTANCE_PORT_RANGE - Port range (format: \"8000-9000\" or \"8000,9000\")
• LLAMACTL_DATA_DIRECTORY - Data directory path
• LLAMACTL_INSTANCES_DIR - Instance configs directory path
• LLAMACTL_LOGS_DIR - Log directory path
• LLAMACTL_AUTO_CREATE_DATA_DIR - Auto-create data/config/logs directories (true/false)
• LLAMACTL_MAX_INSTANCES - Maximum number of instances
• LLAMACTL_MAX_RUNNING_INSTANCES - Maximum number of running instances
• LLAMACTL_ENABLE_LRU_EVICTION - Enable LRU eviction for idle instances
• LLAMACTL_DEFAULT_AUTO_RESTART - Default auto-restart setting (true/false)
• LLAMACTL_DEFAULT_MAX_RESTARTS - Default maximum restarts
• LLAMACTL_DEFAULT_RESTART_DELAY - Default restart delay in seconds
• LLAMACTL_DEFAULT_ON_DEMAND_START - Default on-demand start setting (true/false)
• LLAMACTL_ON_DEMAND_START_TIMEOUT - Default on-demand start timeout in seconds
• LLAMACTL_TIMEOUT_CHECK_INTERVAL - Default instance timeout check interval in minutes

    "},{"location":"getting-started/configuration/#authentication-configuration","title":"Authentication Configuration","text":"
    auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  inference_keys: []                     # List of valid inference API keys\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n

Environment Variables:

• LLAMACTL_REQUIRE_INFERENCE_AUTH - Require auth for OpenAI endpoints (true/false)
• LLAMACTL_INFERENCE_KEYS - Comma-separated inference API keys
• LLAMACTL_REQUIRE_MANAGEMENT_AUTH - Require auth for management endpoints (true/false)
• LLAMACTL_MANAGEMENT_KEYS - Comma-separated management API keys
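
For example, enabling one key of each type (key values are placeholders):

export LLAMACTL_MANAGEMENT_KEYS=your-management-key\nexport LLAMACTL_INFERENCE_KEYS=your-inference-key\nllamactl\n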

    "},{"location":"getting-started/configuration/#remote-node-configuration","title":"Remote Node Configuration","text":"

    llamactl supports remote node deployments. Configure remote nodes to deploy instances on remote hosts and manage them centrally.

    local_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration map\n  main:                          # Local node (empty address means local)\n    address: \"\"                  # Not used for local node\n    api_key: \"\"                  # Not used for local node\n  worker1:                       # Remote worker node\n    address: \"http://192.168.1.10:8080\"\n    api_key: \"worker1-api-key\"   # Management API key for authentication\n

Node Configuration Fields:

• local_node: Specifies which node in the nodes map represents the local node
• nodes: Map of node configurations
  • address: HTTP/HTTPS URL of the remote node (empty for local node)
  • api_key: Management API key for authenticating with the remote node

    Environment Variables: - LLAMACTL_LOCAL_NODE - Name of the local node

    "},{"location":"getting-started/installation/","title":"Installation","text":"

    This guide will walk you through installing Llamactl on your system.

    "},{"location":"getting-started/installation/#prerequisites","title":"Prerequisites","text":""},{"location":"getting-started/installation/#backend-dependencies","title":"Backend Dependencies","text":"

    llamactl supports multiple backends. Install at least one:

    For llama.cpp backend (all platforms):

    You need llama-server from llama.cpp installed:

    # Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n

    Or build from source - see llama.cpp docs

    For MLX backend (macOS only):

    MLX provides optimized inference on Apple Silicon. Install MLX-LM:

    # Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n

    Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)

    For vLLM backend:

    vLLM provides high-throughput distributed serving for LLMs. Install vLLM:

    # Install via pip (requires Python 3.8+, GPU required)\npip install vllm\n\n# Or in a virtual environment (recommended)\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n\n# For production deployments, consider container-based installation\n
    "},{"location":"getting-started/installation/#installation-methods","title":"Installation Methods","text":""},{"location":"getting-started/installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"

    Download the latest release from the GitHub releases page:

    # Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n
    "},{"location":"getting-started/installation/#option-2-docker","title":"Option 2: Docker","text":"

    llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.

Available Dockerfiles (CUDA):

• llamactl with llama.cpp CUDA: docker/Dockerfile.llamacpp (based on ghcr.io/ggml-org/llama.cpp:server-cuda)
• llamactl with vLLM CUDA: docker/Dockerfile.vllm (based on vllm/vllm-openai:latest)
• llamactl built from source: docker/Dockerfile.source (multi-stage build with webui)

    Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.

    "},{"location":"getting-started/installation/#using-docker-compose","title":"Using Docker Compose","text":"
    # Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Create directories for data and models\nmkdir -p data/llamacpp data/vllm models\n\n# Start llamactl with llama.cpp backend\ndocker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d\n\n# Or start llamactl with vLLM backend\ndocker-compose -f docker/docker-compose.yml up llamactl-vllm -d\n

Access the dashboard at:

• llamactl with llama.cpp: http://localhost:8080
• llamactl with vLLM: http://localhost:8081

    "},{"location":"getting-started/installation/#using-docker-build-and-run","title":"Using Docker Build and Run","text":"

    llamactl with llama.cpp CUDA:

    docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .\ndocker run -d \\\n  --name llamactl-llamacpp \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \\\n  llamactl:llamacpp-cuda\n

    llamactl with vLLM CUDA:

    docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .\ndocker run -d \\\n  --name llamactl-vllm \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/huggingface:/root/.cache/huggingface \\\n  llamactl:vllm-cuda\n

    llamactl built from source:

    docker build -f docker/Dockerfile.source -t llamactl:source .\ndocker run -d \\\n  --name llamactl \\\n  -p 8080:8080 \\\n  llamactl:source\n

    "},{"location":"getting-started/installation/#option-3-build-from-source","title":"Option 3: Build from Source","text":"

Requirements:

• Go 1.24 or later
• Node.js 22 or later
• Git

    If you prefer to build from source:

    # Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui && npm ci && npm run build && cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n
    "},{"location":"getting-started/installation/#remote-node-installation","title":"Remote Node Installation","text":"

For deployments with remote nodes:

• Install llamactl on each node using any of the methods above
• Configure API keys for authentication between nodes

    "},{"location":"getting-started/installation/#verification","title":"Verification","text":"

    Verify your installation by checking the version:

    llamactl --version\n
    "},{"location":"getting-started/installation/#next-steps","title":"Next Steps","text":"

    Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!

    For remote node deployments, see the Configuration Guide for node setup instructions.

    "},{"location":"getting-started/quick-start/","title":"Quick Start","text":"

    This guide will help you get Llamactl up and running in just a few minutes.

    "},{"location":"getting-started/quick-start/#step-1-start-llamactl","title":"Step 1: Start Llamactl","text":"

    Start the Llamactl server:

    llamactl\n

    By default, Llamactl will start on http://localhost:8080.

    "},{"location":"getting-started/quick-start/#step-2-access-the-web-ui","title":"Step 2: Access the Web UI","text":"

    Open your web browser and navigate to:

    http://localhost:8080\n

Log in with the management API key. By default, it is generated during server startup; copy it from the terminal output.

    You should see the Llamactl web interface.
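
If you prefer a fixed key over the generated one, you can set it before starting the server (the key value is just an example):

LLAMACTL_MANAGEMENT_KEYS=your-management-key llamactl\n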

    "},{"location":"getting-started/quick-start/#step-3-create-your-first-instance","title":"Step 3: Create Your First Instance","text":"
    1. Click the \"Add Instance\" button
    2. Fill in the instance configuration:
    3. Name: Give your instance a descriptive name
    4. Backend Type: Choose from llama.cpp, MLX, or vLLM
    5. Model: Model path or identifier for your chosen backend
    6. Additional Options: Backend-specific parameters

    7. Click \"Create Instance\"

    "},{"location":"getting-started/quick-start/#step-4-start-your-instance","title":"Step 4: Start Your Instance","text":"

    Once created, you can:

    • Start the instance by clicking the start button
    • Monitor its status in real-time
    • View logs by clicking the logs button
    • Stop the instance when needed
    "},{"location":"getting-started/quick-start/#example-configurations","title":"Example Configurations","text":"

    Here are basic example configurations for each backend:

    llama.cpp backend:

    {\n  \"name\": \"llama2-7b\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n    \"threads\": 4,\n    \"ctx_size\": 2048,\n    \"gpu_layers\": 32\n  }\n}\n

    MLX backend (macOS only):

    {\n  \"name\": \"mistral-mlx\",\n  \"backend_type\": \"mlx_lm\",\n  \"backend_options\": {\n    \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n    \"temp\": 0.7,\n    \"max_tokens\": 2048\n  }\n}\n

    vLLM backend:

    {\n  \"name\": \"dialogpt-vllm\",\n  \"backend_type\": \"vllm\",\n  \"backend_options\": {\n    \"model\": \"microsoft/DialoGPT-medium\",\n    \"tensor_parallel_size\": 2,\n    \"gpu_memory_utilization\": 0.9\n  }\n}\n

    "},{"location":"getting-started/quick-start/#docker-support","title":"Docker Support","text":"

    Llamactl can run backends in Docker containers. To enable Docker for a backend, add a docker section to that backend in your YAML configuration file (e.g. config.yaml) as shown below:

    backends:\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    docker:\n      enabled: true\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n
    "},{"location":"getting-started/quick-start/#using-the-api","title":"Using the API","text":"

    You can also manage instances via the REST API:

    # List all instances\ncurl http://localhost:8080/api/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\"\n    }\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/instances/my-model/start\n
    "},{"location":"getting-started/quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"

    Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.

    "},{"location":"getting-started/quick-start/#chat-completions","title":"Chat Completions","text":"

    Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:

    curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n
    "},{"location":"getting-started/quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"

    You can also use the official OpenAI Python client:

from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"not-needed\"  # Replace with an inference API key if authentication is enabled\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n
    "},{"location":"getting-started/quick-start/#list-available-models","title":"List Available Models","text":"

    Get a list of running instances (models) in OpenAI-compatible format:

    curl http://localhost:8080/v1/models\n
    "},{"location":"getting-started/quick-start/#next-steps","title":"Next Steps","text":"
• Manage instances with the Managing Instances guide
    • Explore the API Reference
    • Configure advanced settings in the Configuration guide
    "},{"location":"user-guide/api-reference/","title":"API Reference","text":"

    Complete reference for the Llamactl REST API.

    "},{"location":"user-guide/api-reference/#base-url","title":"Base URL","text":"

    All API endpoints are relative to the base URL:

    http://localhost:8080/api/v1\n
    "},{"location":"user-guide/api-reference/#authentication","title":"Authentication","text":"

    Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

    curl -H \"Authorization: Bearer <your-api-key>\" \\\n  http://localhost:8080/api/v1/instances\n

The server supports two types of API keys:

• Management API Keys: Required for instance management operations (CRUD operations on instances)
• Inference API Keys: Required for OpenAI-compatible inference endpoints
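
A sketch of how the two key types are used (keys and instance name are placeholders):

# Management API call\ncurl -H \"Authorization: Bearer your-management-key\" \\\n  http://localhost:8080/api/v1/instances\n\n# OpenAI-compatible inference call\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\"model\": \"my-model\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'\n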

    "},{"location":"user-guide/api-reference/#system-endpoints","title":"System Endpoints","text":""},{"location":"user-guide/api-reference/#get-llamactl-version","title":"Get Llamactl Version","text":"

    Get the version information of the llamactl server.

    GET /api/v1/version\n

    Response:

    Version: 1.0.0\nCommit: abc123\nBuild Time: 2024-01-15T10:00:00Z\n

    "},{"location":"user-guide/api-reference/#get-llama-server-help","title":"Get Llama Server Help","text":"

    Get help text for the llama-server command.

    GET /api/v1/server/help\n

    Response: Plain text help output from llama-server --help

    "},{"location":"user-guide/api-reference/#get-llama-server-version","title":"Get Llama Server Version","text":"

    Get version information of the llama-server binary.

    GET /api/v1/server/version\n

    Response: Plain text version output from llama-server --version

    "},{"location":"user-guide/api-reference/#list-available-devices","title":"List Available Devices","text":"

    List available devices for llama-server.

    GET /api/v1/server/devices\n

    Response: Plain text device list from llama-server --list-devices

    "},{"location":"user-guide/api-reference/#instances","title":"Instances","text":""},{"location":"user-guide/api-reference/#list-all-instances","title":"List All Instances","text":"

    Get a list of all instances.

    GET /api/v1/instances\n

    Response:

    [\n  {\n    \"name\": \"llama2-7b\",\n    \"status\": \"running\",\n    \"created\": 1705312200\n  }\n]\n

    "},{"location":"user-guide/api-reference/#get-instance-details","title":"Get Instance Details","text":"

    Get detailed information about a specific instance.

    GET /api/v1/instances/{name}\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#create-instance","title":"Create Instance","text":"

    Create and start a new instance.

    POST /api/v1/instances/{name}\n

    Request Body: JSON object with instance configuration. Common fields include:

    • backend_type: Backend type (llama_cpp, mlx_lm, or vllm)
    • backend_options: Backend-specific configuration
    • auto_restart: Enable automatic restart on failure
    • max_restarts: Maximum restart attempts
    • restart_delay: Delay between restarts in seconds
    • on_demand_start: Start instance when receiving requests
    • idle_timeout: Idle timeout in minutes
    • environment: Environment variables as key-value pairs
    • nodes: Array with single node name to deploy the instance to (for remote deployments)

    See Managing Instances for complete configuration options.
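
Example Request (a minimal sketch; the model path and API key are placeholders):

curl -X POST http://localhost:8080/api/v1/instances/llama2-7b \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-2-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"auto_restart\": true\n  }'\n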

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#update-instance","title":"Update Instance","text":"

    Update an existing instance configuration. See Managing Instances for available configuration options.

    PUT /api/v1/instances/{name}\n

    Request Body: JSON object with configuration fields to update.
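
For example, an update that disables automatic restarts (a sketch; instance name and API key are placeholders):

curl -X PUT http://localhost:8080/api/v1/instances/llama2-7b \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\"auto_restart\": false}'\n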

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#delete-instance","title":"Delete Instance","text":"

    Stop and remove an instance.

    DELETE /api/v1/instances/{name}\n

    Response: 204 No Content

    "},{"location":"user-guide/api-reference/#instance-operations","title":"Instance Operations","text":""},{"location":"user-guide/api-reference/#start-instance","title":"Start Instance","text":"

    Start a stopped instance.

    POST /api/v1/instances/{name}/start\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

Error Responses:

• 409 Conflict: Maximum number of running instances reached
• 500 Internal Server Error: Failed to start instance

    "},{"location":"user-guide/api-reference/#stop-instance","title":"Stop Instance","text":"

    Stop a running instance.

    POST /api/v1/instances/{name}/stop\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"stopped\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#restart-instance","title":"Restart Instance","text":"

    Restart an instance (stop then start).

    POST /api/v1/instances/{name}/restart\n

    Response:

    {\n  \"name\": \"llama2-7b\",\n  \"status\": \"running\",\n  \"created\": 1705312200\n}\n

    "},{"location":"user-guide/api-reference/#get-instance-logs","title":"Get Instance Logs","text":"

    Retrieve instance logs.

    GET /api/v1/instances/{name}/logs\n

    Query Parameters: - lines: Number of lines to return (omit the parameter or pass -1 to return the full log)

    Response: Plain text log output

    Example:

    curl \"http://localhost:8080/api/v1/instances/my-instance/logs?lines=100\"\n

    "},{"location":"user-guide/api-reference/#proxy-to-instance","title":"Proxy to Instance","text":"

    Proxy HTTP requests directly to the llama-server instance.

    GET /api/v1/instances/{name}/proxy/*\nPOST /api/v1/instances/{name}/proxy/*\n

    This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

    Example - Check Instance Health:

    curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/proxy/health\n

    This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

    Error Responses: - 503 Service Unavailable: Instance is not running

    "},{"location":"user-guide/api-reference/#openai-compatible-api","title":"OpenAI-Compatible API","text":"

    Llamactl provides OpenAI-compatible endpoints for inference operations.

    "},{"location":"user-guide/api-reference/#list-models","title":"List Models","text":"

    List all instances in OpenAI-compatible format.

    GET /v1/models\n

    Response:

    {\n  \"object\": \"list\",\n  \"data\": [\n    {\n      \"id\": \"llama2-7b\",\n      \"object\": \"model\",\n      \"created\": 1705312200,\n      \"owned_by\": \"llamactl\"\n    }\n  ]\n}\n

    "},{"location":"user-guide/api-reference/#chat-completions-completions-embeddings","title":"Chat Completions, Completions, Embeddings","text":"

    All OpenAI-compatible inference endpoints are available:

    POST /v1/chat/completions\nPOST /v1/completions\nPOST /v1/embeddings\nPOST /v1/rerank\nPOST /v1/reranking\n

    Request Body: Standard OpenAI format with model field specifying the instance name

    Example:

    {\n  \"model\": \"llama2-7b\",\n  \"messages\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Hello, how are you?\"\n    }\n  ]\n}\n

    The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

    Error Responses: - 400 Bad Request: Invalid request body or missing instance name - 503 Service Unavailable: Instance is not running and on-demand start is disabled - 409 Conflict: Cannot start instance due to maximum instances limit

    "},{"location":"user-guide/api-reference/#instance-status-values","title":"Instance Status Values","text":"

    Instances can have the following status values: - stopped: Instance is not running - running: Instance is running and ready to accept requests - failed: Instance failed to start or crashed

    "},{"location":"user-guide/api-reference/#error-responses","title":"Error Responses","text":"

    All endpoints may return error responses in the following format:

    {\n  \"error\": \"Error message description\"\n}\n
    "},{"location":"user-guide/api-reference/#common-http-status-codes","title":"Common HTTP Status Codes","text":"
    • 200: Success
    • 201: Created
    • 204: No Content (successful deletion)
    • 400: Bad Request (invalid parameters or request body)
    • 401: Unauthorized (missing or invalid API key)
    • 403: Forbidden (insufficient permissions)
    • 404: Not Found (instance not found)
    • 409: Conflict (instance already exists, max instances reached)
    • 500: Internal Server Error
    • 503: Service Unavailable (instance not running)
    "},{"location":"user-guide/api-reference/#examples","title":"Examples","text":""},{"location":"user-guide/api-reference/#complete-instance-lifecycle","title":"Complete Instance Lifecycle","text":"
    # Create and start instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-2-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0\",\n      \"OMP_NUM_THREADS\": \"8\"\n    }\n  }'\n\n# Check instance status\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n\n# Get instance logs\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  \"http://localhost:8080/api/v1/instances/my-model/logs?lines=50\"\n\n# Use OpenAI-compatible chat completions\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello!\"}\n    ],\n    \"max_tokens\": 100\n  }'\n\n# Stop instance\ncurl -X POST -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model/stop\n\n# Delete instance\ncurl -X DELETE -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/my-model\n
    "},{"location":"user-guide/api-reference/#remote-node-instance-example","title":"Remote Node Instance Example","text":"
    # Create instance on specific remote node\ncurl -X POST http://localhost:8080/api/v1/instances/remote-model \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-2-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"nodes\": [\"worker1\"]\n  }'\n\n# Check status of remote instance\ncurl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances/remote-model\n\n# Use remote instance with OpenAI-compatible API\ncurl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-inference-api-key\" \\\n  -d '{\n    \"model\": \"remote-model\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"Hello from remote node!\"}\n    ]\n  }'\n
    "},{"location":"user-guide/api-reference/#using-the-proxy-endpoint","title":"Using the Proxy Endpoint","text":"

    You can also directly proxy requests to the llama-server instance:

    # Direct proxy to instance (bypasses OpenAI compatibility layer)\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer your-api-key\" \\\n  -d '{\n    \"prompt\": \"Hello, world!\",\n    \"n_predict\": 50\n  }'\n
    "},{"location":"user-guide/api-reference/#backend-specific-endpoints","title":"Backend-Specific Endpoints","text":""},{"location":"user-guide/api-reference/#parse-commands","title":"Parse Commands","text":"

    Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

    "},{"location":"user-guide/api-reference/#parse-llamacpp-command","title":"Parse Llama.cpp Command","text":"

    Parse a llama-server command string into instance options.

    POST /api/v1/backends/llama-cpp/parse-command\n

    Request Body:

    {\n  \"command\": \"llama-server -m /path/to/model.gguf -c 2048 --port 8080\"\n}\n

    Response:

    {\n  \"backend_type\": \"llama_cpp\",\n  \"llama_server_options\": {\n    \"model\": \"/path/to/model.gguf\",\n    \"ctx_size\": 2048,\n    \"port\": 8080\n  }\n}\n

    "},{"location":"user-guide/api-reference/#parse-mlx-lm-command","title":"Parse MLX-LM Command","text":"

    Parse an MLX-LM server command string into instance options.

    POST /api/v1/backends/mlx/parse-command\n

    Request Body:

    {\n  \"command\": \"mlx_lm.server --model /path/to/model --port 8080\"\n}\n

    Response:

    {\n  \"backend_type\": \"mlx_lm\",\n  \"mlx_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

    "},{"location":"user-guide/api-reference/#parse-vllm-command","title":"Parse vLLM Command","text":"

    Parse a vLLM serve command string into instance options.

    POST /api/v1/backends/vllm/parse-command\n

    Request Body:

    {\n  \"command\": \"vllm serve /path/to/model --port 8080\"\n}\n

    Response:

    {\n  \"backend_type\": \"vllm\",\n  \"vllm_server_options\": {\n    \"model\": \"/path/to/model\",\n    \"port\": 8080\n  }\n}\n

    Error Responses for Parse Commands: - 400 Bad Request: Invalid request body, empty command, or parse error - 500 Internal Server Error: Encoding error

    "},{"location":"user-guide/api-reference/#auto-generated-documentation","title":"Auto-Generated Documentation","text":"

    The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

    1. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
    2. Generate docs: swag init -g cmd/server/main.go -o apidocs
    "},{"location":"user-guide/api-reference/#swagger-documentation","title":"Swagger Documentation","text":"

    If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

    http://localhost:8080/swagger/\n

    This provides a complete interactive interface for testing all API endpoints.

    "},{"location":"user-guide/managing-instances/","title":"Managing Instances","text":"

    Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

    "},{"location":"user-guide/managing-instances/#overview","title":"Overview","text":"

    Llamactl provides two ways to manage instances:

    • Web UI: Accessible at http://localhost:8080 with an intuitive dashboard
    • REST API: Programmatic access for automation and integration

    "},{"location":"user-guide/managing-instances/#authentication","title":"Authentication","text":"

    If authentication is enabled: 1. Navigate to the web UI 2. Enter your credentials 3. Bearer token is stored for the session

    "},{"location":"user-guide/managing-instances/#theme-support","title":"Theme Support","text":"
    • Switch between light and dark themes
    • Setting is remembered across sessions
    "},{"location":"user-guide/managing-instances/#instance-cards","title":"Instance Cards","text":"

    Each instance is displayed as a card showing:

    • Instance name
    • Health status badge (unknown, ready, error, failed)
    • Action buttons (start, stop, edit, logs, delete)
    "},{"location":"user-guide/managing-instances/#create-instance","title":"Create Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui","title":"Via Web UI","text":"
    1. Click the \"Create Instance\" button on the dashboard
    2. Enter a unique Name for your instance (only required field)
    3. Select Target Node: Choose which node to deploy the instance to from the dropdown
    4. Choose Backend Type:
      • llama.cpp: For GGUF models using llama-server
      • MLX: For MLX-optimized models (macOS only)
      • vLLM: For distributed serving and high-throughput inference
    5. Configure model source:
      • For llama.cpp: GGUF model path or HuggingFace repo
      • For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
      • For vLLM: HuggingFace model identifier (e.g., microsoft/DialoGPT-medium)
    6. Configure optional instance management settings:
      • Auto Restart: Automatically restart instance on failure
      • Max Restarts: Maximum number of restart attempts
      • Restart Delay: Delay in seconds between restart attempts
      • On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
      • Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
      • Environment Variables: Set custom environment variables for the instance process
    7. Configure backend-specific options:
      • llama.cpp: Threads, context size, GPU layers, port, etc.
      • MLX: Temperature, top-p, adapter path, Python environment, etc.
      • vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
    8. Click \"Create\" to save the instance
    "},{"location":"user-guide/managing-instances/#via-api","title":"Via API","text":"
    # Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"top_p\": 0.9,\n      \"max_tokens\": 2048\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3\n  }'\n\n# Create vLLM instance\ncurl -X POST http://localhost:8080/api/instances/my-vllm-instance \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"vllm\",\n    \"backend_options\": {\n      \"model\": \"microsoft/DialoGPT-medium\",\n      \"tensor_parallel_size\": 2,\n      \"gpu_memory_utilization\": 0.9\n    },\n    \"auto_restart\": true,\n    \"on_demand_start\": true,\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0,1\",\n      \"NCCL_DEBUG\": \"INFO\",\n      \"PYTHONPATH\": \"/custom/path\"\n    }\n  }'\n\n# Create llama.cpp instance with HuggingFace model\ncurl -X POST http://localhost:8080/api/instances/gemma-3-27b \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"hf_repo\": \"unsloth/gemma-3-27b-it-GGUF\",\n      \"hf_file\": \"gemma-3-27b-it-GGUF.gguf\",\n      \"gpu_layers\": 32\n    }\n  }'\n\n# Create instance on specific remote node\ncurl -X POST http://localhost:8080/api/instances/remote-llama \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/models/llama-7b.gguf\",\n      \"gpu_layers\": 32\n    },\n    \"nodes\": [\"worker1\"]\n  }'\n
    "},{"location":"user-guide/managing-instances/#start-instance","title":"Start Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_1","title":"Via Web UI","text":"
    1. Click the \"Start\" button on an instance card
    2. Watch the status change to \"Unknown\"
    3. Monitor progress in the logs
    4. Instance status changes to \"Ready\" once it is able to serve requests
    "},{"location":"user-guide/managing-instances/#via-api_1","title":"Via API","text":"
    curl -X POST http://localhost:8080/api/instances/{name}/start\n
    "},{"location":"user-guide/managing-instances/#stop-instance","title":"Stop Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_2","title":"Via Web UI","text":"
    1. Click the \"Stop\" button on an instance card
    2. Instance gracefully shuts down
    "},{"location":"user-guide/managing-instances/#via-api_2","title":"Via API","text":"
    curl -X POST http://localhost:8080/api/instances/{name}/stop\n
    "},{"location":"user-guide/managing-instances/#edit-instance","title":"Edit Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_3","title":"Via Web UI","text":"
    1. Click the \"Edit\" button on an instance card
    2. Modify settings in the configuration dialog
    3. Changes require instance restart to take effect
    4. Click \"Update & Restart\" to apply changes
    "},{"location":"user-guide/managing-instances/#via-api_3","title":"Via API","text":"

    Modify instance settings:

    curl -X PUT http://localhost:8080/api/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"context_size\": 4096\n    }\n  }'\n

    Note

    Configuration changes require restarting the instance to take effect.
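A common follow-up, sketched below, is to issue the restart call right after the update (this page uses /api/instances/... paths; the API reference documents the same operation as POST /api/v1/instances/{name}/restart):

curl -X POST http://localhost:8080/api/instances/{name}/restart\n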

    "},{"location":"user-guide/managing-instances/#view-logs","title":"View Logs","text":""},{"location":"user-guide/managing-instances/#via-web-ui_4","title":"Via Web UI","text":"
    1. Click the \"Logs\" button on any instance card
    2. Real-time log viewer opens
    "},{"location":"user-guide/managing-instances/#via-api_4","title":"Via API","text":"

    Retrieve instance logs via the API:

    # Get instance logs\ncurl http://localhost:8080/api/instances/{name}/logs\n
    "},{"location":"user-guide/managing-instances/#delete-instance","title":"Delete Instance","text":""},{"location":"user-guide/managing-instances/#via-web-ui_5","title":"Via Web UI","text":"
    1. Click the \"Delete\" button on an instance card
    2. Only stopped instances can be deleted
    3. Confirm deletion in the dialog
    "},{"location":"user-guide/managing-instances/#via-api_5","title":"Via API","text":"
    curl -X DELETE http://localhost:8080/api/instances/{name}\n
    "},{"location":"user-guide/managing-instances/#instance-proxy","title":"Instance Proxy","text":"

    Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

    # Proxy a request to the instance's backend\ncurl http://localhost:8080/api/instances/{name}/proxy/\n

    All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs
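As a sketch, assuming the backend exposes the standard /v1/chat/completions route (llama-server and vLLM do, and mlx_lm.server offers an equivalent), a chat request through the proxy could look like:

curl -X POST http://localhost:8080/api/instances/{name}/proxy/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"{name}\",\n    \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]\n  }'\n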

    "},{"location":"user-guide/managing-instances/#instance-health","title":"Instance Health","text":""},{"location":"user-guide/managing-instances/#via-web-ui_6","title":"Via Web UI","text":"
    1. The health status badge is displayed on each instance card
    "},{"location":"user-guide/managing-instances/#via-api_6","title":"Via API","text":"

    Check the health status of your instances:

    curl http://localhost:8080/api/instances/{name}/proxy/health\n
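For a llama.cpp instance, a healthy llama-server typically answers with a small JSON status body, for example (the exact payload depends on the backend and version):

{\"status\": \"ok\"}\n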
    "},{"location":"user-guide/troubleshooting/","title":"Troubleshooting","text":"

    Issues specific to Llamactl deployment and operation.

    "},{"location":"user-guide/troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"user-guide/troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"

    Problem: Invalid configuration preventing startup

    Solutions: 1. Use minimal configuration:

    server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n

    2. Check data directory permissions:
      # Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n
    "},{"location":"user-guide/troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"user-guide/troubleshooting/#model-loading-failures","title":"Model Loading Failures","text":"

    Problem: Instance fails to start with model loading errors

    Common Solutions: - llama-server not found: Ensure llama-server binary is in PATH - Wrong model format: Ensure model is in GGUF format - Insufficient memory: Use smaller model or reduce context size - Path issues: Use absolute paths to model files
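A couple of quick shell checks cover the most common causes (paths are illustrative):

# Is the llama-server binary on PATH?\nwhich llama-server\n\n# Does the model exist, and does it start with the GGUF magic bytes?\nls -lh /models/llama-2-7b.gguf\nhead -c 4 /models/llama-2-7b.gguf; echo   # prints GGUF for a valid GGUF file\n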

    "},{"location":"user-guide/troubleshooting/#memory-issues","title":"Memory Issues","text":"

    Problem: Out of memory errors or system becomes unresponsive

    Solutions: 1. Reduce context size:

    {\n  \"n_ctx\": 1024\n}\n

    2. Use quantized models:
      • Try Q4_K_M instead of higher precision models
      • Use smaller model variants (7B instead of 13B)
    "},{"location":"user-guide/troubleshooting/#gpu-configuration","title":"GPU Configuration","text":"

    Problem: GPU not being used effectively

    Solutions: 1. Configure GPU layers:

    {\n  \"n_gpu_layers\": 35\n}\n
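To confirm the layers are actually offloaded, watch GPU memory while the instance starts (NVIDIA tooling shown as one example) and look for a line in the instance logs reporting how many layers were offloaded:

# GPU memory should climb as the model loads\nwatch -n 1 nvidia-smi\n\n# Search the instance logs for the offload report\ncurl \"http://localhost:8080/api/v1/instances/{name}/logs?lines=200\" | grep -i offloaded\n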

    "},{"location":"user-guide/troubleshooting/#advanced-instance-issues","title":"Advanced Instance Issues","text":"

    Problem: Complex model loading, performance, or compatibility issues

    Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

    Resources: - llama.cpp Documentation: https://github.com/ggml/llama.cpp - llama.cpp Issues: https://github.com/ggml/llama.cpp/issues - llama.cpp Discussions: https://github.com/ggml/llama.cpp/discussions

    Testing directly with llama-server:

    # Test your model and parameters directly with llama-server\nllama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35\n

    This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.
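Once the standalone server is up, querying it directly takes llamactl out of the picture entirely (port 8081 matches the command above):

curl http://localhost:8081/health\ncurl http://localhost:8081/v1/models\n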

    "},{"location":"user-guide/troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"user-guide/troubleshooting/#cors-errors","title":"CORS Errors","text":"

    Problem: Web UI shows CORS errors in browser console

    Solutions: 1. Configure allowed origins:

    server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n

    "},{"location":"user-guide/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"

    Problem: API requests failing with authentication errors

    Solutions: 1. Disable authentication temporarily:

    auth:\n  require_management_auth: false\n  require_inference_auth: false\n

    2. Configure API keys:

      auth:\n  management_keys:\n    - \"your-management-key\"\n  inference_keys:\n    - \"your-inference-key\"\n

    3. Use correct Authorization header:

      curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n

    "},{"location":"user-guide/troubleshooting/#remote-node-issues","title":"Remote Node Issues","text":""},{"location":"user-guide/troubleshooting/#node-configuration","title":"Node Configuration","text":"

    Problem: Remote instances not appearing or cannot be managed

    Solutions: 1. Verify node configuration:

    local_node: \"main\"  # Must match a key in nodes map\nnodes:\n  main:\n    address: \"\"     # Empty for local node\n  worker1:\n    address: \"http://worker1.internal:8080\"\n    api_key: \"secure-key\"  # Must match worker1's management key\n

    2. Test remote node connectivity:
      curl -H \"Authorization: Bearer remote-node-key\" \\\n  http://remote-node:8080/api/v1/instances\n
    "},{"location":"user-guide/troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"user-guide/troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"
    # Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n
    "},{"location":"user-guide/troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"
    export LLAMACTL_LOG_LEVEL=debug\nllamactl\n
    "},{"location":"user-guide/troubleshooting/#getting-help","title":"Getting Help","text":"

    When reporting issues, include:

    1. System information:

      llamactl --version\n

    2. Configuration file (remove sensitive keys)

    3. Relevant log output

    4. Steps to reproduce the issue

    "}]} \ No newline at end of file diff --git a/dev/sitemap.xml.gz b/dev/sitemap.xml.gz index 8cc4884dbac4363ae93dfd40d020e2b3ecb255fa..6a62a35740646b3a1f7c934f8f97a76e5e00c554 100644 GIT binary patch delta 15 WcmZ3?w3vxazMF&Nx5h>`Sw;XNr33>2 delta 15 WcmZ3?w3vxazMF&Np!7yISw;XM2m`zT diff --git a/dev/user-guide/api-reference/index.html b/dev/user-guide/api-reference/index.html index a812cf1..1e11a5d 100644 --- a/dev/user-guide/api-reference/index.html +++ b/dev/user-guide/api-reference/index.html @@ -1396,50 +1396,50 @@

    API Reference

    -

    Complete reference for the Llamactl REST API.

    +

    Complete reference for the Llamactl REST API.

    Base URL

    -

    All API endpoints are relative to the base URL:

    +

    All API endpoints are relative to the base URL:

    http://localhost:8080/api/v1
     

    Authentication

    -

    Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

    +

    Llamactl supports API key authentication. If authentication is enabled, include the API key in the Authorization header:

    curl -H "Authorization: Bearer <your-api-key>" \
       http://localhost:8080/api/v1/instances
     
    -

    The server supports two types of API keys: -- Management API Keys: Required for instance management operations (CRUD operations on instances) -- Inference API Keys: Required for OpenAI-compatible inference endpoints

    +

    The server supports two types of API keys:
    +- Management API Keys: Required for instance management operations (CRUD operations on instances)
    +- Inference API Keys: Required for OpenAI-compatible inference endpoints

    System Endpoints

    Get Llamactl Version

    -

    Get the version information of the llamactl server.

    +

    Get the version information of the llamactl server.

    GET /api/v1/version
     
    -

    Response: +

    Response:

    Version: 1.0.0
     Commit: abc123
     Build Time: 2024-01-15T10:00:00Z
     

    Get Llama Server Help

    -

    Get help text for the llama-server command.

    +

    Get help text for the llama-server command.

    GET /api/v1/server/help
     
    -

    Response: Plain text help output from llama-server --help

    +

    Response: Plain text help output from llama-server --help

    Get Llama Server Version

    -

    Get version information of the llama-server binary.

    +

    Get version information of the llama-server binary.

    GET /api/v1/server/version
     
    -

    Response: Plain text version output from llama-server --version

    +

    Response: Plain text version output from llama-server --version

    List Available Devices

    -

    List available devices for llama-server.

    +

    List available devices for llama-server.

    GET /api/v1/server/devices
     
    -

    Response: Plain text device list from llama-server --list-devices

    +

    Response: Plain text device list from llama-server --list-devices

    Instances

    List All Instances

    -

    Get a list of all instances.

    +

    Get a list of all instances.

    GET /api/v1/instances
     
    -

    Response: +

    Response:

    [
       {
         "name": "llama2-7b",
    @@ -1449,10 +1449,10 @@
     ]
     

    Get Instance Details

    -

    Get detailed information about a specific instance.

    +

    Get detailed information about a specific instance.

    GET /api/v1/instances/{name}
     
    -

    Response: +

    Response:

    {
       "name": "llama2-7b",
       "status": "running",
    @@ -1460,23 +1460,23 @@
     }
     

    Create Instance

    -

    Create and start a new instance.

    +

    Create and start a new instance.

    POST /api/v1/instances/{name}
     
    -

    Request Body: JSON object with instance configuration. Common fields include:

    +

    Request Body: JSON object with instance configuration. Common fields include:

      -
    • backend_type: Backend type (llama_cpp, mlx_lm, or vllm)
    • -
    • backend_options: Backend-specific configuration
    • -
    • auto_restart: Enable automatic restart on failure
    • -
    • max_restarts: Maximum restart attempts
    • -
    • restart_delay: Delay between restarts in seconds
    • -
    • on_demand_start: Start instance when receiving requests
    • -
    • idle_timeout: Idle timeout in minutes
    • -
    • environment: Environment variables as key-value pairs
    • -
    • nodes: Array with single node name to deploy the instance to (for remote deployments)
    • +
    • backend_type: Backend type (llama_cpp, mlx_lm, or vllm)
    • +
    • backend_options: Backend-specific configuration
    • +
    • auto_restart: Enable automatic restart on failure
    • +
    • max_restarts: Maximum restart attempts
    • +
    • restart_delay: Delay between restarts in seconds
    • +
    • on_demand_start: Start instance when receiving requests
    • +
    • idle_timeout: Idle timeout in minutes
    • +
    • environment: Environment variables as key-value pairs
    • +
    • nodes: Array with single node name to deploy the instance to (for remote deployments)
    -

    See Managing Instances for complete configuration options.

    -

    Response: +

    See Managing Instances for complete configuration options.

    +

    Response:

    {
       "name": "llama2-7b",
       "status": "running",
    @@ -1484,11 +1484,11 @@
     }
     

    Update Instance

    -

    Update an existing instance configuration. See Managing Instances for available configuration options.

    +

    Update an existing instance configuration. See Managing Instances for available configuration options.

    PUT /api/v1/instances/{name}
     
    -

    Request Body: JSON object with configuration fields to update.

    -

    Response: +

    Request Body: JSON object with configuration fields to update.

    +

    Response:

    {
       "name": "llama2-7b",
       "status": "running",
    @@ -1496,30 +1496,30 @@
     }
     

    Delete Instance

    -

    Stop and remove an instance.

    +

    Stop and remove an instance.

    DELETE /api/v1/instances/{name}
     
    -

    Response: 204 No Content

    +

    Response: 204 No Content

    Instance Operations

    Start Instance

    -

    Start a stopped instance.

    +

    Start a stopped instance.

    POST /api/v1/instances/{name}/start
     
    -

    Response: +

    Response:

    {
       "name": "llama2-7b",
       "status": "running",
       "created": 1705312200
     }
     

    -

    Error Responses: -- 409 Conflict: Maximum number of running instances reached -- 500 Internal Server Error: Failed to start instance

    +

    Error Responses:
    +- 409 Conflict: Maximum number of running instances reached
    +- 500 Internal Server Error: Failed to start instance

    Stop Instance

    -

    Stop a running instance.

    +

    Stop a running instance.

    POST /api/v1/instances/{name}/stop
     
    -

    Response: +

    Response:

    {
       "name": "llama2-7b",
       "status": "stopped",
    @@ -1527,10 +1527,10 @@
     }
     

    Restart Instance

    -

    Restart an instance (stop then start).

    +

    Restart an instance (stop then start).

    POST /api/v1/instances/{name}/restart
     
    -

    Response: +

    Response:

    {
       "name": "llama2-7b",
       "status": "running",
    @@ -1538,35 +1538,35 @@
     }
     

    Get Instance Logs

    -

    Retrieve instance logs.

    +

    Retrieve instance logs.

    GET /api/v1/instances/{name}/logs
     
    -

    Query Parameters: -- lines: Number of lines to return (default: all lines, use -1 for all)

    -

    Response: Plain text log output

    -

    Example: +

    Query Parameters:
    +- lines: Number of lines to return (default: all lines, use -1 for all)

    +

    Response: Plain text log output

    +

    Example:

    curl "http://localhost:8080/api/v1/instances/my-instance/logs?lines=100"
     

    Proxy to Instance

    -

    Proxy HTTP requests directly to the llama-server instance.

    +

    Proxy HTTP requests directly to the llama-server instance.

    GET /api/v1/instances/{name}/proxy/*
     POST /api/v1/instances/{name}/proxy/*
     
    -

    This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

    -

    Example - Check Instance Health: +

    This endpoint forwards all requests to the underlying llama-server instance running on its configured port. The proxy strips the /api/v1/instances/{name}/proxy prefix and forwards the remaining path to the instance.

    +

    Example - Check Instance Health:

    curl -H "Authorization: Bearer your-api-key" \
       http://localhost:8080/api/v1/instances/my-model/proxy/health
     

    -

    This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

    -

    Error Responses: -- 503 Service Unavailable: Instance is not running

    +

    This forwards the request to http://instance-host:instance-port/health on the actual llama-server instance.

    +

    Error Responses:
    +- 503 Service Unavailable: Instance is not running

    OpenAI-Compatible API

    -

    Llamactl provides OpenAI-compatible endpoints for inference operations.

    +

    Llamactl provides OpenAI-compatible endpoints for inference operations.

    List Models

    -

    List all instances in OpenAI-compatible format.

    +

    List all instances in OpenAI-compatible format.

    GET /v1/models
     
    -

    Response: +

    Response:

    {
       "object": "list",
       "data": [
    @@ -1580,15 +1580,15 @@
     }
     

    Chat Completions, Completions, Embeddings

    -

    All OpenAI-compatible inference endpoints are available:

    +

    All OpenAI-compatible inference endpoints are available:

    POST /v1/chat/completions
     POST /v1/completions
     POST /v1/embeddings
     POST /v1/rerank
     POST /v1/reranking
     
    -

    Request Body: Standard OpenAI format with model field specifying the instance name

    -

    Example: +

    Request Body: Standard OpenAI format with model field specifying the instance name

    +

    Example:

    {
       "model": "llama2-7b",
       "messages": [
    @@ -1599,34 +1599,34 @@
       ]
     }
     

    -

    The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

    -

    Error Responses: -- 400 Bad Request: Invalid request body or missing instance name -- 503 Service Unavailable: Instance is not running and on-demand start is disabled -- 409 Conflict: Cannot start instance due to maximum instances limit

    +

    The server routes requests to the appropriate instance based on the model field in the request body. Instances with on-demand starting enabled will be automatically started if not running. For configuration details, see Managing Instances.

    +

    Error Responses:
    +- 400 Bad Request: Invalid request body or missing instance name
    +- 503 Service Unavailable: Instance is not running and on-demand start is disabled
    +- 409 Conflict: Cannot start instance due to maximum instances limit

    Instance Status Values

    -

    Instances can have the following status values: -- stopped: Instance is not running -- running: Instance is running and ready to accept requests +

    Instances can have the following status values:
    +- stopped: Instance is not running
    +- running: Instance is running and ready to accept requests
    - failed: Instance failed to start or crashed

    Error Responses

    -

    All endpoints may return error responses in the following format:

    +

    All endpoints may return error responses in the following format:

    {
       "error": "Error message description"
     }
     

    Common HTTP Status Codes

      -
    • 200: Success
    • -
    • 201: Created
    • -
    • 204: No Content (successful deletion)
    • -
    • 400: Bad Request (invalid parameters or request body)
    • -
    • 401: Unauthorized (missing or invalid API key)
    • -
    • 403: Forbidden (insufficient permissions)
    • -
    • 404: Not Found (instance not found)
    • -
    • 409: Conflict (instance already exists, max instances reached)
    • -
    • 500: Internal Server Error
    • -
    • 503: Service Unavailable (instance not running)
    • +
    • 200: Success
    • +
    • 201: Created
    • +
    • 204: No Content (successful deletion)
    • +
    • 400: Bad Request (invalid parameters or request body)
    • +
    • 401: Unauthorized (missing or invalid API key)
    • +
    • 403: Forbidden (insufficient permissions)
    • +
    • 404: Not Found (instance not found)
    • +
    • 409: Conflict (instance already exists, max instances reached)
    • +
    • 500: Internal Server Error
    • +
    • 503: Service Unavailable (instance not running)

    Examples

    Complete Instance Lifecycle

    @@ -1704,7 +1704,7 @@ }'

    Using the Proxy Endpoint

    -

    You can also directly proxy requests to the llama-server instance:

    +

    You can also directly proxy requests to the llama-server instance:

    # Direct proxy to instance (bypasses OpenAI compatibility layer)
     curl -X POST http://localhost:8080/api/v1/instances/my-model/proxy/completion \
       -H "Content-Type: application/json" \
    @@ -1716,17 +1716,17 @@
     

    Backend-Specific Endpoints

    Parse Commands

    -

    Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

    +

    Llamactl provides endpoints to parse command strings from different backends into instance configuration options.

    Parse Llama.cpp Command

    -

    Parse a llama-server command string into instance options.

    +

    Parse a llama-server command string into instance options.

    POST /api/v1/backends/llama-cpp/parse-command
     
    -

    Request Body: +

    Request Body:

    {
       "command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
     }
     

    -

    Response: +

    Response:

    {
       "backend_type": "llama_cpp",
       "llama_server_options": {
    @@ -1737,15 +1737,15 @@
     }
     

    Parse MLX-LM Command

    -

    Parse an MLX-LM server command string into instance options.

    +

    Parse an MLX-LM server command string into instance options.

    POST /api/v1/backends/mlx/parse-command
     
    -

    Request Body: +

    Request Body:

    {
       "command": "mlx_lm.server --model /path/to/model --port 8080"
     }
     

    -

    Response: +

    Response:

    {
       "backend_type": "mlx_lm",
       "mlx_server_options": {
    @@ -1755,15 +1755,15 @@
     }
     

    Parse vLLM Command

    -

    Parse a vLLM serve command string into instance options.

    +

    Parse a vLLM serve command string into instance options.

    POST /api/v1/backends/vllm/parse-command
     
    -

    Request Body: +

    Request Body:

    {
       "command": "vllm serve /path/to/model --port 8080"
     }
     

    -

    Response: +

    Response:

    {
       "backend_type": "vllm",
       "vllm_server_options": {
    @@ -1772,20 +1772,20 @@
       }
     }
     

    -

    Error Responses for Parse Commands: -- 400 Bad Request: Invalid request body, empty command, or parse error -- 500 Internal Server Error: Encoding error

    +

    Error Responses for Parse Commands:
    +- 400 Bad Request: Invalid request body, empty command, or parse error
    +- 500 Internal Server Error: Encoding error

    Auto-Generated Documentation

    -

    The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

    +

    The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:

      -
    1. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
    2. -
    3. Generate docs: swag init -g cmd/server/main.go -o apidocs
    4. +
    5. Install the swag tool: go install github.com/swaggo/swag/cmd/swag@latest
    6. +
    7. Generate docs: swag init -g cmd/server/main.go -o apidocs

    Swagger Documentation

    -

    If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

    +

    If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:

    http://localhost:8080/swagger/
     
    -

    This provides a complete interactive interface for testing all API endpoints.

    +

    This provides a complete interactive interface for testing all API endpoints.

    diff --git a/dev/user-guide/managing-instances/index.html b/dev/user-guide/managing-instances/index.html index ca1999b..8ad6120 100644 --- a/dev/user-guide/managing-instances/index.html +++ b/dev/user-guide/managing-instances/index.html @@ -1228,63 +1228,63 @@

    Managing Instances

    -

    Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

    +

    Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

    Overview

    -

    Llamactl provides two ways to manage instances:

    +

    Llamactl provides two ways to manage instances:

      -
    • Web UI: Accessible at http://localhost:8080 with an intuitive dashboard
    • -
    • REST API: Programmatic access for automation and integration
    • +
    • Web UI: Accessible at http://localhost:8080 with an intuitive dashboard
    • +
    • REST API: Programmatic access for automation and integration
    -

    Dashboard Screenshot

    +

    Dashboard Screenshot

    Authentication

    -

    If authentication is enabled: -1. Navigate to the web UI -2. Enter your credentials -3. Bearer token is stored for the session

    +

    If authentication is enabled:
    +1. Navigate to the web UI
    +2. Enter your credentials
    +3. Bearer token is stored for the session

    Theme Support

      -
    • Switch between light and dark themes
    • -
    • Setting is remembered across sessions
    • +
    • Switch between light and dark themes
    • +
    • Setting is remembered across sessions

    Instance Cards

    -

    Each instance is displayed as a card showing:

    +

    Each instance is displayed as a card showing:

      -
    • Instance name
    • -
    • Health status badge (unknown, ready, error, failed)
    • -
    • Action buttons (start, stop, edit, logs, delete)
    • +
    • Instance name
    • +
    • Health status badge (unknown, ready, error, failed)
    • +
    • Action buttons (start, stop, edit, logs, delete)

    Create Instance

    Via Web UI

    -

    Create Instance Screenshot

    +

    Create Instance Screenshot

      -
    1. Click the "Create Instance" button on the dashboard
    2. -
    3. Enter a unique Name for your instance (only required field)
    4. -
    5. Select Target Node: Choose which node to deploy the instance to from the dropdown
    6. -
    7. Choose Backend Type:
        -
      • llama.cpp: For GGUF models using llama-server
      • -
      • MLX: For MLX-optimized models (macOS only)
      • -
      • vLLM: For distributed serving and high-throughput inference
      • +
      • Click the "Create Instance" button on the dashboard
      • +
      • Enter a unique Name for your instance (only required field)
      • +
      • Select Target Node: Choose which node to deploy the instance to from the dropdown
      • +
      • Choose Backend Type:
          +
        • llama.cpp: For GGUF models using llama-server
        • +
        • MLX: For MLX-optimized models (macOS only)
        • +
        • vLLM: For distributed serving and high-throughput inference
      • -
      • Configure model source:
          -
        • For llama.cpp: GGUF model path or HuggingFace repo
        • -
        • For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
        • -
        • For vLLM: HuggingFace model identifier (e.g., microsoft/DialoGPT-medium)
        • +
        • Configure model source:
            +
          • For llama.cpp: GGUF model path or HuggingFace repo
          • +
          • For MLX: MLX model path or identifier (e.g., mlx-community/Mistral-7B-Instruct-v0.3-4bit)
          • +
          • For vLLM: HuggingFace model identifier (e.g., microsoft/DialoGPT-medium)
        • -
        • Configure optional instance management settings:
            -
          • Auto Restart: Automatically restart instance on failure
          • -
          • Max Restarts: Maximum number of restart attempts
          • -
          • Restart Delay: Delay in seconds between restart attempts
          • -
          • On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
          • -
          • Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
          • -
          • Environment Variables: Set custom environment variables for the instance process
          • +
          • Configure optional instance management settings:
              +
            • Auto Restart: Automatically restart instance on failure
            • +
            • Max Restarts: Maximum number of restart attempts
            • +
            • Restart Delay: Delay in seconds between restart attempts
            • +
            • On Demand Start: Start instance when receiving a request to the OpenAI compatible endpoint
            • +
            • Idle Timeout: Minutes before stopping idle instance (set to 0 to disable)
            • +
            • Environment Variables: Set custom environment variables for the instance process
          • -
          • Configure backend-specific options:
              -
            • llama.cpp: Threads, context size, GPU layers, port, etc.
            • -
            • MLX: Temperature, top-p, adapter path, Python environment, etc.
            • -
            • vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
            • +
            • Configure backend-specific options:
                +
              • llama.cpp: Threads, context size, GPU layers, port, etc.
              • +
              • MLX: Temperature, top-p, adapter path, Python environment, etc.
              • +
              • vLLM: Tensor parallel size, GPU memory utilization, quantization, etc.
            • Click "Create" to save the instance
            • @@ -1364,10 +1364,10 @@

              Start Instance

              Via Web UI

                -
              1. Click the "Start" button on an instance card
              2. -
              3. Watch the status change to "Unknown"
              4. -
              5. Monitor progress in the logs
              6. -
              7. Instance status changes to "Ready" when ready
              8. +
              9. Click the "Start" button on an instance card
              10. +
              11. Watch the status change to "Unknown"
              12. +
              13. Monitor progress in the logs
              14. +
              15. Instance status changes to "Ready" when ready

              Via API

              curl -X POST http://localhost:8080/api/instances/{name}/start
              @@ -1375,8 +1375,8 @@
               

              Stop Instance

              Via Web UI

                -
              1. Click the "Stop" button on an instance card
              2. -
              3. Instance gracefully shuts down
              4. +
              5. Click the "Stop" button on an instance card
              6. +
              7. Instance gracefully shuts down

              Via API

              curl -X POST http://localhost:8080/api/instances/{name}/stop
              @@ -1384,13 +1384,13 @@
               

              Edit Instance

              Via Web UI

                -
              1. Click the "Edit" button on an instance card
              2. -
              3. Modify settings in the configuration dialog
              4. -
              5. Changes require instance restart to take effect
              6. -
              7. Click "Update & Restart" to apply changes
              8. +
              9. Click the "Edit" button on an instance card
              10. +
              11. Modify settings in the configuration dialog
              12. +
              13. Changes require instance restart to take effect
              14. +
              15. Click "Update & Restart" to apply changes

              Via API

              -

              Modify instance settings:

              +

              Modify instance settings:

              curl -X PUT http://localhost:8080/api/instances/{name} \
                 -H "Content-Type: application/json" \
                 -d '{
              @@ -1402,45 +1402,45 @@
               

              Note

              -

              Configuration changes require restarting the instance to take effect.

              +

              Configuration changes require restarting the instance to take effect.

              View Logs

              Via Web UI

                -
              1. Click the "Logs" button on any instance card
              2. -
              3. Real-time log viewer opens
              4. +
              5. Click the "Logs" button on any instance card
              6. +
              7. Real-time log viewer opens

              Via API

              -

Retrieve instance logs via the API:

              +

Retrieve instance logs via the API:

# Get instance logs
               curl http://localhost:8080/api/instances/{name}/logs
               

              Delete Instance

              Via Web UI

                -
              1. Click the "Delete" button on an instance card
              2. -
              3. Only stopped instances can be deleted
              4. -
              5. Confirm deletion in the dialog
              6. +
              7. Click the "Delete" button on an instance card
              8. +
              9. Only stopped instances can be deleted
              10. +
              11. Confirm deletion in the dialog

              Via API

              curl -X DELETE http://localhost:8080/api/instances/{name}
               

              Instance Proxy

              -

              Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

              +

              Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

# Proxy a request to the instance's backend
               curl http://localhost:8080/api/instances/{name}/proxy/
               
              -

              All backends provide OpenAI-compatible endpoints. Check the respective documentation: -- llama-server docs -- MLX-LM docs -- vLLM docs

              +

              All backends provide OpenAI-compatible endpoints. Check the respective documentation:
              +- llama-server docs
              +- MLX-LM docs
              +- vLLM docs

              Instance Health

              Via Web UI

                -
              1. The health status badge is displayed on each instance card
              2. +
              3. The health status badge is displayed on each instance card

              Via API

              -

              Check the health status of your instances:

              +

              Check the health status of your instances:

              curl http://localhost:8080/api/instances/{name}/proxy/health
               
              diff --git a/dev/user-guide/troubleshooting/index.html b/dev/user-guide/troubleshooting/index.html index 0de9c30..e3f6eeb 100644 --- a/dev/user-guide/troubleshooting/index.html +++ b/dev/user-guide/troubleshooting/index.html @@ -998,12 +998,12 @@

              Troubleshooting

              -

              Issues specific to Llamactl deployment and operation.

              +

              Issues specific to Llamactl deployment and operation.

              Configuration Issues

              Invalid Configuration

              -

              Problem: Invalid configuration preventing startup

              -

              Solutions: -1. Use minimal configuration: +

              Problem: Invalid configuration preventing startup

              +

              Solutions:
              +1. Use minimal configuration:

              server:
                 host: "0.0.0.0"
                 port: 8080
              @@ -1011,23 +1011,23 @@
                 port_range: [8000, 9000]
               

                -
              1. Check data directory permissions: +
              2. Check data directory permissions:
                # Ensure data directory is writable (default: ~/.local/share/llamactl)
                 mkdir -p ~/.local/share/llamactl/{instances,logs}
                 

              Instance Management Issues

              Model Loading Failures

              -

              Problem: Instance fails to start with model loading errors

              +

              Problem: Instance fails to start with model loading errors

              Common Solutions:
              - llama-server not found: Ensure llama-server binary is in PATH
              - Wrong model format: Ensure model is in GGUF format
              - Insufficient memory: Use smaller model or reduce context size
              - Path issues: Use absolute paths to model files

              Memory Issues

              -

              Problem: Out of memory errors or system becomes unresponsive

              -

              Solutions: -1. Reduce context size: +

              Problem: Out of memory errors or system becomes unresponsive

              +

              Solutions:
              +1. Reduce context size:

              {
                 "n_ctx": 1024
               }
              @@ -1038,16 +1038,16 @@
               
            • Use smaller model variants (7B instead of 13B)

    GPU Configuration

    -

    Problem: GPU not being used effectively

    -

    Solutions: -1. Configure GPU layers: +

    Problem: GPU not being used effectively

    +

    Solutions:
    +1. Configure GPU layers:

    {
       "n_gpu_layers": 35
     }
     

    Advanced Instance Issues

    -

    Problem: Complex model loading, performance, or compatibility issues

    -

    Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

    +

    Problem: Complex model loading, performance, or compatibility issues

    +

    Since llamactl uses llama-server under the hood, many instance-related issues are actually llama.cpp issues. For advanced troubleshooting:

    Resources:
    - llama.cpp Documentation: https://github.com/ggml/llama.cpp
    - llama.cpp Issues: https://github.com/ggml/llama.cpp/issues
    @@ -1056,28 +1056,28 @@

    # Test your model and parameters directly with llama-server
     llama-server --model /path/to/model.gguf --port 8081 --n-gpu-layers 35
     

    -

    This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.

    +

    This helps determine if the issue is with llamactl or with the underlying llama.cpp/llama-server.

    API and Network Issues

    CORS Errors

    -

    Problem: Web UI shows CORS errors in browser console

    -

    Solutions: -1. Configure allowed origins: +

    Problem: Web UI shows CORS errors in browser console

    +

    Solutions:
    +1. Configure allowed origins:

    server:
       allowed_origins:
         - "http://localhost:3000"
         - "https://yourdomain.com"
     

    Authentication Issues

    -

    Problem: API requests failing with authentication errors

    -

    Solutions: -1. Disable authentication temporarily: +

    Problem: API requests failing with authentication errors

    +

    Solutions:
    +1. Disable authentication temporarily:

    auth:
       require_management_auth: false
       require_inference_auth: false
     

    1. -

      Configure API keys: +

      Configure API keys:

      auth:
         management_keys:
           - "your-management-key"
      @@ -1086,7 +1086,7 @@
       

    2. -

      Use correct Authorization header: +

      Use correct Authorization header:

      curl -H "Authorization: Bearer your-api-key" \
         http://localhost:8080/api/v1/instances
       

      @@ -1094,9 +1094,9 @@

    Remote Node Issues

    Node Configuration

    -

    Problem: Remote instances not appearing or cannot be managed

    -

    Solutions: -1. Verify node configuration: +

    Problem: Remote instances not appearing or cannot be managed

    +

    Solutions:
    +1. Verify node configuration:

    local_node: "main"  # Must match a key in nodes map
     nodes:
       main:
    @@ -1106,7 +1106,7 @@
         api_key: "secure-key"  # Must match worker1's management key
     

      -
    1. Test remote node connectivity: +
    2. Test remote node connectivity:
      curl -H "Authorization: Bearer remote-node-key" \
         http://remote-node:8080/api/v1/instances
       
    3. @@ -1124,21 +1124,21 @@ llamactl

      Getting Help

      -

      When reporting issues, include:

      +

      When reporting issues, include:

      1. -

        System information: +

        System information:

        llamactl --version
         

      2. -

        Configuration file (remove sensitive keys)

        +

        Configuration file (remove sensitive keys)

      3. -

        Relevant log output

        +

        Relevant log output

      4. -

        Steps to reproduce the issue

        +

        Steps to reproduce the issue