diff --git a/dev/__pycache__/fix_line_endings.cpython-311.pyc b/dev/__pycache__/fix_line_endings.cpython-311.pyc
index efe5d54..c77657e 100644
Binary files a/dev/__pycache__/fix_line_endings.cpython-311.pyc and b/dev/__pycache__/fix_line_endings.cpython-311.pyc differ
diff --git a/dev/__pycache__/readme_sync.cpython-311.pyc b/dev/__pycache__/readme_sync.cpython-311.pyc
index ac14c60..3dfeb92 100644
Binary files a/dev/__pycache__/readme_sync.cpython-311.pyc and b/dev/__pycache__/readme_sync.cpython-311.pyc differ
diff --git a/dev/configuration/index.html b/dev/configuration/index.html
index 1211158..d0f684d 100644
--- a/dev/configuration/index.html
+++ b/dev/configuration/index.html
@@ -806,13 +806,12 @@
 <a id="__codelineno-1-59" name="__codelineno-1-59" href="#__codelineno-1-59"></a>
 <a id="__codelineno-1-60" name="__codelineno-1-60" href="#__codelineno-1-60"></a><span class="nt">auth</span><span class="p">:</span>
 <a id="__codelineno-1-61" name="__codelineno-1-61" href="#__codelineno-1-61"></a><span class="w">  </span><span class="nt">require_inference_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w">   </span><span class="c1"># Require auth for inference endpoints</span>
-<a id="__codelineno-1-62" name="__codelineno-1-62" href="#__codelineno-1-62"></a><span class="w">  </span><span class="nt">inference_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w">             </span><span class="c1"># Keys for inference endpoints</span>
-<a id="__codelineno-1-63" name="__codelineno-1-63" href="#__codelineno-1-63"></a><span class="w">  </span><span class="nt">require_management_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w">  </span><span class="c1"># Require auth for management endpoints</span>
-<a id="__codelineno-1-64" name="__codelineno-1-64" href="#__codelineno-1-64"></a><span class="w">  </span><span class="nt">management_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w">            </span><span class="c1"># Keys for management endpoints</span>
-<a id="__codelineno-1-65" name="__codelineno-1-65" href="#__codelineno-1-65"></a>
-<a id="__codelineno-1-66" name="__codelineno-1-66" href="#__codelineno-1-66"></a><span class="nt">local_node</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;main&quot;</span><span class="w">               </span><span class="c1"># Name of the local node (default: &quot;main&quot;)</span>
-<a id="__codelineno-1-67" name="__codelineno-1-67" href="#__codelineno-1-67"></a><span class="nt">nodes</span><span class="p">:</span><span class="w">                           </span><span class="c1"># Node configuration for multi-node deployment</span>
-<a id="__codelineno-1-68" name="__codelineno-1-68" href="#__codelineno-1-68"></a><span class="w">  </span><span class="nt">main</span><span class="p">:</span><span class="w">                          </span><span class="c1"># Default local node (empty config)</span>
+<a id="__codelineno-1-62" name="__codelineno-1-62" href="#__codelineno-1-62"></a><span class="w">  </span><span class="nt">require_management_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w">  </span><span class="c1"># Require auth for management endpoints</span>
+<a id="__codelineno-1-63" name="__codelineno-1-63" href="#__codelineno-1-63"></a><span class="w">  </span><span class="nt">management_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w">            </span><span class="c1"># Keys for management endpoints</span>
+<a id="__codelineno-1-64" name="__codelineno-1-64" href="#__codelineno-1-64"></a>
+<a id="__codelineno-1-65" name="__codelineno-1-65" href="#__codelineno-1-65"></a><span class="nt">local_node</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;main&quot;</span><span class="w">               </span><span class="c1"># Name of the local node (default: &quot;main&quot;)</span>
+<a id="__codelineno-1-66" name="__codelineno-1-66" href="#__codelineno-1-66"></a><span class="nt">nodes</span><span class="p">:</span><span class="w">                           </span><span class="c1"># Node configuration for multi-node deployment</span>
+<a id="__codelineno-1-67" name="__codelineno-1-67" href="#__codelineno-1-67"></a><span class="w">  </span><span class="nt">main</span><span class="p">:</span><span class="w">                          </span><span class="c1"># Default local node (empty config)</span>
 </code></pre></div>
 <h2 id="configuration-files">Configuration Files<a class="headerlink" href="#configuration-files" title="Permanent link">&para;</a></h2>
 <h3 id="configuration-file-locations">Configuration File Locations<a class="headerlink" href="#configuration-file-locations" title="Permanent link">&para;</a></h3>
@@ -965,15 +964,30 @@
 - <code>LLAMACTL_DATABASE_MAX_IDLE_CONNECTIONS</code> - Maximum idle database connections<br />
 - <code>LLAMACTL_DATABASE_CONN_MAX_LIFETIME</code> - Connection max lifetime (e.g., "5m", "1h")  </p>
 <h3 id="authentication-configuration">Authentication Configuration<a class="headerlink" href="#authentication-configuration" title="Permanent link">&para;</a></h3>
+<p>llamactl supports two types of authentication:  </p>
+<ul>
+<li><strong>Management API Keys</strong>: For accessing the web UI and management API (creating/managing instances). These can be configured in the config file or via environment variables.  </li>
+<li><strong>Inference API Keys</strong>: For accessing the OpenAI-compatible inference endpoints. These are managed via the web UI (Settings → API Keys) or the management API and are stored in the database.  </li>
+</ul>
 <div class="highlight"><pre><span></span><code><a id="__codelineno-7-1" name="__codelineno-7-1" href="#__codelineno-7-1"></a><span class="nt">auth</span><span class="p">:</span>
 <a id="__codelineno-7-2" name="__codelineno-7-2" href="#__codelineno-7-2"></a><span class="w">  </span><span class="nt">require_inference_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w">           </span><span class="c1"># Require API key for OpenAI endpoints (default: true)</span>
-<a id="__codelineno-7-3" name="__codelineno-7-3" href="#__codelineno-7-3"></a><span class="w">  </span><span class="nt">inference_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w">                     </span><span class="c1"># List of valid inference API keys</span>
-<a id="__codelineno-7-4" name="__codelineno-7-4" href="#__codelineno-7-4"></a><span class="w">  </span><span class="nt">require_management_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w">          </span><span class="c1"># Require API key for management endpoints (default: true)</span>
-<a id="__codelineno-7-5" name="__codelineno-7-5" href="#__codelineno-7-5"></a><span class="w">  </span><span class="nt">management_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w">                    </span><span class="c1"># List of valid management API keys</span>
+<a id="__codelineno-7-3" name="__codelineno-7-3" href="#__codelineno-7-3"></a><span class="w">  </span><span class="nt">require_management_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w">          </span><span class="c1"># Require API key for management endpoints (default: true)</span>
+<a id="__codelineno-7-4" name="__codelineno-7-4" href="#__codelineno-7-4"></a><span class="w">  </span><span class="nt">management_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w">                    </span><span class="c1"># List of valid management API keys</span>
 </code></pre></div>
+<p><strong>Managing Inference API Keys:</strong>  </p>
+<p>Inference API keys are managed through the web UI or management API and stored in the database. To create and manage inference keys:  </p>
+<ol>
+<li>Open the web UI and log in with a management API key  </li>
+<li>Navigate to <strong>Settings → API Keys</strong>  </li>
+<li>Click <strong>Create API Key</strong>  </li>
+<li>Configure the key:  <ul>
+<li><strong>Name</strong>: A descriptive name for the key  </li>
+<li><strong>Expiration</strong>: Optional expiration date  </li>
+<li><strong>Permissions</strong>: Grant access to all instances or specific instances only  </li></ul></li>
+<li>Copy the generated key - it won't be shown again  </li>
+</ol>
 <p><strong>Environment Variables:</strong><br />
 - <code>LLAMACTL_REQUIRE_INFERENCE_AUTH</code> - Require auth for OpenAI endpoints (true/false)<br />
-- <code>LLAMACTL_INFERENCE_KEYS</code> - Comma-separated inference API keys<br />
 - <code>LLAMACTL_REQUIRE_MANAGEMENT_AUTH</code> - Require auth for management endpoints (true/false)<br />
 - <code>LLAMACTL_MANAGEMENT_KEYS</code> - Comma-separated management API keys  </p>
 <h3 id="remote-node-configuration">Remote Node Configuration<a class="headerlink" href="#remote-node-configuration" title="Permanent link">&para;</a></h3>
@@ -1014,7 +1028,7 @@
     <span class="md-icon" title="Last update">
       <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1zM12.5 7v5.2l4 2.4-1 1L11 13V7zM11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2z"/></svg>
     </span>
-    <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="December 2, 2025 19:07:39 UTC">December 2, 2025</span>
+    <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="December 8, 2025 18:15:42 UTC">December 8, 2025</span>
   </span>
 
     
diff --git a/dev/docs.go b/dev/docs.go
index 4a9bce6..a3369fe 100644
--- a/dev/docs.go
+++ b/dev/docs.go
@@ -2063,20 +2063,19 @@ const docTemplate = `{
         "server.CreateKeyRequest": {
             "type": "object",
             "properties": {
-                "expiresAt": {
-                    "type": "integer",
-                    "format": "int64"
+                "expires_at": {
+                    "type": "integer"
                 },
-                "instancePermissions": {
+                "instance_ids": {
                     "type": "array",
                     "items": {
-                        "$ref": "#/definitions/server.InstancePermission"
+                        "type": "integer"
                     }
                 },
                 "name": {
                     "type": "string"
                 },
-                "permissionMode": {
+                "permission_mode": {
                     "$ref": "#/definitions/auth.PermissionMode"
                 }
             }
@@ -2087,9 +2086,6 @@ const docTemplate = `{
                 "created_at": {
                     "type": "integer"
                 },
-                "enabled": {
-                    "type": "boolean"
-                },
                 "expires_at": {
                     "type": "integer"
                 },
@@ -2116,29 +2112,9 @@ const docTemplate = `{
                 }
             }
         },
-        "server.InstancePermission": {
-            "type": "object",
-            "properties": {
-                "can_infer": {
-                    "type": "boolean"
-                },
-                "can_view_logs": {
-                    "type": "boolean"
-                },
-                "instance_id": {
-                    "type": "integer"
-                }
-            }
-        },
         "server.KeyPermissionResponse": {
             "type": "object",
             "properties": {
-                "can_infer": {
-                    "type": "boolean"
-                },
-                "can_view_logs": {
-                    "type": "boolean"
-                },
                 "instance_id": {
                     "type": "integer"
                 },
@@ -2153,9 +2129,6 @@ const docTemplate = `{
                 "created_at": {
                     "type": "integer"
                 },
-                "enabled": {
-                    "type": "boolean"
-                },
                 "expires_at": {
                     "type": "integer"
                 },
diff --git a/dev/quick-start/index.html b/dev/quick-start/index.html
index ebe5ede..e4062f6 100644
--- a/dev/quick-start/index.html
+++ b/dev/quick-start/index.html
@@ -564,6 +564,15 @@
     </span>
   </a>
   
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#create-an-inference-api-key" class="md-nav__link">
+    <span class="md-ellipsis">
+      Create an Inference API Key
+    </span>
+  </a>
+  
 </li>
       
         <li class="md-nav__item">
@@ -773,10 +782,10 @@
 <h2 id="authentication">Authentication<a class="headerlink" href="#authentication" title="Permanent link">&para;</a></h2>
 <p>Llamactl uses two types of API keys:  </p>
 <ul>
-<li><strong>Management API Key</strong>: Used to authenticate with the Llamactl management API (creating, starting, stopping instances).  </li>
-<li><strong>Inference API Key</strong>: Used to authenticate requests to the OpenAI-compatible endpoints (<code>/v1/chat/completions</code>, <code>/v1/completions</code>, etc.).  </li>
+<li><strong>Management API Key</strong>: Used to authenticate with the Llamactl management API and web UI. If not configured, one is auto-generated at startup and printed to the terminal.  </li>
+<li><strong>Inference API Key</strong>: Used to authenticate requests to the OpenAI-compatible endpoints (<code>/v1/chat/completions</code>, <code>/v1/completions</code>, etc.). These are created and managed via the web UI.  </li>
 </ul>
-<p>By default, authentication is required. If you don't configure these keys in your configuration file, llamactl will auto-generate them and print them to the terminal on startup. You can also configure custom keys or disable authentication entirely in the <a href="../configuration/">Configuration</a> guide.  </p>
+<p>By default, authentication is required for both management and inference endpoints. You can configure custom management keys or disable authentication in the <a href="../configuration/">Configuration</a> guide.  </p>
 <h2 id="start-llamactl">Start Llamactl<a class="headerlink" href="#start-llamactl" title="Permanent link">&para;</a></h2>
 <p>Start the Llamactl server:  </p>
 <div class="highlight"><pre><span></span><code><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>llamactl
@@ -789,22 +798,15 @@
 <a id="__codelineno-1-6" name="__codelineno-1-6" href="#__codelineno-1-6"></a>    sk-management-...
 <a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a>
 <a id="__codelineno-1-8" name="__codelineno-1-8" href="#__codelineno-1-8"></a>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-<a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a>⚠️  INFERENCE AUTHENTICATION REQUIRED
+<a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a>⚠️  IMPORTANT
 <a id="__codelineno-1-10" name="__codelineno-1-10" href="#__codelineno-1-10"></a>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-<a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a>🔑  Generated Inference API Key:
-<a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a>
-<a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a>    sk-inference-...
-<a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a>
-<a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-<a id="__codelineno-1-16" name="__codelineno-1-16" href="#__codelineno-1-16"></a>⚠️  IMPORTANT
-<a id="__codelineno-1-17" name="__codelineno-1-17" href="#__codelineno-1-17"></a>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-<a id="__codelineno-1-18" name="__codelineno-1-18" href="#__codelineno-1-18"></a>• These keys are auto-generated and will change on restart
-<a id="__codelineno-1-19" name="__codelineno-1-19" href="#__codelineno-1-19"></a>• For production, add explicit keys to your configuration
-<a id="__codelineno-1-20" name="__codelineno-1-20" href="#__codelineno-1-20"></a>• Copy these keys before they disappear from the terminal
-<a id="__codelineno-1-21" name="__codelineno-1-21" href="#__codelineno-1-21"></a>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-<a id="__codelineno-1-22" name="__codelineno-1-22" href="#__codelineno-1-22"></a>Llamactl server listening on 0.0.0.0:8080
+<a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a>• This key is auto-generated and will change on restart
+<a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a>• For production, add explicit management_keys to your configuration
+<a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a>• Copy this key before it disappears from the terminal
+<a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+<a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a>Llamactl server listening on 0.0.0.0:8080
 </code></pre></div>
-<p>Copy the <strong>Management</strong> and <strong>Inference</strong> API Keys from the terminal - you'll need them to access the web UI and make inference requests.  </p>
+<p>Copy the <strong>Management API Key</strong> from the terminal - you'll need it to access the web UI.  </p>
 <p>By default, Llamactl will start on <code>http://localhost:8080</code>.  </p>
 <h2 id="access-the-web-ui">Access the Web UI<a class="headerlink" href="#access-the-web-ui" title="Permanent link">&para;</a></h2>
 <p>Open your web browser and navigate to:  </p>
@@ -826,7 +828,7 @@
 </ul>
 <div class="admonition tip">
 <p class="admonition-title">Auto-Assignment</p>
-<p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.  </p>
+<p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and manages API keys if authentication is enabled. You typically don't need to manually specify these values.  </p>
 </div>
 <div class="admonition note">
 <p class="admonition-title">Remote Node Deployment</p>
@@ -845,6 +847,21 @@
 <li><strong>View logs</strong> by clicking the logs button  </li>
 <li><strong>Stop</strong> the instance when needed  </li>
 </ul>
+<h2 id="create-an-inference-api-key">Create an Inference API Key<a class="headerlink" href="#create-an-inference-api-key" title="Permanent link">&para;</a></h2>
+<p>To make inference requests to your instances, you'll need an inference API key:  </p>
+<ol>
+<li>In the web UI, click the <strong>Settings</strong> icon (gear icon in the top-right)  </li>
+<li>Navigate to the <strong>API Keys</strong> tab  </li>
+<li>Click <strong>Create API Key</strong>  </li>
+<li>Configure your key:  <ul>
+<li><strong>Name</strong>: Give it a descriptive name (e.g., "Production Key", "Development Key")  </li>
+<li><strong>Expiration</strong>: Optionally set an expiration date for the key  </li>
+<li><strong>Permissions</strong>: Choose whether the key can access all instances or only specific ones  </li></ul></li>
+<li>Click <strong>Create</strong>  </li>
+<li><strong>Copy the generated key</strong> - it will only be shown once!  </li>
+</ol>
+<p>The key will look like: <code>llamactl-...</code>  </p>
+<p>You can create multiple inference keys with different permissions for different use cases (e.g., one for development, one for production, or keys limited to specific instances).  </p>
 <h2 id="example-configurations">Example Configurations<a class="headerlink" href="#example-configurations" title="Permanent link">&para;</a></h2>
 <p>Here are basic example configurations for each backend:  </p>
 <p><strong>llama.cpp backend:</strong><br />
@@ -966,7 +983,7 @@
 </code></pre></div>
 <div class="admonition note">
 <p class="admonition-title">API Key</p>
-<p>If you disabled authentication in your config, you can use any value for <code>api_key</code> (e.g., <code>"not-needed"</code>). Otherwise, use the inference API key shown in the terminal output on startup.  </p>
+<p>If you disabled authentication in your config, you can use any value for <code>api_key</code> (e.g., <code>"not-needed"</code>). Otherwise, use the inference API key you created via the web UI (Settings → API Keys).  </p>
 </div>
 <h3 id="list-available-models">List Available Models<a class="headerlink" href="#list-available-models" title="Permanent link">&para;</a></h3>
 <p>Get a list of running instances (models) in OpenAI-compatible format:  </p>
@@ -998,7 +1015,7 @@
     <span class="md-icon" title="Last update">
       <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1zM12.5 7v5.2l4 2.4-1 1L11 13V7zM11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2z"/></svg>
     </span>
-    <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="October 26, 2025 16:19:53 UTC">October 26, 2025</span>
+    <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="December 8, 2025 18:15:42 UTC">December 8, 2025</span>
   </span>
 
     
diff --git a/dev/search/search_index.json b/dev/search/search_index.json
index d7d5e0b..0ac7796 100644
--- a/dev/search/search_index.json
+++ b/dev/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"<p>Welcome to the Llamactl documentation!  </p> <p> </p>"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"<p>Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard. </p>"},{"location":"#features","title":"Features","text":"<p>\ud83d\ude80 Easy Model Management - Multiple Models Simultaneously: Run different models at the same time (7B for speed, 70B for quality) - Smart Resource Management: Automatic idle timeout, LRU eviction, and configurable instance limits - Web Dashboard: Modern React UI for managing instances, monitoring health, and viewing logs  </p> <p>\ud83d\udd17 Flexible Integration - OpenAI API Compatible: Drop-in replacement - route requests to different models by instance name - Multi-Backend Support: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM - Docker Ready: Run backends in containers with full GPU support  </p> <p>\ud83c\udf10 Distributed Deployment - Remote Instances: Deploy instances on remote hosts - Central Management: Manage everything from a single dashboard with automatic routing    </p>"},{"location":"#quick-links","title":"Quick Links","text":"<ul> <li>Installation Guide - Get Llamactl up and running  </li> <li>Configuration Guide - Detailed configuration options  </li> <li>Quick Start - Your first steps with Llamactl  </li> <li>Managing Instances - Instance lifecycle management  </li> <li>API Reference - Complete API documentation  </li> </ul>"},{"location":"#getting-help","title":"Getting Help","text":"<p>If you need help or have questions:  </p> <ul> <li>Check the Troubleshooting guide  </li> <li>Visit the GitHub repository </li> <li>Review the Configuration Guide for advanced settings  </li> </ul>"},{"location":"#license","title":"License","text":"<p>MIT License - see the LICENSE file.  
</p>"},{"location":"api-reference/","title":"API Reference","text":""},{"location":"api-reference/#llamactl-api-10","title":"llamactl API 1.0","text":"<p>llamactl is a control server for managing Llama Server instances.  </p> License: MIT License"},{"location":"api-reference/#keys","title":"Keys","text":""},{"location":"api-reference/#get-apiv1authkeys","title":"GET /api/v1/auth/keys","text":"<p>List all API keys  </p> Description <p>Returns a list of all API keys for the system user (excludes key hash and plain-text key)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1authkeys","title":"POST /api/v1/auth/keys","text":"<p>Create a new API key  </p> Description <p>Creates a new API key with the specified permissions and returns the plain- text key (only shown once)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>key</code> body None No API key configuration <p> Response 201 Created </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#delete-apiv1authkeysid","title":"DELETE /api/v1/auth/keys/{id}","text":"<p>Delete an API key  </p> Description <p>Deletes an API key by ID  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>id</code> path None No Key ID <p> Response 204 No Content </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1authkeysid","title":"GET /api/v1/auth/keys/{id}","text":"<p>Get details of a specific API key  </p> Description <p>Returns details for a specific API key by ID (excludes key hash and plain- text key)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description 
<code>ApiKeyAuth</code> header string N/A No <code>id</code> path None No Key ID <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1authkeysidpermissions","title":"GET /api/v1/auth/keys/{id}/permissions","text":"<p>Get API key permissions  </p> Description <p>Returns the instance-level permissions for a specific API key (includes instance names)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>id</code> path None No Key ID <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#backends","title":"Backends","text":""},{"location":"api-reference/#get-apiv1backendsllama-cppdevices","title":"GET /api/v1/backends/llama-cpp/devices","text":"<p>List available devices for llama server  </p> Description <p>Returns a list of available devices for the llama server  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1backendsllama-cpphelp","title":"GET /api/v1/backends/llama-cpp/help","text":"<p>Get help for llama server  </p> Description <p>Returns the help text for the llama server command  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1backendsllama-cppparse-command","title":"POST /api/v1/backends/llama-cpp/parse-command","text":"<p>Parse llama-server command  </p> Description <p>Parses a llama-server command string into instance options  </p> <p>Input parameters </p> Parameter In Type Default Nullable 
Description <code>ApiKeyAuth</code> header string N/A No <code>request</code> body None No Command to parse <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1backendsllama-cppversion","title":"GET /api/v1/backends/llama-cpp/version","text":"<p>Get version of llama server  </p> Description <p>Returns the version of the llama server command  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1backendsmlxparse-command","title":"POST /api/v1/backends/mlx/parse-command","text":"<p>Parse mlx_lm.server command  </p> Description <p>Parses MLX-LM server command string into instance options  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>request</code> body None No Command to parse <p> Response 200 OK </p> <p> Response 400 Bad Request </p>"},{"location":"api-reference/#post-apiv1backendsvllmparse-command","title":"POST /api/v1/backends/vllm/parse-command","text":"<p>Parse vllm serve command  </p> Description <p>Parses a vLLM serve command string into instance options  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>request</code> body None No Command to parse <p> Response 200 OK </p> <p> Response 400 Bad Request </p>"},{"location":"api-reference/#system","title":"System","text":""},{"location":"api-reference/#get-apiv1config","title":"GET /api/v1/config","text":"<p>Get server configuration  </p> Description <p>Returns the current server configuration (sanitized)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error 
</p>"},{"location":"api-reference/#get-apiv1version","title":"GET /api/v1/version","text":"<p>Get llamactl version  </p> Description <p>Returns the version of the llamactl command  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#instances","title":"Instances","text":""},{"location":"api-reference/#get-apiv1instances","title":"GET /api/v1/instances","text":"<p>List all instances  </p> Description <p>Returns a list of all instances managed by the server  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#delete-apiv1instancesname","title":"DELETE /api/v1/instances/{name}","text":"<p>Delete an instance  </p> Description <p>Stops and removes a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 204 No Content </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1instancesname","title":"GET /api/v1/instances/{name}","text":"<p>Get details of a specific instance  </p> Description <p>Returns the details of a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1instancesname","title":"POST /api/v1/instances/{name}","text":"<p>Create and start a new instance  </p> Description <p>Creates a new instance with the provided configuration options 
 </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <code>options</code> body None No Instance configuration options <p> Response 201 Created </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#put-apiv1instancesname","title":"PUT /api/v1/instances/{name}","text":"<p>Update an instance's configuration  </p> Description <p>Updates the configuration of a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <code>options</code> body None No Instance configuration options <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1instancesnamelogs","title":"GET /api/v1/instances/{name}/logs","text":"<p>Get logs from a specific instance  </p> Description <p>Returns the logs from a specific instance by name with optional line limit  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>lines</code> query None No Number of lines to retrieve (default: all lines) <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1instancesnameproxy","title":"GET /api/v1/instances/{name}/proxy","text":"<p>Proxy requests to a specific instance, does not autostart instance if stopped  </p> Description <p>Forwards HTTP requests to the llama-server instance running on a specific port  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad 
Request </p> <p> Response 500 Internal Server Error </p> <p> Response 503 Service Unavailable </p>"},{"location":"api-reference/#post-apiv1instancesnameproxy","title":"POST /api/v1/instances/{name}/proxy","text":"<p>Proxy requests to a specific instance, does not autostart instance if stopped  </p> Description <p>Forwards HTTP requests to the llama-server instance running on a specific port  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p> <p> Response 503 Service Unavailable </p>"},{"location":"api-reference/#post-apiv1instancesnamerestart","title":"POST /api/v1/instances/{name}/restart","text":"<p>Restart a running instance  </p> Description <p>Restarts a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1instancesnamestart","title":"POST /api/v1/instances/{name}/start","text":"<p>Start a stopped instance  </p> Description <p>Starts a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1instancesnamestop","title":"POST /api/v1/instances/{name}/stop","text":"<p>Stop a running instance  </p> Description <p>Stops a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None 
No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#nodes","title":"Nodes","text":""},{"location":"api-reference/#get-apiv1nodes","title":"GET /api/v1/nodes","text":"<p>List all configured nodes  </p> Description <p>Returns a map of all nodes configured in the server (node name -&gt; node config)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1nodesname","title":"GET /api/v1/nodes/{name}","text":"<p>Get details of a specific node  </p> Description <p>Returns the details of a specific node by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Node Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#llamacpp","title":"Llama.cpp","text":""},{"location":"api-reference/#get-llama-cppname","title":"GET /llama-cpp/{name}/","text":"<p>Proxy requests to llama.cpp UI for the instance  </p> Description <p>Proxies requests to the llama.cpp UI for the specified instance  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> query None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameapply-template","title":"POST /llama-cpp/{name}/apply-template","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default 
Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamecompletion","title":"POST /llama-cpp/{name}/completion","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamedetokenize","title":"POST /llama-cpp/{name}/detokenize","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameembeddings","title":"POST /llama-cpp/{name}/embeddings","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameinfill","title":"POST /llama-cpp/{name}/infill","text":"<p>Proxy requests to llama.cpp 
server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamemetrics","title":"POST /llama-cpp/{name}/metrics","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-llama-cppnameprops","title":"GET /llama-cpp/{name}/props","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameprops","title":"POST /llama-cpp/{name}/props","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 
Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamereranking","title":"POST /llama-cpp/{name}/reranking","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-llama-cppnameslots","title":"GET /llama-cpp/{name}/slots","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnametokenize","title":"POST /llama-cpp/{name}/tokenize","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#openai","title":"OpenAI","text":""},{"location":"api-reference/#post-v1","title":"POST /v1/","text":"<p>OpenAI-compatible proxy endpoint  </p> Description <p>Handles all POST requests to /v1/*, routing to the appropriate instance based on the request body. 
Requires API key authentication via the <code>Authorization</code> header.  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-v1models","title":"GET /v1/models","text":"<p>List instances in OpenAI-compatible format  </p> Description <p>Returns a list of instances in a format compatible with OpenAI API  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"configuration/","title":"Configuration","text":"<p>llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:  </p> <pre><code>Defaults &lt; Configuration file &lt; Environment variables\n</code></pre> <p>llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.  
</p>"},{"location":"configuration/#default-configuration","title":"Default Configuration","text":"<p>Here's the default configuration with all available options:  </p> <pre><code>server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  allowed_headers: [\"*\"]         # Allowed CORS headers (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    response_headers: {}         # Additional response headers to send with responses\n\ndata_dir: ~/.local/share/llamactl  # Main data directory (database, instances, logs), default varies by OS\n\ninstances:\n  port_range: [8000, 9000]         # Port range for instances\n  configs_dir: data_dir/instances  # Instance configs directory\n  logs_dir: data_dir/logs          # Logs directory\n  auto_create_dirs: true           # Auto-create data/config/logs dirs if missing\n  
max_instances: -1                # Max instances (-1 = unlimited)\n  max_running_instances: -1        # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true        # Enable LRU eviction for idle instances\n  default_auto_restart: true       # Auto-restart new instances by default\n  default_max_restarts: 3          # Max restarts for new instances\n  default_restart_delay: 5         # Restart delay (seconds) for new instances\n  default_on_demand_start: true    # Default on-demand start setting\n  on_demand_start_timeout: 120     # Default on-demand start timeout in seconds\n  timeout_check_interval: 5        # Idle instance timeout check in minutes\n\ndatabase:\n  path: data_dir/llamactl.db              # Database file path\n  max_open_connections: 25       # Maximum open database connections\n  max_idle_connections: 5        # Maximum idle database connections\n  connection_max_lifetime: 5m    # Connection max lifetime\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  inference_keys: []             # Keys for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n\nlocal_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration for multi-node deployment\n  main:                          # Default local node (empty config)\n</code></pre>"},{"location":"configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"configuration/#configuration-file-locations","title":"Configuration File Locations","text":"<p>Configuration files are searched in the following locations (in order of precedence, first found is used):  </p> <p>Linux: - <code>./llamactl.yaml</code> or <code>./config.yaml</code> (current directory) - <code>$HOME/.config/llamactl/config.yaml</code> - <code>/etc/llamactl/config.yaml</code> </p> <p>macOS: - 
<code>./llamactl.yaml</code> or <code>./config.yaml</code> (current directory) - <code>$HOME/Library/Application Support/llamactl/config.yaml</code> - <code>/Library/Application Support/llamactl/config.yaml</code> </p> <p>Windows: - <code>./llamactl.yaml</code> or <code>./config.yaml</code> (current directory) - <code>%APPDATA%\\llamactl\\config.yaml</code> - <code>%USERPROFILE%\\llamactl\\config.yaml</code> - <code>%PROGRAMDATA%\\llamactl\\config.yaml</code> </p> <p>You can specify the path to config file with <code>LLAMACTL_CONFIG_PATH</code> environment variable.  </p>"},{"location":"configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"configuration/#server-configuration","title":"Server Configuration","text":"<pre><code>server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  allowed_headers: [\"*\"]  # CORS allowed headers (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_HOST</code> - Server host - <code>LLAMACTL_PORT</code> - Server port - <code>LLAMACTL_ALLOWED_ORIGINS</code> - Comma-separated CORS origins - <code>LLAMACTL_ENABLE_SWAGGER</code> - Enable Swagger UI (true/false)  </p>"},{"location":"configuration/#backend-configuration","title":"Backend Configuration","text":"<pre><code>backends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  
vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    # MLX does not support Docker\n    response_headers: {}         # Additional response headers to send with responses\n</code></pre> <p>Backend Configuration Fields: - <code>command</code>: Executable name/path for the backend - <code>args</code>: Default arguments prepended to all instances - <code>environment</code>: Environment variables for the backend process (optional) - <code>response_headers</code>: Additional response headers to send with responses (optional) - <code>docker</code>: Docker-specific configuration (optional)   - <code>enabled</code>: Boolean flag to enable Docker runtime   - <code>image</code>: Docker image to use   - <code>args</code>: Additional arguments passed to <code>docker run</code>   - <code>environment</code>: Environment variables for the container (optional)  </p> <p>If llamactl is behind an NGINX proxy, <code>X-Accel-Buffering: no</code> response header may be required for NGINX to properly stream the responses without buffering.</p> <p>Environment Variables: </p> <p>LlamaCpp Backend: - <code>LLAMACTL_LLAMACPP_COMMAND</code> - LlamaCpp executable command - <code>LLAMACTL_LLAMACPP_ARGS</code> - Space-separated default arguments - <code>LLAMACTL_LLAMACPP_ENV</code> - Environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_LLAMACPP_DOCKER_ENABLED</code> - Enable Docker runtime (true/false) - 
<code>LLAMACTL_LLAMACPP_DOCKER_IMAGE</code> - Docker image to use - <code>LLAMACTL_LLAMACPP_DOCKER_ARGS</code> - Space-separated Docker arguments - <code>LLAMACTL_LLAMACPP_DOCKER_ENV</code> - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_LLAMACPP_RESPONSE_HEADERS</code> - Response headers in format \"KEY1=value1;KEY2=value2\"  </p> <p>VLLM Backend: - <code>LLAMACTL_VLLM_COMMAND</code> - VLLM executable command - <code>LLAMACTL_VLLM_ARGS</code> - Space-separated default arguments - <code>LLAMACTL_VLLM_ENV</code> - Environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_VLLM_DOCKER_ENABLED</code> - Enable Docker runtime (true/false) - <code>LLAMACTL_VLLM_DOCKER_IMAGE</code> - Docker image to use - <code>LLAMACTL_VLLM_DOCKER_ARGS</code> - Space-separated Docker arguments - <code>LLAMACTL_VLLM_DOCKER_ENV</code> - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_VLLM_RESPONSE_HEADERS</code> - Response headers in format \"KEY1=value1;KEY2=value2\"  </p> <p>MLX Backend: - <code>LLAMACTL_MLX_COMMAND</code> - MLX executable command - <code>LLAMACTL_MLX_ARGS</code> - Space-separated default arguments - <code>LLAMACTL_MLX_ENV</code> - Environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_MLX_RESPONSE_HEADERS</code> - Response headers in format \"KEY1=value1;KEY2=value2\"  </p>"},{"location":"configuration/#data-directory-configuration","title":"Data Directory Configuration","text":"<pre><code>data_dir: \"~/.local/share/llamactl\"  # Main data directory for database, instances, and logs (default varies by OS)\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_DATA_DIRECTORY</code> - Main data directory path  </p> <p>Default Data Directory by Platform: - Linux: <code>~/.local/share/llamactl</code> - macOS: <code>~/Library/Application Support/llamactl</code> - Windows: <code>%LOCALAPPDATA%\\llamactl</code> or <code>%PROGRAMDATA%\\llamactl</code> 
</p>"},{"location":"configuration/#instance-configuration","title":"Instance Configuration","text":"<pre><code>instances:\n  port_range: [8000, 9000]      # Port range for instances (default: [8000, 9000])\n  configs_dir: \"instances\"      # Directory for instance configs, default: data_dir/instances\n  logs_dir: \"logs\"              # Directory for instance logs, default: data_dir/logs\n  auto_create_dirs: true        # Automatically create data/config/logs directories (default: true)\n  max_instances: -1             # Maximum instances (-1 = unlimited)\n  max_running_instances: -1     # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true     # Enable LRU eviction for idle instances\n  default_auto_restart: true    # Default auto-restart setting\n  default_max_restarts: 3       # Default maximum restart attempts\n  default_restart_delay: 5      # Default restart delay in seconds\n  default_on_demand_start: true # Default on-demand start setting\n  on_demand_start_timeout: 120  # Default on-demand start timeout in seconds\n  timeout_check_interval: 5     # Default instance timeout check interval in minutes\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_INSTANCE_PORT_RANGE</code> - Port range (format: \"8000-9000\" or \"8000,9000\") - <code>LLAMACTL_INSTANCES_DIR</code> - Instance configs directory path - <code>LLAMACTL_LOGS_DIR</code> - Log directory path - <code>LLAMACTL_AUTO_CREATE_DATA_DIR</code> - Auto-create data/config/logs directories (true/false) - <code>LLAMACTL_MAX_INSTANCES</code> - Maximum number of instances - <code>LLAMACTL_MAX_RUNNING_INSTANCES</code> - Maximum number of running instances - <code>LLAMACTL_ENABLE_LRU_EVICTION</code> - Enable LRU eviction for idle instances - <code>LLAMACTL_DEFAULT_AUTO_RESTART</code> - Default auto-restart setting (true/false) - <code>LLAMACTL_DEFAULT_MAX_RESTARTS</code> - Default maximum restarts - <code>LLAMACTL_DEFAULT_RESTART_DELAY</code> - Default restart delay in seconds - 
<code>LLAMACTL_DEFAULT_ON_DEMAND_START</code> - Default on-demand start setting (true/false) - <code>LLAMACTL_ON_DEMAND_START_TIMEOUT</code> - Default on-demand start timeout in seconds - <code>LLAMACTL_TIMEOUT_CHECK_INTERVAL</code> - Default instance timeout check interval in minutes  </p>"},{"location":"configuration/#database-configuration","title":"Database Configuration","text":"<pre><code>database:\n  path: \"llamactl.db\"              # Database file path, default: data_dir/llamactl.db\n  max_open_connections: 25         # Maximum open database connections (default: 25)\n  max_idle_connections: 5          # Maximum idle database connections (default: 5)\n  connection_max_lifetime: 5m      # Connection max lifetime (default: 5m)\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_DATABASE_PATH</code> - Database file path (relative to data_dir or absolute) - <code>LLAMACTL_DATABASE_MAX_OPEN_CONNECTIONS</code> - Maximum open database connections - <code>LLAMACTL_DATABASE_MAX_IDLE_CONNECTIONS</code> - Maximum idle database connections - <code>LLAMACTL_DATABASE_CONN_MAX_LIFETIME</code> - Connection max lifetime (e.g., \"5m\", \"1h\")  </p>"},{"location":"configuration/#authentication-configuration","title":"Authentication Configuration","text":"<pre><code>auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  inference_keys: []                     # List of valid inference API keys\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_REQUIRE_INFERENCE_AUTH</code> - Require auth for OpenAI endpoints (true/false) - <code>LLAMACTL_INFERENCE_KEYS</code> - Comma-separated inference API keys - <code>LLAMACTL_REQUIRE_MANAGEMENT_AUTH</code> - Require auth for management endpoints (true/false) - <code>LLAMACTL_MANAGEMENT_KEYS</code> - 
Comma-separated management API keys  </p>"},{"location":"configuration/#remote-node-configuration","title":"Remote Node Configuration","text":"<p>llamactl supports remote node deployments. Configure remote nodes to deploy instances on remote hosts and manage them centrally.  </p> <pre><code>local_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration map\n  main:                          # Local node (empty address means local)\n    address: \"\"                  # Not used for local node\n    api_key: \"\"                  # Not used for local node\n  worker1:                       # Remote worker node\n    address: \"http://192.168.1.10:8080\"\n    api_key: \"worker1-api-key\"   # Management API key for authentication\n</code></pre> <p>Node Configuration Fields: - <code>local_node</code>: Specifies which node in the <code>nodes</code> map represents the local node. Must match exactly what other nodes call this node. - <code>nodes</code>: Map of node configurations   - <code>address</code>: HTTP/HTTPS URL of the remote node (empty for local node)   - <code>api_key</code>: Management API key for authenticating with the remote node  </p> <p>Environment Variables: - <code>LLAMACTL_LOCAL_NODE</code> - Name of the local node  </p>"},{"location":"installation/","title":"Installation","text":"<p>This guide will walk you through installing Llamactl on your system.  </p>"},{"location":"installation/#prerequisites","title":"Prerequisites","text":""},{"location":"installation/#backend-dependencies","title":"Backend Dependencies","text":"<p>llamactl supports multiple backends. 
Install at least one:  </p> <p>For llama.cpp backend (all platforms): </p> <p>You need <code>llama-server</code> from llama.cpp installed:  </p> <pre><code># Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n</code></pre> <p>Or build from source - see llama.cpp docs  </p> <p>For MLX backend (macOS only): </p> <p>MLX provides optimized inference on Apple Silicon. Install MLX-LM:  </p> <pre><code># Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n</code></pre> <p>Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)  </p> <p>For vLLM backend: </p> <p>vLLM provides high-throughput distributed serving for LLMs. Install vLLM:  </p> <pre><code># Install in a virtual environment\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n</code></pre>"},{"location":"installation/#installation-methods","title":"Installation Methods","text":""},{"location":"installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"<p>Download the latest release from the GitHub releases page:  </p> <pre><code># Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n</code></pre>"},{"location":"installation/#option-2-docker","title":"Option 2: Docker","text":"<p>llamactl provides Dockerfiles for creating Docker images with backends pre-installed. 
The resulting images include the latest llamactl release with the respective backend.  </p> <p>Available Dockerfiles (CUDA): - llamactl with llama.cpp CUDA: <code>docker/Dockerfile.llamacpp</code> (based on <code>ghcr.io/ggml-org/llama.cpp:server-cuda</code>) - llamactl with vLLM CUDA: <code>docker/Dockerfile.vllm</code> (based on <code>vllm/vllm-openai:latest</code>) - llamactl built from source: <code>docker/Dockerfile.source</code> (multi-stage build with webui)  </p> <p>Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.  </p> <p>Using Docker Compose </p> <pre><code># Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Create directories for data and models\nmkdir -p data/llamacpp data/vllm models\n\n# Start llamactl with llama.cpp backend\ndocker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d\n\n# Or start llamactl with vLLM backend\ndocker-compose -f docker/docker-compose.yml up llamactl-vllm -d\n</code></pre> <p>Access the dashboard at: - llamactl with llama.cpp: http://localhost:8080 - llamactl with vLLM: http://localhost:8081  </p> <p>Using Docker Build and Run </p> <ol> <li> <p>llamactl with llama.cpp CUDA: <pre><code>docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .\ndocker run -d \\\n  --name llamactl-llamacpp \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \\\n  llamactl:llamacpp-cuda\n</code></pre></p> </li> <li> <p>llamactl with vLLM CUDA: <pre><code>docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .\ndocker run -d \\\n  --name llamactl-vllm \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/huggingface:/root/.cache/huggingface \\\n  llamactl:vllm-cuda\n</code></pre></p> </li> <li> <p>llamactl built from source: <pre><code>docker build -f docker/Dockerfile.source -t 
llamactl:source .\ndocker run -d \\\n  --name llamactl \\\n  -p 8080:8080 \\\n  llamactl:source\n</code></pre></p> </li> </ol>"},{"location":"installation/#option-3-build-from-source","title":"Option 3: Build from Source","text":"<p>Requirements: - Go 1.24 or later - Node.js 22 or later - Git  </p> <p>If you prefer to build from source:  </p> <pre><code># Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui &amp;&amp; npm ci &amp;&amp; npm run build &amp;&amp; cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n</code></pre>"},{"location":"installation/#remote-node-installation","title":"Remote Node Installation","text":"<p>For deployments with remote nodes: - Install llamactl on each node using any of the methods above - Configure API keys for authentication between nodes - Ensure node names are consistent across all configurations  </p>"},{"location":"installation/#verification","title":"Verification","text":"<p>Verify your installation by checking the version:  </p> <pre><code>llamactl --version\n</code></pre>"},{"location":"installation/#next-steps","title":"Next Steps","text":"<p>Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!  </p> <p>For remote node deployments, see the Configuration Guide for node setup instructions.  </p>"},{"location":"managing-instances/","title":"Managing Instances","text":"<p>Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.  
</p>"},{"location":"managing-instances/#overview","title":"Overview","text":"<p>Llamactl provides two ways to manage instances:  </p> <ul> <li>Web UI: Accessible at <code>http://localhost:8080</code> with an intuitive dashboard  </li> <li>REST API: Programmatic access for automation and integration  </li> </ul> <p> </p>"},{"location":"managing-instances/#authentication","title":"Authentication","text":"<p>Llamactl uses a Management API Key to authenticate requests to the management API (creating, starting, stopping instances). All curl examples below use <code>&lt;token&gt;</code> as a placeholder - replace this with your actual Management API Key.  </p> <p>By default, authentication is required. If you don't configure a management API key in your configuration file, llamactl will auto-generate one and print it to the terminal on startup. See the Configuration guide for details.  </p> <p>For Web UI access: 1. Navigate to the web UI 2. Enter your Management API Key 3. Bearer token is stored for the session  </p>"},{"location":"managing-instances/#theme-support","title":"Theme Support","text":"<ul> <li>Switch between light and dark themes  </li> <li>Setting is remembered across sessions  </li> </ul>"},{"location":"managing-instances/#instance-cards","title":"Instance Cards","text":"<p>Each instance is displayed as a card showing:  </p> <ul> <li>Instance name </li> <li>Health status badge (unknown, ready, error, failed)  </li> <li>Action buttons (start, stop, edit, logs, delete)  </li> </ul>"},{"location":"managing-instances/#create-instance","title":"Create Instance","text":"<p>Via Web UI </p> <p> </p> <ol> <li>Click the \"Create Instance\" button on the dashboard  </li> <li>Optional: Click \"Import\" to load a previously exported configuration  </li> </ol> <p>Instance Settings: </p> <ol> <li>Enter a unique Instance Name (required)  </li> <li>Select Node: Choose which node to deploy the instance to  </li> <li>Configure Auto Restart settings:  <ul> <li>Enable 
automatic restart on failure  </li> <li>Set max restarts and delay between attempts  </li> </ul> </li> <li>Configure basic instance options:  <ul> <li>Idle Timeout: Minutes before stopping idle instance  </li> <li>On Demand Start: Start instance only when needed  </li> </ul> </li> </ol> <p>Backend Configuration: </p> <ol> <li>Select Backend Type:  <ul> <li>Llama Server: For GGUF models using llama-server  </li> <li>MLX LM: For MLX-optimized models (macOS only)  </li> <li>vLLM: For distributed serving and high-throughput inference  </li> </ul> </li> <li>Optional: Click \"Parse Command\" to import settings from an existing backend command  </li> <li>Configure Execution Context:  <ul> <li>Enable Docker: Run backend in Docker container  </li> <li>Command Override: Custom path to backend executable  </li> <li>Environment Variables: Custom environment variables  </li> </ul> </li> </ol> <p>Auto-Assignment</p> <p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.  </p> <ol> <li>Configure Basic Backend Options (varies by backend):  <ul> <li>llama.cpp: Model path, threads, context size, GPU layers, etc.  </li> <li>MLX: Model identifier, temperature, max tokens, etc.  </li> <li>vLLM: Model identifier, tensor parallel size, GPU memory utilization, etc.  
</li> </ul> </li> <li>Optional: Expand Advanced Backend Options for additional settings  </li> <li>Optional: Add Extra Args as key-value pairs for custom command-line arguments  </li> <li>Click \"Create\" to save the instance  </li> </ol> <p>Via API </p> <pre><code># Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/v1/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32,\n      \"flash_attn\": \"on\"\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3,\n    \"docker_enabled\": false,\n    \"command_override\": \"/opt/llama-server-dev\",\n    \"nodes\": [\"main\"]\n  }'\n\n# Create vLLM instance with environment variables\ncurl -X POST http://localhost:8080/api/v1/instances/my-vllm-instance \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_type\": \"vllm\",\n    \"backend_options\": {\n      \"model\": \"microsoft/DialoGPT-medium\",\n      \"tensor_parallel_size\": 2,\n      \"gpu_memory_utilization\": 0.9\n    },\n    \"on_demand_start\": true,\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0,1\"\n    },\n    \"nodes\": [\"worker1\", \"worker2\"]\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/v1/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"max_tokens\": 2048\n    },\n    \"nodes\": [\"main\"]\n  }'\n</code></pre>"},{"location":"managing-instances/#start-instance","title":"Start 
Instance","text":"<p>Via Web UI 1. Click the \"Start\" button on an instance card 2. Watch the status change to \"Unknown\" 3. Monitor progress in the logs 4. Instance status changes to \"Ready\" when ready  </p> <p>Via API <pre><code>curl -X POST http://localhost:8080/api/v1/instances/{name}/start \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre></p>"},{"location":"managing-instances/#stop-instance","title":"Stop Instance","text":"<p>Via Web UI 1. Click the \"Stop\" button on an instance card 2. Instance gracefully shuts down  </p> <p>Via API <pre><code>curl -X POST http://localhost:8080/api/v1/instances/{name}/stop \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre></p>"},{"location":"managing-instances/#edit-instance","title":"Edit Instance","text":"<p>Via Web UI 1. Click the \"Edit\" button on an instance card 2. Modify settings in the configuration dialog 3. Changes require instance restart to take effect 4. Click \"Update &amp; Restart\" to apply changes  </p> <p>Via API Modify instance settings:  </p> <pre><code>curl -X PUT http://localhost:8080/api/v1/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"ctx_size\": 4096\n    }\n  }'\n</code></pre> <p>Note</p> <p>Configuration changes require restarting the instance to take effect.  </p>"},{"location":"managing-instances/#export-instance","title":"Export Instance","text":"<p>Via Web UI 1. Click the \"More actions\" button (three dots) on an instance card 2. 
Click \"Export\" to download the instance configuration as a JSON file  </p>"},{"location":"managing-instances/#view-logs","title":"View Logs","text":"<p>Via Web UI </p> <ol> <li>Click the \"Logs\" button on any instance card  </li> <li>Real-time log viewer opens  </li> </ol> <p>Via API Retrieve the instance logs:  </p> <pre><code># Get instance logs\ncurl http://localhost:8080/api/v1/instances/{name}/logs \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre>"},{"location":"managing-instances/#delete-instance","title":"Delete Instance","text":"<p>Via Web UI 1. Click the \"Delete\" button on an instance card 2. Only stopped instances can be deleted 3. Confirm deletion in the dialog  </p> <p>Via API <pre><code>curl -X DELETE http://localhost:8080/api/v1/instances/{name} \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre></p>"},{"location":"managing-instances/#instance-proxy","title":"Instance Proxy","text":"<p>Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).  </p> <pre><code># Proxy requests to the instance\ncurl http://localhost:8080/api/v1/instances/{name}/proxy/ \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre> <p>All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs </p>"},{"location":"managing-instances/#instance-health","title":"Instance Health","text":"<p>Via Web UI </p> <ol> <li>The health status badge is displayed on each instance card  </li> </ol> <p>Via API </p> <p>Check the health status of your instances:  </p> <pre><code>curl http://localhost:8080/api/v1/instances/{name}/proxy/health \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre>"},{"location":"quick-start/","title":"Quick Start","text":"<p>This guide will help you get Llamactl up and running in just a few minutes.  </p> <p>Before you begin: Ensure you have at least one backend installed (llama.cpp, MLX, or vLLM). 
See the Installation Guide for backend setup.  </p>"},{"location":"quick-start/#core-concepts","title":"Core Concepts","text":"<p>Before you start, let's clarify a few key terms:  </p> <ul> <li>Instance: A running backend server that serves a specific model. Each instance has a unique name and runs independently.  </li> <li>Backend: The inference engine that actually runs the model (llama.cpp, MLX, or vLLM). You need at least one backend installed before creating instances.  </li> <li>Node: In multi-machine setups, a node represents one machine. Most users will just use the default \"main\" node for single-machine deployments.  </li> <li>Proxy Architecture: Llamactl acts as a proxy in front of your instances. You make requests to llamactl (e.g., <code>http://localhost:8080/v1/chat/completions</code>), and it routes them to the appropriate backend instance. This means you don't need to track individual instance ports or endpoints.  </li> </ul>"},{"location":"quick-start/#authentication","title":"Authentication","text":"<p>Llamactl uses two types of API keys:  </p> <ul> <li>Management API Key: Used to authenticate with the Llamactl management API (creating, starting, stopping instances).  </li> <li>Inference API Key: Used to authenticate requests to the OpenAI-compatible endpoints (<code>/v1/chat/completions</code>, <code>/v1/completions</code>, etc.).  </li> </ul> <p>By default, authentication is required. If you don't configure these keys in your configuration file, llamactl will auto-generate them and print them to the terminal on startup. You can also configure custom keys or disable authentication entirely in the Configuration guide.  
</p>"},{"location":"quick-start/#start-llamactl","title":"Start Llamactl","text":"<p>Start the Llamactl server:  </p> <pre><code>llamactl\n</code></pre> <pre><code>\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\u26a0\ufe0f  MANAGEMENT AUTHENTICATION REQUIRED\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\ud83d\udd11  Generated Management API Key:\n\n    sk-management-...\n\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\u26a0\ufe0f  INFERENCE AUTHENTICATION REQUIRED\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\ud83d\udd11  Generated Inference API Key:\n\n    sk-inference-...\n\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\u26a0\ufe0f  
IMPORTANT\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\u2022 These keys are auto-generated and will change on restart\n\u2022 For production, add explicit keys to your configuration\n\u2022 Copy these keys before they disappear from the terminal\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\nLlamactl server listening on 0.0.0.0:8080\n</code></pre> <p>Copy the Management and Inference API Keys from the terminal - you'll need them to access the web UI and make inference requests.  </p> <p>By default, Llamactl will start on <code>http://localhost:8080</code>.  </p>"},{"location":"quick-start/#access-the-web-ui","title":"Access the Web UI","text":"<p>Open your web browser and navigate to:  </p> <pre><code>http://localhost:8080\n</code></pre> <p>Login with the management API key from the terminal output.  </p> <p>You should see the Llamactl web interface.  
</p>"},{"location":"quick-start/#create-your-first-instance","title":"Create Your First Instance","text":"<ol> <li>Click the \"Add Instance\" button  </li> <li> <p>Fill in the instance configuration:  </p> <ul> <li>Name: Give your instance a descriptive name  </li> <li>Node: Select which node to deploy the instance to (defaults to \"main\" for single-node setups)  </li> <li>Backend Type: Choose from llama.cpp, MLX, or vLLM  </li> <li>Model: Model path or huggingface repo  </li> <li>Additional Options: Backend-specific parameters  </li> </ul> <p>Auto-Assignment</p> <p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.  </p> <p>Remote Node Deployment</p> <p>If you have configured remote nodes in your configuration file, you can select which node to deploy the instance to. This allows you to distribute instances across multiple machines. See the Configuration guide for details on setting up remote nodes.  
</p> </li> <li> <p>Click \"Create Instance\"  </p> </li> </ol>"},{"location":"quick-start/#start-your-instance","title":"Start Your Instance","text":"<p>Once created, you can:  </p> <ul> <li>Start the instance by clicking the start button  </li> <li>Monitor its status in real-time  </li> <li>View logs by clicking the logs button  </li> <li>Stop the instance when needed  </li> </ul>"},{"location":"quick-start/#example-configurations","title":"Example Configurations","text":"<p>Here are basic example configurations for each backend:  </p> <p>llama.cpp backend: <pre><code>{\n  \"name\": \"llama2-7b\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n    \"threads\": 4,\n    \"ctx_size\": 2048,\n    \"gpu_layers\": 32\n  },\n  \"nodes\": [\"main\"]\n}\n</code></pre></p> <p>MLX backend (macOS only): <pre><code>{\n  \"name\": \"mistral-mlx\",\n  \"backend_type\": \"mlx_lm\",\n  \"backend_options\": {\n    \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n    \"temp\": 0.7,\n    \"max_tokens\": 2048\n  },\n  \"nodes\": [\"main\"]\n}\n</code></pre></p> <p>vLLM backend: <pre><code>{\n  \"name\": \"dialogpt-vllm\",\n  \"backend_type\": \"vllm\",\n  \"backend_options\": {\n    \"model\": \"microsoft/DialoGPT-medium\",\n    \"tensor_parallel_size\": 2,\n    \"gpu_memory_utilization\": 0.9\n  },\n  \"nodes\": [\"main\"]\n}\n</code></pre></p> <p>Remote node deployment example: <pre><code>{\n  \"name\": \"distributed-model\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/model.gguf\",\n    \"gpu_layers\": 32\n  },\n  \"nodes\": [\"worker1\"]\n}\n</code></pre></p>"},{"location":"quick-start/#docker-support","title":"Docker Support","text":"<p>Llamactl can run backends in Docker containers. To enable Docker for a backend, add a <code>docker</code> section to that backend in your YAML configuration file (e.g. 
<code>config.yaml</code>) as shown below:  </p> <pre><code>backends:\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    docker:\n      enabled: true\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n</code></pre>"},{"location":"quick-start/#using-the-api","title":"Using the API","text":"<p>You can also manage instances via the REST API:  </p> <pre><code># List all instances\ncurl http://localhost:8080/api/v1/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\"\n    }\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/start\n</code></pre>"},{"location":"quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"<p>Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.  </p>"},{"location":"quick-start/#chat-completions","title":"Chat Completions","text":"<p>Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:  </p> <pre><code>curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! 
Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n</code></pre>"},{"location":"quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"<p>You can also use the official OpenAI Python client:  </p> <pre><code>from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"your-inference-api-key\"  # Use the inference API key from terminal or config\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n</code></pre> <p>API Key</p> <p>If you disabled authentication in your config, you can use any value for <code>api_key</code> (e.g., <code>\"not-needed\"</code>). Otherwise, use the inference API key shown in the terminal output on startup.  </p>"},{"location":"quick-start/#list-available-models","title":"List Available Models","text":"<p>Get a list of running instances (models) in OpenAI-compatible format:  </p> <pre><code>curl http://localhost:8080/v1/models\n</code></pre>"},{"location":"quick-start/#next-steps","title":"Next Steps","text":"<ul> <li>Manage instances Managing Instances </li> <li>Explore the API Reference </li> <li>Configure advanced settings in the Configuration guide  </li> </ul>"},{"location":"troubleshooting/","title":"Troubleshooting","text":"<p>Issues specific to Llamactl deployment and operation.  </p>"},{"location":"troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"<p>Problem: Invalid configuration preventing startup  </p> <p>Solutions: 1. 
Use minimal configuration: <pre><code>server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n</code></pre></p> <ol> <li>Check data directory permissions: <pre><code># Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n</code></pre></li> </ol>"},{"location":"troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"troubleshooting/#instance-fails-to-start","title":"Instance Fails to Start","text":"<p>Problem: Instance fails to start or immediately stops  </p> <p>Solutions: </p> <ol> <li> <p>Check instance logs to see the actual error: <pre><code>curl http://localhost:8080/api/v1/instances/{name}/logs\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n</code></pre></p> </li> <li> <p>Verify backend is installed: </p> <ul> <li>llama.cpp: Ensure <code>llama-server</code> is in PATH  </li> <li>MLX: Ensure <code>mlx-lm</code> Python package is installed  </li> <li>vLLM: Ensure <code>vllm</code> Python package is installed  </li> </ul> </li> <li> <p>Check model path and format: </p> <ul> <li>Use absolute paths to model files  </li> <li>Verify model format matches backend (GGUF for llama.cpp, etc.)  
</li> </ul> </li> <li> <p>Verify backend command configuration: </p> <ul> <li>Check that the backend <code>command</code> is correctly configured in the global config  </li> <li>For virtual environments, specify the full path to the command (e.g., <code>/path/to/venv/bin/mlx_lm.server</code>)  </li> <li>See the Configuration Guide for backend configuration details  </li> <li>Test the backend directly (see Backend-Specific Issues below)  </li> </ul> </li> </ol>"},{"location":"troubleshooting/#backend-specific-issues","title":"Backend-Specific Issues","text":"<p>Problem: Model loading, memory, GPU, or performance issues  </p> <p>Most model-specific issues (memory, GPU configuration, performance tuning) are backend-specific and should be resolved by consulting the respective backend documentation:  </p> <p>llama.cpp: - llama.cpp GitHub - llama-server README </p> <p>MLX: - MLX-LM GitHub - MLX-LM Server Guide </p> <p>vLLM: - vLLM Documentation - OpenAI Compatible Server - vllm serve Command </p> <p>Testing backends directly: </p> <p>Testing your model and configuration directly with the backend helps determine if the issue is with llamactl or the backend itself:  </p> <pre><code># llama.cpp\nllama-server --model /path/to/model.gguf --port 8081\n\n# MLX\nmlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --port 8081\n\n# vLLM\nvllm serve microsoft/DialoGPT-medium --port 8081\n</code></pre>"},{"location":"troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"troubleshooting/#cors-errors","title":"CORS Errors","text":"<p>Problem: Web UI shows CORS errors in browser console  </p> <p>Solutions: 1. 
Configure allowed origins: <pre><code>server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n</code></pre></p>"},{"location":"troubleshooting/#authentication-issues","title":"Authentication Issues","text":"<p>Problem: API requests failing with authentication errors  </p> <p>Solutions: 1. Disable authentication temporarily: <pre><code>auth:\n  require_management_auth: false\n  require_inference_auth: false\n</code></pre></p> <ol> <li> <p>Configure API keys: <pre><code>auth:\n  management_keys:\n    - \"your-management-key\"\n  inference_keys:\n    - \"your-inference-key\"\n</code></pre></p> </li> <li> <p>Use correct Authorization header: <pre><code>curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n</code></pre></p> </li> </ol>"},{"location":"troubleshooting/#remote-node-issues","title":"Remote Node Issues","text":""},{"location":"troubleshooting/#node-configuration","title":"Node Configuration","text":"<p>Problem: Remote instances not appearing or cannot be managed  </p> <p>Solutions: 1. 
Verify node configuration: <pre><code>local_node: \"main\"  # Must match a key in nodes map\nnodes:\n  main:\n    address: \"\"     # Empty for local node\n  worker1:\n    address: \"http://worker1.internal:8080\"\n    api_key: \"secure-key\"  # Must match worker1's management key\n</code></pre></p> <ol> <li>Check node name consistency: </li> <li><code>local_node</code> on each node must match what other nodes call it  </li> <li> <p>Node names are case-sensitive  </p> </li> <li> <p>Test remote node connectivity: <pre><code>curl -H \"Authorization: Bearer remote-node-key\" \\\n  http://remote-node:8080/api/v1/instances\n</code></pre></p> </li> </ol>"},{"location":"troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"<pre><code># Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n</code></pre>"},{"location":"troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"<pre><code>export LLAMACTL_LOG_LEVEL=debug\nllamactl\n</code></pre>"},{"location":"troubleshooting/#getting-help","title":"Getting Help","text":"<p>When reporting issues, include:  </p> <ol> <li> <p>System information: <pre><code>llamactl --version\n</code></pre></p> </li> <li> <p>Configuration file (remove sensitive keys)  </p> </li> <li> <p>Relevant log output </p> </li> <li> <p>Steps to reproduce the issue </p> </li> </ol>"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Llamactl Documentation","text":"<p>Welcome to the Llamactl documentation!  </p> <p> </p>"},{"location":"#what-is-llamactl","title":"What is Llamactl?","text":"<p>Unified management and routing for llama.cpp, MLX and vLLM models with web dashboard. </p>"},{"location":"#features","title":"Features","text":"<p>\ud83d\ude80 Easy Model Management - Multiple Models Simultaneously: Run different models at the same time (7B for speed, 70B for quality) - Smart Resource Management: Automatic idle timeout, LRU eviction, and configurable instance limits - Web Dashboard: Modern React UI for managing instances, monitoring health, and viewing logs  </p> <p>\ud83d\udd17 Flexible Integration - OpenAI API Compatible: Drop-in replacement - route requests to different models by instance name - Multi-Backend Support: Native support for llama.cpp, MLX (Apple Silicon optimized), and vLLM - Docker Ready: Run backends in containers with full GPU support  </p> <p>\ud83c\udf10 Distributed Deployment - Remote Instances: Deploy instances on remote hosts - Central Management: Manage everything from a single dashboard with automatic routing    </p>"},{"location":"#quick-links","title":"Quick Links","text":"<ul> <li>Installation Guide - Get Llamactl up and running  </li> <li>Configuration Guide - Detailed configuration options  </li> <li>Quick Start - Your first steps with Llamactl  </li> <li>Managing Instances - Instance lifecycle management  </li> <li>API Reference - Complete API documentation  </li> </ul>"},{"location":"#getting-help","title":"Getting Help","text":"<p>If you need help or have questions:  </p> <ul> <li>Check the Troubleshooting guide  </li> <li>Visit the GitHub repository </li> <li>Review the Configuration Guide for advanced settings  </li> </ul>"},{"location":"#license","title":"License","text":"<p>MIT License - see the LICENSE file.  
</p>"},{"location":"api-reference/","title":"API Reference","text":""},{"location":"api-reference/#llamactl-api-10","title":"llamactl API 1.0","text":"<p>llamactl is a control server for managing Llama Server instances.  </p> License: MIT License"},{"location":"api-reference/#keys","title":"Keys","text":""},{"location":"api-reference/#get-apiv1authkeys","title":"GET /api/v1/auth/keys","text":"<p>List all API keys  </p> Description <p>Returns a list of all API keys for the system user (excludes key hash and plain-text key)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1authkeys","title":"POST /api/v1/auth/keys","text":"<p>Create a new API key  </p> Description <p>Creates a new API key with the specified permissions and returns the plain- text key (only shown once)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>key</code> body None No API key configuration <p> Response 201 Created </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#delete-apiv1authkeysid","title":"DELETE /api/v1/auth/keys/{id}","text":"<p>Delete an API key  </p> Description <p>Deletes an API key by ID  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>id</code> path None No Key ID <p> Response 204 No Content </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1authkeysid","title":"GET /api/v1/auth/keys/{id}","text":"<p>Get details of a specific API key  </p> Description <p>Returns details for a specific API key by ID (excludes key hash and plain- text key)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description 
<code>ApiKeyAuth</code> header string N/A No <code>id</code> path None No Key ID <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1authkeysidpermissions","title":"GET /api/v1/auth/keys/{id}/permissions","text":"<p>Get API key permissions  </p> Description <p>Returns the instance-level permissions for a specific API key (includes instance names)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>id</code> path None No Key ID <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#backends","title":"Backends","text":""},{"location":"api-reference/#get-apiv1backendsllama-cppdevices","title":"GET /api/v1/backends/llama-cpp/devices","text":"<p>List available devices for llama server  </p> Description <p>Returns a list of available devices for the llama server  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1backendsllama-cpphelp","title":"GET /api/v1/backends/llama-cpp/help","text":"<p>Get help for llama server  </p> Description <p>Returns the help text for the llama server command  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1backendsllama-cppparse-command","title":"POST /api/v1/backends/llama-cpp/parse-command","text":"<p>Parse llama-server command  </p> Description <p>Parses a llama-server command string into instance options  </p> <p>Input parameters </p> Parameter In Type Default Nullable 
Description <code>ApiKeyAuth</code> header string N/A No <code>request</code> body None No Command to parse <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1backendsllama-cppversion","title":"GET /api/v1/backends/llama-cpp/version","text":"<p>Get version of llama server  </p> Description <p>Returns the version of the llama server command  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1backendsmlxparse-command","title":"POST /api/v1/backends/mlx/parse-command","text":"<p>Parse mlx_lm.server command  </p> Description <p>Parses MLX-LM server command string into instance options  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>request</code> body None No Command to parse <p> Response 200 OK </p> <p> Response 400 Bad Request </p>"},{"location":"api-reference/#post-apiv1backendsvllmparse-command","title":"POST /api/v1/backends/vllm/parse-command","text":"<p>Parse vllm serve command  </p> Description <p>Parses a vLLM serve command string into instance options  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>request</code> body None No Command to parse <p> Response 200 OK </p> <p> Response 400 Bad Request </p>"},{"location":"api-reference/#system","title":"System","text":""},{"location":"api-reference/#get-apiv1config","title":"GET /api/v1/config","text":"<p>Get server configuration  </p> Description <p>Returns the current server configuration (sanitized)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error 
</p>"},{"location":"api-reference/#get-apiv1version","title":"GET /api/v1/version","text":"<p>Get llamactl version  </p> Description <p>Returns the version of the llamactl command  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#instances","title":"Instances","text":""},{"location":"api-reference/#get-apiv1instances","title":"GET /api/v1/instances","text":"<p>List all instances  </p> Description <p>Returns a list of all instances managed by the server  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#delete-apiv1instancesname","title":"DELETE /api/v1/instances/{name}","text":"<p>Delete an instance  </p> Description <p>Stops and removes a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 204 No Content </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1instancesname","title":"GET /api/v1/instances/{name}","text":"<p>Get details of a specific instance  </p> Description <p>Returns the details of a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1instancesname","title":"POST /api/v1/instances/{name}","text":"<p>Create and start a new instance  </p> Description <p>Creates a new instance with the provided configuration options 
 </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <code>options</code> body None No Instance configuration options <p> Response 201 Created </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#put-apiv1instancesname","title":"PUT /api/v1/instances/{name}","text":"<p>Update an instance's configuration  </p> Description <p>Updates the configuration of a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <code>options</code> body None No Instance configuration options <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1instancesnamelogs","title":"GET /api/v1/instances/{name}/logs","text":"<p>Get logs from a specific instance  </p> Description <p>Returns the logs from a specific instance by name with optional line limit  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>lines</code> query None No Number of lines to retrieve (default: all lines) <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1instancesnameproxy","title":"GET /api/v1/instances/{name}/proxy","text":"<p>Proxy requests to a specific instance, does not autostart instance if stopped  </p> Description <p>Forwards HTTP requests to the llama-server instance running on a specific port  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad 
Request </p> <p> Response 500 Internal Server Error </p> <p> Response 503 Service Unavailable </p>"},{"location":"api-reference/#post-apiv1instancesnameproxy","title":"POST /api/v1/instances/{name}/proxy","text":"<p>Proxy requests to a specific instance, does not autostart instance if stopped  </p> Description <p>Forwards HTTP requests to the llama-server instance running on a specific port  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p> <p> Response 503 Service Unavailable </p>"},{"location":"api-reference/#post-apiv1instancesnamerestart","title":"POST /api/v1/instances/{name}/restart","text":"<p>Restart a running instance  </p> Description <p>Restarts a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1instancesnamestart","title":"POST /api/v1/instances/{name}/start","text":"<p>Start a stopped instance  </p> Description <p>Starts a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-apiv1instancesnamestop","title":"POST /api/v1/instances/{name}/stop","text":"<p>Stop a running instance  </p> Description <p>Stops a specific instance by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None 
No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#nodes","title":"Nodes","text":""},{"location":"api-reference/#get-apiv1nodes","title":"GET /api/v1/nodes","text":"<p>List all configured nodes  </p> Description <p>Returns a map of all nodes configured in the server (node name -&gt; node config)  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-apiv1nodesname","title":"GET /api/v1/nodes/{name}","text":"<p>Get details of a specific node  </p> Description <p>Returns the details of a specific node by name  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Node Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 404 Not Found </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#llamacpp","title":"Llama.cpp","text":""},{"location":"api-reference/#get-llama-cppname","title":"GET /llama-cpp/{name}/","text":"<p>Proxy requests to llama.cpp UI for the instance  </p> Description <p>Proxies requests to the llama.cpp UI for the specified instance  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> query None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameapply-template","title":"POST /llama-cpp/{name}/apply-template","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default 
Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamecompletion","title":"POST /llama-cpp/{name}/completion","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamedetokenize","title":"POST /llama-cpp/{name}/detokenize","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameembeddings","title":"POST /llama-cpp/{name}/embeddings","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameinfill","title":"POST /llama-cpp/{name}/infill","text":"<p>Proxy requests to llama.cpp 
server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamemetrics","title":"POST /llama-cpp/{name}/metrics","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-llama-cppnameprops","title":"GET /llama-cpp/{name}/props","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnameprops","title":"POST /llama-cpp/{name}/props","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 
Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnamereranking","title":"POST /llama-cpp/{name}/reranking","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-llama-cppnameslots","title":"GET /llama-cpp/{name}/slots","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#post-llama-cppnametokenize","title":"POST /llama-cpp/{name}/tokenize","text":"<p>Proxy requests to llama.cpp server instance  </p> Description <p>Proxies requests to the specified llama.cpp server instance, starting it on- demand if configured  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <code>name</code> path None No Instance Name <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#openai","title":"OpenAI","text":""},{"location":"api-reference/#post-v1","title":"POST /v1/","text":"<p>OpenAI-compatible proxy endpoint  </p> Description <p>Handles all POST requests to /v1/*, routing to the appropriate instance based on the request body. 
Requires API key authentication via the <code>Authorization</code> header.  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 400 Bad Request </p> <p> Response 500 Internal Server Error </p>"},{"location":"api-reference/#get-v1models","title":"GET /v1/models","text":"<p>List instances in OpenAI-compatible format  </p> Description <p>Returns a list of instances in a format compatible with OpenAI API  </p> <p>Input parameters </p> Parameter In Type Default Nullable Description <code>ApiKeyAuth</code> header string N/A No <p> Response 200 OK </p> <p> Response 500 Internal Server Error </p>"},{"location":"configuration/","title":"Configuration","text":"<p>llamactl can be configured via configuration files or environment variables. Configuration is loaded in the following order of precedence:  </p> <pre><code>Defaults &lt; Configuration file &lt; Environment variables\n</code></pre> <p>llamactl works out of the box with sensible defaults, but you can customize the behavior to suit your needs.  
</p>"},{"location":"configuration/#default-configuration","title":"Default Configuration","text":"<p>Here's the default configuration with all available options:  </p> <pre><code>server:\n  host: \"0.0.0.0\"                # Server host to bind to\n  port: 8080                     # Server port to bind to\n  allowed_origins: [\"*\"]         # Allowed CORS origins (default: all)\n  allowed_headers: [\"*\"]         # Allowed CORS headers (default: all)\n  enable_swagger: false          # Enable Swagger UI for API docs\n\nbackends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    response_headers: {}         # Additional response headers to send with responses\n\ndata_dir: ~/.local/share/llamactl  # Main data directory (database, instances, logs), default varies by OS\n\ninstances:\n  port_range: [8000, 9000]         # Port range for instances\n  configs_dir: data_dir/instances  # Instance configs directory\n  logs_dir: data_dir/logs          # Logs directory\n  auto_create_dirs: true           # Auto-create data/config/logs dirs if missing\n  
max_instances: -1                # Max instances (-1 = unlimited)\n  max_running_instances: -1        # Max running instances (-1 = unlimited)\n  enable_lru_eviction: true        # Enable LRU eviction for idle instances\n  default_auto_restart: true       # Auto-restart new instances by default\n  default_max_restarts: 3          # Max restarts for new instances\n  default_restart_delay: 5         # Restart delay (seconds) for new instances\n  default_on_demand_start: true    # Default on-demand start setting\n  on_demand_start_timeout: 120     # Default on-demand start timeout in seconds\n  timeout_check_interval: 5        # Idle instance timeout check in minutes\n\ndatabase:\n  path: data_dir/llamactl.db              # Database file path\n  max_open_connections: 25       # Maximum open database connections\n  max_idle_connections: 5        # Maximum idle database connections\n  connection_max_lifetime: 5m    # Connection max lifetime\n\nauth:\n  require_inference_auth: true   # Require auth for inference endpoints\n  require_management_auth: true  # Require auth for management endpoints\n  management_keys: []            # Keys for management endpoints\n\nlocal_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration for multi-node deployment\n  main:                          # Default local node (empty config)\n</code></pre>"},{"location":"configuration/#configuration-files","title":"Configuration Files","text":""},{"location":"configuration/#configuration-file-locations","title":"Configuration File Locations","text":"<p>Configuration files are searched in the following locations (in order of precedence, first found is used):  </p> <p>Linux: - <code>./llamactl.yaml</code> or <code>./config.yaml</code> (current directory) - <code>$HOME/.config/llamactl/config.yaml</code> - <code>/etc/llamactl/config.yaml</code> </p> <p>macOS: - <code>./llamactl.yaml</code> or <code>./config.yaml</code> 
(current directory) - <code>$HOME/Library/Application Support/llamactl/config.yaml</code> - <code>/Library/Application Support/llamactl/config.yaml</code> </p> <p>Windows: - <code>./llamactl.yaml</code> or <code>./config.yaml</code> (current directory) - <code>%APPDATA%\\llamactl\\config.yaml</code> - <code>%USERPROFILE%\\llamactl\\config.yaml</code> - <code>%PROGRAMDATA%\\llamactl\\config.yaml</code> </p> <p>You can specify the path to config file with <code>LLAMACTL_CONFIG_PATH</code> environment variable.  </p>"},{"location":"configuration/#configuration-options","title":"Configuration Options","text":""},{"location":"configuration/#server-configuration","title":"Server Configuration","text":"<pre><code>server:\n  host: \"0.0.0.0\"         # Server host to bind to (default: \"0.0.0.0\")\n  port: 8080              # Server port to bind to (default: 8080)\n  allowed_origins: [\"*\"]  # CORS allowed origins (default: [\"*\"])\n  allowed_headers: [\"*\"]  # CORS allowed headers (default: [\"*\"])\n  enable_swagger: false   # Enable Swagger UI (default: false)\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_HOST</code> - Server host - <code>LLAMACTL_PORT</code> - Server port - <code>LLAMACTL_ALLOWED_ORIGINS</code> - Comma-separated CORS origins - <code>LLAMACTL_ENABLE_SWAGGER</code> - Enable Swagger UI (true/false)  </p>"},{"location":"configuration/#backend-configuration","title":"Backend Configuration","text":"<pre><code>backends:\n  llama-cpp:\n    command: \"llama-server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"ghcr.io/ggml-org/llama.cpp:server\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    
environment: {}              # Environment variables for the backend process\n    docker:\n      enabled: false             # Enable Docker runtime (default: false)\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n      environment: {}\n    response_headers: {}         # Additional response headers to send with responses\n\n  mlx:\n    command: \"mlx_lm.server\"\n    args: []\n    environment: {}              # Environment variables for the backend process\n    # MLX does not support Docker\n    response_headers: {}         # Additional response headers to send with responses\n</code></pre> <p>Backend Configuration Fields: - <code>command</code>: Executable name/path for the backend - <code>args</code>: Default arguments prepended to all instances - <code>environment</code>: Environment variables for the backend process (optional) - <code>response_headers</code>: Additional response headers to send with responses (optional) - <code>docker</code>: Docker-specific configuration (optional)   - <code>enabled</code>: Boolean flag to enable Docker runtime   - <code>image</code>: Docker image to use   - <code>args</code>: Additional arguments passed to <code>docker run</code>   - <code>environment</code>: Environment variables for the container (optional)  </p> <p>If llamactl is behind an NGINX proxy, <code>X-Accel-Buffering: no</code> response header may be required for NGINX to properly stream the responses without buffering.</p> <p>Environment Variables: </p> <p>LlamaCpp Backend: - <code>LLAMACTL_LLAMACPP_COMMAND</code> - LlamaCpp executable command - <code>LLAMACTL_LLAMACPP_ARGS</code> - Space-separated default arguments - <code>LLAMACTL_LLAMACPP_ENV</code> - Environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_LLAMACPP_DOCKER_ENABLED</code> - Enable Docker runtime (true/false) - <code>LLAMACTL_LLAMACPP_DOCKER_IMAGE</code> - Docker image to use - 
<code>LLAMACTL_LLAMACPP_DOCKER_ARGS</code> - Space-separated Docker arguments - <code>LLAMACTL_LLAMACPP_DOCKER_ENV</code> - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_LLAMACPP_RESPONSE_HEADERS</code> - Response headers in format \"KEY1=value1;KEY2=value2\"  </p> <p>VLLM Backend: - <code>LLAMACTL_VLLM_COMMAND</code> - VLLM executable command - <code>LLAMACTL_VLLM_ARGS</code> - Space-separated default arguments - <code>LLAMACTL_VLLM_ENV</code> - Environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_VLLM_DOCKER_ENABLED</code> - Enable Docker runtime (true/false) - <code>LLAMACTL_VLLM_DOCKER_IMAGE</code> - Docker image to use - <code>LLAMACTL_VLLM_DOCKER_ARGS</code> - Space-separated Docker arguments - <code>LLAMACTL_VLLM_DOCKER_ENV</code> - Docker environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_VLLM_RESPONSE_HEADERS</code> - Response headers in format \"KEY1=value1;KEY2=value2\"  </p> <p>MLX Backend: - <code>LLAMACTL_MLX_COMMAND</code> - MLX executable command - <code>LLAMACTL_MLX_ARGS</code> - Space-separated default arguments - <code>LLAMACTL_MLX_ENV</code> - Environment variables in format \"KEY1=value1,KEY2=value2\" - <code>LLAMACTL_MLX_RESPONSE_HEADERS</code> - Response headers in format \"KEY1=value1;KEY2=value2\"  </p>"},{"location":"configuration/#data-directory-configuration","title":"Data Directory Configuration","text":"<pre><code>data_dir: \"~/.local/share/llamactl\"  # Main data directory for database, instances, and logs (default varies by OS)\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_DATA_DIRECTORY</code> - Main data directory path  </p> <p>Default Data Directory by Platform: - Linux: <code>~/.local/share/llamactl</code> - macOS: <code>~/Library/Application Support/llamactl</code> - Windows: <code>%LOCALAPPDATA%\\llamactl</code> or <code>%PROGRAMDATA%\\llamactl</code> </p>"},{"location":"configuration/#instance-configuration","title":"Instance 
Configuration","text":"<pre><code>instances:\n  port_range: [8000, 9000]      # Port range for instances (default: [8000, 9000])\n  configs_dir: \"instances\"      # Directory for instance configs, default: data_dir/instances\n  logs_dir: \"logs\"              # Directory for instance logs, default: data_dir/logs\n  auto_create_dirs: true        # Automatically create data/config/logs directories (default: true)\n  max_instances: -1             # Maximum instances (-1 = unlimited)\n  max_running_instances: -1     # Maximum running instances (-1 = unlimited)\n  enable_lru_eviction: true     # Enable LRU eviction for idle instances\n  default_auto_restart: true    # Default auto-restart setting\n  default_max_restarts: 3       # Default maximum restart attempts\n  default_restart_delay: 5      # Default restart delay in seconds\n  default_on_demand_start: true # Default on-demand start setting\n  on_demand_start_timeout: 120  # Default on-demand start timeout in seconds\n  timeout_check_interval: 5     # Default instance timeout check interval in minutes\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_INSTANCE_PORT_RANGE</code> - Port range (format: \"8000-9000\" or \"8000,9000\") - <code>LLAMACTL_INSTANCES_DIR</code> - Instance configs directory path - <code>LLAMACTL_LOGS_DIR</code> - Log directory path - <code>LLAMACTL_AUTO_CREATE_DATA_DIR</code> - Auto-create data/config/logs directories (true/false) - <code>LLAMACTL_MAX_INSTANCES</code> - Maximum number of instances - <code>LLAMACTL_MAX_RUNNING_INSTANCES</code> - Maximum number of running instances - <code>LLAMACTL_ENABLE_LRU_EVICTION</code> - Enable LRU eviction for idle instances - <code>LLAMACTL_DEFAULT_AUTO_RESTART</code> - Default auto-restart setting (true/false) - <code>LLAMACTL_DEFAULT_MAX_RESTARTS</code> - Default maximum restarts - <code>LLAMACTL_DEFAULT_RESTART_DELAY</code> - Default restart delay in seconds - <code>LLAMACTL_DEFAULT_ON_DEMAND_START</code> - Default on-demand start setting 
(true/false) - <code>LLAMACTL_ON_DEMAND_START_TIMEOUT</code> - Default on-demand start timeout in seconds - <code>LLAMACTL_TIMEOUT_CHECK_INTERVAL</code> - Default instance timeout check interval in minutes  </p>"},{"location":"configuration/#database-configuration","title":"Database Configuration","text":"<pre><code>database:\n  path: \"llamactl.db\"              # Database file path, default: data_dir/llamactl.db\n  max_open_connections: 25         # Maximum open database connections (default: 25)\n  max_idle_connections: 5          # Maximum idle database connections (default: 5)\n  connection_max_lifetime: 5m      # Connection max lifetime (default: 5m)\n</code></pre> <p>Environment Variables: - <code>LLAMACTL_DATABASE_PATH</code> - Database file path (relative to data_dir or absolute) - <code>LLAMACTL_DATABASE_MAX_OPEN_CONNECTIONS</code> - Maximum open database connections - <code>LLAMACTL_DATABASE_MAX_IDLE_CONNECTIONS</code> - Maximum idle database connections - <code>LLAMACTL_DATABASE_CONN_MAX_LIFETIME</code> - Connection max lifetime (e.g., \"5m\", \"1h\")  </p>"},{"location":"configuration/#authentication-configuration","title":"Authentication Configuration","text":"<p>llamactl supports two types of authentication:  </p> <ul> <li>Management API Keys: For accessing the web UI and management API (creating/managing instances). These can be configured in the config file or via environment variables.  </li> <li>Inference API Keys: For accessing the OpenAI-compatible inference endpoints. These are managed via the web UI (Settings \u2192 API Keys) and stored in the database.  
</li> </ul> <pre><code>auth:\n  require_inference_auth: true           # Require API key for OpenAI endpoints (default: true)\n  require_management_auth: true          # Require API key for management endpoints (default: true)\n  management_keys: []                    # List of valid management API keys\n</code></pre> <p>Managing Inference API Keys: </p> <p>Inference API keys are managed through the web UI or management API and stored in the database. To create and manage inference keys:  </p> <ol> <li>Open the web UI and log in with a management API key  </li> <li>Navigate to Settings \u2192 API Keys </li> <li>Click Create API Key </li> <li>Configure the key:  </li> <li>Name: A descriptive name for the key  </li> <li>Expiration: Optional expiration date  </li> <li>Permissions: Grant access to all instances or specific instances only  </li> <li>Copy the generated key - it won't be shown again  </li> </ol> <p>Environment Variables: - <code>LLAMACTL_REQUIRE_INFERENCE_AUTH</code> - Require auth for OpenAI endpoints (true/false) - <code>LLAMACTL_REQUIRE_MANAGEMENT_AUTH</code> - Require auth for management endpoints (true/false) - <code>LLAMACTL_MANAGEMENT_KEYS</code> - Comma-separated management API keys  </p>"},{"location":"configuration/#remote-node-configuration","title":"Remote Node Configuration","text":"<p>llamactl supports remote node deployments. Configure remote nodes to deploy instances on remote hosts and manage them centrally.  
</p> <pre><code>local_node: \"main\"               # Name of the local node (default: \"main\")\nnodes:                           # Node configuration map\n  main:                          # Local node (empty address means local)\n    address: \"\"                  # Not used for local node\n    api_key: \"\"                  # Not used for local node\n  worker1:                       # Remote worker node\n    address: \"http://192.168.1.10:8080\"\n    api_key: \"worker1-api-key\"   # Management API key for authentication\n</code></pre> <p>Node Configuration Fields: - <code>local_node</code>: Specifies which node in the <code>nodes</code> map represents the local node. Must match exactly what other nodes call this node. - <code>nodes</code>: Map of node configurations   - <code>address</code>: HTTP/HTTPS URL of the remote node (empty for local node)   - <code>api_key</code>: Management API key for authenticating with the remote node  </p> <p>Environment Variables: - <code>LLAMACTL_LOCAL_NODE</code> - Name of the local node  </p>"},{"location":"installation/","title":"Installation","text":"<p>This guide will walk you through installing Llamactl on your system.  </p>"},{"location":"installation/#prerequisites","title":"Prerequisites","text":""},{"location":"installation/#backend-dependencies","title":"Backend Dependencies","text":"<p>llamactl supports multiple backends. Install at least one:  </p> <p>For llama.cpp backend (all platforms): </p> <p>You need <code>llama-server</code> from llama.cpp installed:  </p> <pre><code># Homebrew (macOS/Linux)\nbrew install llama.cpp\n# Winget (Windows)\nwinget install llama.cpp\n</code></pre> <p>Or build from source - see llama.cpp docs  </p> <p>For MLX backend (macOS only): </p> <p>MLX provides optimized inference on Apple Silicon. 
Install MLX-LM:  </p> <pre><code># Install via pip (requires Python 3.8+)\npip install mlx-lm\n\n# Or in a virtual environment (recommended)\npython -m venv mlx-env\nsource mlx-env/bin/activate\npip install mlx-lm\n</code></pre> <p>Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)  </p> <p>For vLLM backend: </p> <p>vLLM provides high-throughput distributed serving for LLMs. Install vLLM:  </p> <pre><code># Install in a virtual environment\npython -m venv vllm-env\nsource vllm-env/bin/activate\npip install vllm\n</code></pre>"},{"location":"installation/#installation-methods","title":"Installation Methods","text":""},{"location":"installation/#option-1-download-binary-recommended","title":"Option 1: Download Binary (Recommended)","text":"<p>Download the latest release from the GitHub releases page:  </p> <pre><code># Linux/macOS - Get latest version and download\nLATEST_VERSION=$(curl -s https://api.github.com/repos/lordmathis/llamactl/releases/latest | grep '\"tag_name\":' | sed -E 's/.*\"([^\"]+)\".*/\\1/')\ncurl -L https://github.com/lordmathis/llamactl/releases/download/${LATEST_VERSION}/llamactl-${LATEST_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m).tar.gz | tar -xz\nsudo mv llamactl /usr/local/bin/\n\n# Or download manually from:\n# https://github.com/lordmathis/llamactl/releases/latest\n\n# Windows - Download from releases page\n</code></pre>"},{"location":"installation/#option-2-docker","title":"Option 2: Docker","text":"<p>llamactl provides Dockerfiles for creating Docker images with backends pre-installed. The resulting images include the latest llamactl release with the respective backend.  
</p> <p>Available Dockerfiles (CUDA): - llamactl with llama.cpp CUDA: <code>docker/Dockerfile.llamacpp</code> (based on <code>ghcr.io/ggml-org/llama.cpp:server-cuda</code>) - llamactl with vLLM CUDA: <code>docker/Dockerfile.vllm</code> (based on <code>vllm/vllm-openai:latest</code>) - llamactl built from source: <code>docker/Dockerfile.source</code> (multi-stage build with webui)  </p> <p>Note: These Dockerfiles are configured for CUDA. For other platforms (CPU, ROCm, Vulkan, etc.), adapt the base image. For llama.cpp, see available tags at llama.cpp Docker docs. For vLLM, check vLLM docs.  </p> <p>Using Docker Compose </p> <pre><code># Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Create directories for data and models\nmkdir -p data/llamacpp data/vllm models\n\n# Start llamactl with llama.cpp backend\ndocker-compose -f docker/docker-compose.yml up llamactl-llamacpp -d\n\n# Or start llamactl with vLLM backend\ndocker-compose -f docker/docker-compose.yml up llamactl-vllm -d\n</code></pre> <p>Access the dashboard at: - llamactl with llama.cpp: http://localhost:8080 - llamactl with vLLM: http://localhost:8081  </p> <p>Using Docker Build and Run </p> <ol> <li> <p>llamactl with llama.cpp CUDA: <pre><code>docker build -f docker/Dockerfile.llamacpp -t llamactl:llamacpp-cuda .\ndocker run -d \\\n  --name llamactl-llamacpp \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/llama.cpp:/root/.cache/llama.cpp \\\n  llamactl:llamacpp-cuda\n</code></pre></p> </li> <li> <p>llamactl with vLLM CUDA: <pre><code>docker build -f docker/Dockerfile.vllm -t llamactl:vllm-cuda .\ndocker run -d \\\n  --name llamactl-vllm \\\n  --gpus all \\\n  -p 8080:8080 \\\n  -v ~/.cache/huggingface:/root/.cache/huggingface \\\n  llamactl:vllm-cuda\n</code></pre></p> </li> <li> <p>llamactl built from source: <pre><code>docker build -f docker/Dockerfile.source -t llamactl:source .\ndocker run -d \\\n  --name llamactl \\\n  -p 8080:8080 \\\n  
llamactl:source\n</code></pre></p> </li> </ol>"},{"location":"installation/#option-3-build-from-source","title":"Option 3: Build from Source","text":"<p>Requirements: - Go 1.24 or later - Node.js 22 or later - Git  </p> <p>If you prefer to build from source:  </p> <pre><code># Clone the repository\ngit clone https://github.com/lordmathis/llamactl.git\ncd llamactl\n\n# Build the web UI\ncd webui &amp;&amp; npm ci &amp;&amp; npm run build &amp;&amp; cd ..\n\n# Build the application\ngo build -o llamactl ./cmd/server\n</code></pre>"},{"location":"installation/#remote-node-installation","title":"Remote Node Installation","text":"<p>For deployments with remote nodes: - Install llamactl on each node using any of the methods above - Configure API keys for authentication between nodes - Ensure node names are consistent across all configurations  </p>"},{"location":"installation/#verification","title":"Verification","text":"<p>Verify your installation by checking the version:  </p> <pre><code>llamactl --version\n</code></pre>"},{"location":"installation/#next-steps","title":"Next Steps","text":"<p>Now that Llamactl is installed, continue to the Quick Start guide to get your first instance running!  </p> <p>For remote node deployments, see the Configuration Guide for node setup instructions.  </p>"},{"location":"managing-instances/","title":"Managing Instances","text":"<p>Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.  
</p>"},{"location":"managing-instances/#overview","title":"Overview","text":"<p>Llamactl provides two ways to manage instances:  </p> <ul> <li>Web UI: Accessible at <code>http://localhost:8080</code> with an intuitive dashboard  </li> <li>REST API: Programmatic access for automation and integration  </li> </ul> <p> </p>"},{"location":"managing-instances/#authentication","title":"Authentication","text":"<p>Llamactl uses a Management API Key to authenticate requests to the management API (creating, starting, stopping instances). All curl examples below use <code>&lt;token&gt;</code> as a placeholder - replace this with your actual Management API Key.  </p> <p>By default, authentication is required. If you don't configure a management API key in your configuration file, llamactl will auto-generate one and print it to the terminal on startup. See the Configuration guide for details.  </p> <p>For Web UI access: 1. Navigate to the web UI 2. Enter your Management API Key 3. Bearer token is stored for the session  </p>"},{"location":"managing-instances/#theme-support","title":"Theme Support","text":"<ul> <li>Switch between light and dark themes  </li> <li>Setting is remembered across sessions  </li> </ul>"},{"location":"managing-instances/#instance-cards","title":"Instance Cards","text":"<p>Each instance is displayed as a card showing:  </p> <ul> <li>Instance name </li> <li>Health status badge (unknown, ready, error, failed)  </li> <li>Action buttons (start, stop, edit, logs, delete)  </li> </ul>"},{"location":"managing-instances/#create-instance","title":"Create Instance","text":"<p>Via Web UI </p> <p> </p> <ol> <li>Click the \"Create Instance\" button on the dashboard  </li> <li>Optional: Click \"Import\" to load a previously exported configuration  </li> </ol> <p>Instance Settings: </p> <ol> <li>Enter a unique Instance Name (required)  </li> <li>Select Node: Choose which node to deploy the instance to  </li> <li>Configure Auto Restart settings:  <ul> <li>Enable 
automatic restart on failure  </li> <li>Set max restarts and delay between attempts  </li> </ul> </li> <li>Configure basic instance options:  <ul> <li>Idle Timeout: Minutes before stopping idle instance  </li> <li>On Demand Start: Start instance only when needed  </li> </ul> </li> </ol> <p>Backend Configuration: </p> <ol> <li>Select Backend Type:  <ul> <li>Llama Server: For GGUF models using llama-server  </li> <li>MLX LM: For MLX-optimized models (macOS only)  </li> <li>vLLM: For distributed serving and high-throughput inference  </li> </ul> </li> <li>Optional: Click \"Parse Command\" to import settings from an existing backend command  </li> <li>Configure Execution Context:  <ul> <li>Enable Docker: Run backend in Docker container  </li> <li>Command Override: Custom path to backend executable  </li> <li>Environment Variables: Custom environment variables  </li> </ul> </li> </ol> <p>Auto-Assignment</p> <p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values.  </p> <ol> <li>Configure Basic Backend Options (varies by backend):  <ul> <li>llama.cpp: Model path, threads, context size, GPU layers, etc.  </li> <li>MLX: Model identifier, temperature, max tokens, etc.  </li> <li>vLLM: Model identifier, tensor parallel size, GPU memory utilization, etc.  
</li> </ul> </li> <li>Optional: Expand Advanced Backend Options for additional settings  </li> <li>Optional: Add Extra Args as key-value pairs for custom command-line arguments  </li> <li>Click \"Create\" to save the instance  </li> </ol> <p>Via API </p> <pre><code># Create llama.cpp instance with local model file\ncurl -X POST http://localhost:8080/api/v1/instances/my-llama-instance \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\",\n      \"threads\": 8,\n      \"ctx_size\": 4096,\n      \"gpu_layers\": 32,\n      \"flash_attn\": \"on\"\n    },\n    \"auto_restart\": true,\n    \"max_restarts\": 3,\n    \"docker_enabled\": false,\n    \"command_override\": \"/opt/llama-server-dev\",\n    \"nodes\": [\"main\"]\n  }'\n\n# Create vLLM instance with environment variables\ncurl -X POST http://localhost:8080/api/v1/instances/my-vllm-instance \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_type\": \"vllm\",\n    \"backend_options\": {\n      \"model\": \"microsoft/DialoGPT-medium\",\n      \"tensor_parallel_size\": 2,\n      \"gpu_memory_utilization\": 0.9\n    },\n    \"on_demand_start\": true,\n    \"environment\": {\n      \"CUDA_VISIBLE_DEVICES\": \"0,1\"\n    },\n    \"nodes\": [\"worker1\", \"worker2\"]\n  }'\n\n# Create MLX instance (macOS only)\ncurl -X POST http://localhost:8080/api/v1/instances/my-mlx-instance \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_type\": \"mlx_lm\",\n    \"backend_options\": {\n      \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n      \"temp\": 0.7,\n      \"max_tokens\": 2048\n    },\n    \"nodes\": [\"main\"]\n  }'\n</code></pre>"},{"location":"managing-instances/#start-instance","title":"Start 
Instance","text":"<p>Via Web UI 1. Click the \"Start\" button on an instance card 2. Watch the status change to \"Unknown\" 3. Monitor progress in the logs 4. Instance status changes to \"Ready\" when ready  </p> <p>Via API <pre><code>curl -X POST http://localhost:8080/api/v1/instances/{name}/start \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre></p>"},{"location":"managing-instances/#stop-instance","title":"Stop Instance","text":"<p>Via Web UI 1. Click the \"Stop\" button on an instance card 2. Instance gracefully shuts down  </p> <p>Via API <pre><code>curl -X POST http://localhost:8080/api/v1/instances/{name}/stop \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre></p>"},{"location":"managing-instances/#edit-instance","title":"Edit Instance","text":"<p>Via Web UI 1. Click the \"Edit\" button on an instance card 2. Modify settings in the configuration dialog 3. Changes require instance restart to take effect 4. Click \"Update &amp; Restart\" to apply changes  </p> <p>Via API Modify instance settings:  </p> <pre><code>curl -X PUT http://localhost:8080/api/v1/instances/{name} \\\n  -H \"Content-Type: application/json\" \\\n  -H \"Authorization: Bearer &lt;token&gt;\" \\\n  -d '{\n    \"backend_options\": {\n      \"threads\": 8,\n      \"ctx_size\": 4096\n    }\n  }'\n</code></pre> <p>Note</p> <p>Configuration changes require restarting the instance to take effect.  </p>"},{"location":"managing-instances/#export-instance","title":"Export Instance","text":"<p>Via Web UI 1. Click the \"More actions\" button (three dots) on an instance card 2. 
Click \"Export\" to download the instance configuration as a JSON file  </p>"},{"location":"managing-instances/#view-logs","title":"View Logs","text":"<p>Via Web UI </p> <ol> <li>Click the \"Logs\" button on any instance card  </li> <li>Real-time log viewer opens  </li> </ol> <p>Via API Retrieve the instance logs:  </p> <pre><code># Get instance logs\ncurl http://localhost:8080/api/v1/instances/{name}/logs \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre>"},{"location":"managing-instances/#delete-instance","title":"Delete Instance","text":"<p>Via Web UI 1. Click the \"Delete\" button on an instance card 2. Only stopped instances can be deleted 3. Confirm deletion in the dialog  </p> <p>Via API <pre><code>curl -X DELETE http://localhost:8080/api/v1/instances/{name} \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre></p>"},{"location":"managing-instances/#instance-proxy","title":"Instance Proxy","text":"<p>Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).  </p> <pre><code># Proxy requests to the instance\ncurl http://localhost:8080/api/v1/instances/{name}/proxy/ \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre> <p>All backends provide OpenAI-compatible endpoints. Check the respective documentation: - llama-server docs - MLX-LM docs - vLLM docs </p>"},{"location":"managing-instances/#instance-health","title":"Instance Health","text":"<p>Via Web UI </p> <ol> <li>The health status badge is displayed on each instance card  </li> </ol> <p>Via API </p> <p>Check the health status of your instances:  </p> <pre><code>curl http://localhost:8080/api/v1/instances/{name}/proxy/health \\\n  -H \"Authorization: Bearer &lt;token&gt;\"\n</code></pre>"},{"location":"quick-start/","title":"Quick Start","text":"<p>This guide will help you get Llamactl up and running in just a few minutes.  </p> <p>Before you begin: Ensure you have at least one backend installed (llama.cpp, MLX, or vLLM). 
See the Installation Guide for backend setup.  </p>"},{"location":"quick-start/#core-concepts","title":"Core Concepts","text":"<p>Before you start, let's clarify a few key terms:  </p> <ul> <li>Instance: A running backend server that serves a specific model. Each instance has a unique name and runs independently.  </li> <li>Backend: The inference engine that actually runs the model (llama.cpp, MLX, or vLLM). You need at least one backend installed before creating instances.  </li> <li>Node: In multi-machine setups, a node represents one machine. Most users will just use the default \"main\" node for single-machine deployments.  </li> <li>Proxy Architecture: Llamactl acts as a proxy in front of your instances. You make requests to llamactl (e.g., <code>http://localhost:8080/v1/chat/completions</code>), and it routes them to the appropriate backend instance. This means you don't need to track individual instance ports or endpoints.  </li> </ul>"},{"location":"quick-start/#authentication","title":"Authentication","text":"<p>Llamactl uses two types of API keys:  </p> <ul> <li>Management API Key: Used to authenticate with the Llamactl management API and web UI. If not configured, one is auto-generated at startup and printed to the terminal.  </li> <li>Inference API Key: Used to authenticate requests to the OpenAI-compatible endpoints (<code>/v1/chat/completions</code>, <code>/v1/completions</code>, etc.). These are created and managed via the web UI.  </li> </ul> <p>By default, authentication is required for both management and inference endpoints. You can configure custom management keys or disable authentication in the Configuration guide.  
</p>"},{"location":"quick-start/#start-llamactl","title":"Start Llamactl","text":"<p>Start the Llamactl server:  </p> <pre><code>llamactl\n</code></pre> <pre><code>\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\u26a0\ufe0f  MANAGEMENT AUTHENTICATION REQUIRED\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\ud83d\udd11  Generated Management API Key:\n\n    sk-management-...\n\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\u26a0\ufe0f  IMPORTANT\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\n\u2022 This key is auto-generated and will change on restart\n\u2022 For production, add explicit management_keys to your configuration\n\u2022 Copy this key before it disappears from the terminal\n\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\nLlamactl server listening on 0.0.0.0:8080\n</code></pre> <p>Copy the Management API Key from the terminal - you'll need it to access the web UI.  
</p> <p>By default, Llamactl will start on <code>http://localhost:8080</code>.  </p>"},{"location":"quick-start/#access-the-web-ui","title":"Access the Web UI","text":"<p>Open your web browser and navigate to:  </p> <pre><code>http://localhost:8080\n</code></pre> <p>Login with the management API key from the terminal output.  </p> <p>You should see the Llamactl web interface.  </p>"},{"location":"quick-start/#create-your-first-instance","title":"Create Your First Instance","text":"<ol> <li>Click the \"Add Instance\" button  </li> <li> <p>Fill in the instance configuration:  </p> <ul> <li>Name: Give your instance a descriptive name  </li> <li>Node: Select which node to deploy the instance to (defaults to \"main\" for single-node setups)  </li> <li>Backend Type: Choose from llama.cpp, MLX, or vLLM  </li> <li>Model: Model path or huggingface repo  </li> <li>Additional Options: Backend-specific parameters  </li> </ul> <p>Auto-Assignment</p> <p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and manages API keys if authentication is enabled. You typically don't need to manually specify these values.  </p> <p>Remote Node Deployment</p> <p>If you have configured remote nodes in your configuration file, you can select which node to deploy the instance to. This allows you to distribute instances across multiple machines. See the Configuration guide for details on setting up remote nodes.  
</p> </li> <li> <p>Click \"Create Instance\"  </p> </li> </ol>"},{"location":"quick-start/#start-your-instance","title":"Start Your Instance","text":"<p>Once created, you can:  </p> <ul> <li>Start the instance by clicking the start button  </li> <li>Monitor its status in real-time  </li> <li>View logs by clicking the logs button  </li> <li>Stop the instance when needed  </li> </ul>"},{"location":"quick-start/#create-an-inference-api-key","title":"Create an Inference API Key","text":"<p>To make inference requests to your instances, you'll need an inference API key:  </p> <ol> <li>In the web UI, click the Settings icon (gear icon in the top-right)  </li> <li>Navigate to the API Keys tab  </li> <li>Click Create API Key </li> <li>Configure your key:  </li> <li>Name: Give it a descriptive name (e.g., \"Production Key\", \"Development Key\")  </li> <li>Expiration: Optionally set an expiration date for the key  </li> <li>Permissions: Choose whether the key can access all instances or only specific ones  </li> <li>Click Create </li> <li>Copy the generated key - it will only be shown once!  </li> </ol> <p>The key will look like: <code>llamactl-...</code> </p> <p>You can create multiple inference keys with different permissions for different use cases (e.g., one for development, one for production, or keys limited to specific instances).  
</p>"},{"location":"quick-start/#example-configurations","title":"Example Configurations","text":"<p>Here are basic example configurations for each backend:  </p> <p>llama.cpp backend: <pre><code>{\n  \"name\": \"llama2-7b\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/llama-2-7b-chat.gguf\",\n    \"threads\": 4,\n    \"ctx_size\": 2048,\n    \"gpu_layers\": 32\n  },\n  \"nodes\": [\"main\"]\n}\n</code></pre></p> <p>MLX backend (macOS only): <pre><code>{\n  \"name\": \"mistral-mlx\",\n  \"backend_type\": \"mlx_lm\",\n  \"backend_options\": {\n    \"model\": \"mlx-community/Mistral-7B-Instruct-v0.3-4bit\",\n    \"temp\": 0.7,\n    \"max_tokens\": 2048\n  },\n  \"nodes\": [\"main\"]\n}\n</code></pre></p> <p>vLLM backend: <pre><code>{\n  \"name\": \"dialogpt-vllm\",\n  \"backend_type\": \"vllm\",\n  \"backend_options\": {\n    \"model\": \"microsoft/DialoGPT-medium\",\n    \"tensor_parallel_size\": 2,\n    \"gpu_memory_utilization\": 0.9\n  },\n  \"nodes\": [\"main\"]\n}\n</code></pre></p> <p>Remote node deployment example: <pre><code>{\n  \"name\": \"distributed-model\",\n  \"backend_type\": \"llama_cpp\",\n  \"backend_options\": {\n    \"model\": \"/path/to/model.gguf\",\n    \"gpu_layers\": 32\n  },\n  \"nodes\": [\"worker1\"]\n}\n</code></pre></p>"},{"location":"quick-start/#docker-support","title":"Docker Support","text":"<p>Llamactl can run backends in Docker containers. To enable Docker for a backend, add a <code>docker</code> section to that backend in your YAML configuration file (e.g. 
<code>config.yaml</code>) as shown below:  </p> <pre><code>backends:\n  vllm:\n    command: \"vllm\"\n    args: [\"serve\"]\n    docker:\n      enabled: true\n      image: \"vllm/vllm-openai:latest\"\n      args: [\"run\", \"--rm\", \"--network\", \"host\", \"--gpus\", \"all\", \"--shm-size\", \"1g\"]\n</code></pre>"},{"location":"quick-start/#using-the-api","title":"Using the API","text":"<p>You can also manage instances via the REST API:  </p> <pre><code># List all instances\ncurl http://localhost:8080/api/v1/instances\n\n# Create a new llama.cpp instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"backend_type\": \"llama_cpp\",\n    \"backend_options\": {\n      \"model\": \"/path/to/model.gguf\"\n    }\n  }'\n\n# Start an instance\ncurl -X POST http://localhost:8080/api/v1/instances/my-model/start\n</code></pre>"},{"location":"quick-start/#openai-compatible-api","title":"OpenAI Compatible API","text":"<p>Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.  </p>"},{"location":"quick-start/#chat-completions","title":"Chat Completions","text":"<p>Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:  </p> <pre><code>curl -X POST http://localhost:8080/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"my-model\",\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Hello! 
Can you help me write a Python function?\"\n      }\n    ],\n    \"max_tokens\": 150,\n    \"temperature\": 0.7\n  }'\n</code></pre>"},{"location":"quick-start/#using-with-python-openai-client","title":"Using with Python OpenAI Client","text":"<p>You can also use the official OpenAI Python client:  </p> <pre><code>from openai import OpenAI\n\n# Point the client to your Llamactl server\nclient = OpenAI(\n    base_url=\"http://localhost:8080/v1\",\n    api_key=\"your-inference-api-key\"  # Use the inference API key from terminal or config\n)\n\n# Create a chat completion\nresponse = client.chat.completions.create(\n    model=\"my-model\",  # Use the name of your instance\n    messages=[\n        {\"role\": \"user\", \"content\": \"Explain quantum computing in simple terms\"}\n    ],\n    max_tokens=200,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n</code></pre> <p>API Key</p> <p>If you disabled authentication in your config, you can use any value for <code>api_key</code> (e.g., <code>\"not-needed\"</code>). Otherwise, use the inference API key you created via the web UI (Settings \u2192 API Keys).  </p>"},{"location":"quick-start/#list-available-models","title":"List Available Models","text":"<p>Get a list of running instances (models) in OpenAI-compatible format:  </p> <pre><code>curl http://localhost:8080/v1/models\n</code></pre>"},{"location":"quick-start/#next-steps","title":"Next Steps","text":"<ul> <li>Manage instances Managing Instances </li> <li>Explore the API Reference </li> <li>Configure advanced settings in the Configuration guide  </li> </ul>"},{"location":"troubleshooting/","title":"Troubleshooting","text":"<p>Issues specific to Llamactl deployment and operation.  
</p>"},{"location":"troubleshooting/#configuration-issues","title":"Configuration Issues","text":""},{"location":"troubleshooting/#invalid-configuration","title":"Invalid Configuration","text":"<p>Problem: Invalid configuration preventing startup  </p> <p>Solutions: 1. Use minimal configuration: <pre><code>server:\n  host: \"0.0.0.0\"\n  port: 8080\ninstances:\n  port_range: [8000, 9000]\n</code></pre></p> <ol> <li>Check data directory permissions: <pre><code># Ensure data directory is writable (default: ~/.local/share/llamactl)\nmkdir -p ~/.local/share/llamactl/{instances,logs}\n</code></pre></li> </ol>"},{"location":"troubleshooting/#instance-management-issues","title":"Instance Management Issues","text":""},{"location":"troubleshooting/#instance-fails-to-start","title":"Instance Fails to Start","text":"<p>Problem: Instance fails to start or immediately stops  </p> <p>Solutions: </p> <ol> <li> <p>Check instance logs to see the actual error: <pre><code>curl http://localhost:8080/api/v1/instances/{name}/logs\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n</code></pre></p> </li> <li> <p>Verify backend is installed: </p> <ul> <li>llama.cpp: Ensure <code>llama-server</code> is in PATH  </li> <li>MLX: Ensure <code>mlx-lm</code> Python package is installed  </li> <li>vLLM: Ensure <code>vllm</code> Python package is installed  </li> </ul> </li> <li> <p>Check model path and format: </p> <ul> <li>Use absolute paths to model files  </li> <li>Verify model format matches backend (GGUF for llama.cpp, etc.)  
</li> </ul> </li> <li> <p>Verify backend command configuration: </p> <ul> <li>Check that the backend <code>command</code> is correctly configured in the global config  </li> <li>For virtual environments, specify the full path to the command (e.g., <code>/path/to/venv/bin/mlx_lm.server</code>)  </li> <li>See the Configuration Guide for backend configuration details  </li> <li>Test the backend directly (see Backend-Specific Issues below)  </li> </ul> </li> </ol>"},{"location":"troubleshooting/#backend-specific-issues","title":"Backend-Specific Issues","text":"<p>Problem: Model loading, memory, GPU, or performance issues  </p> <p>Most model-specific issues (memory, GPU configuration, performance tuning) are backend-specific and should be resolved by consulting the respective backend documentation:  </p> <p>llama.cpp: - llama.cpp GitHub - llama-server README </p> <p>MLX: - MLX-LM GitHub - MLX-LM Server Guide </p> <p>vLLM: - vLLM Documentation - OpenAI Compatible Server - vllm serve Command </p> <p>Testing backends directly: </p> <p>Testing your model and configuration directly with the backend helps determine if the issue is with llamactl or the backend itself:  </p> <pre><code># llama.cpp\nllama-server --model /path/to/model.gguf --port 8081\n\n# MLX\nmlx_lm.server --model mlx-community/Mistral-7B-Instruct-v0.3-4bit --port 8081\n\n# vLLM\nvllm serve microsoft/DialoGPT-medium --port 8081\n</code></pre>"},{"location":"troubleshooting/#api-and-network-issues","title":"API and Network Issues","text":""},{"location":"troubleshooting/#cors-errors","title":"CORS Errors","text":"<p>Problem: Web UI shows CORS errors in browser console  </p> <p>Solutions: 1. 
Configure allowed origins: <pre><code>server:\n  allowed_origins:\n    - \"http://localhost:3000\"\n    - \"https://yourdomain.com\"\n</code></pre></p>"},{"location":"troubleshooting/#authentication-issues","title":"Authentication Issues","text":"<p>Problem: API requests failing with authentication errors  </p> <p>Solutions: 1. Disable authentication temporarily: <pre><code>auth:\n  require_management_auth: false\n  require_inference_auth: false\n</code></pre></p> <ol> <li>Configure management API keys: <pre><code>auth:\n  management_keys:\n    - \"your-management-key\"\n</code></pre></li> </ol> <p>For inference API keys, create them via the web UI (Settings \u2192 API Keys) after logging in with your management key.  </p> <ol> <li>Use correct Authorization header: <pre><code>curl -H \"Authorization: Bearer your-api-key\" \\\n  http://localhost:8080/api/v1/instances\n</code></pre></li> </ol>"},{"location":"troubleshooting/#remote-node-issues","title":"Remote Node Issues","text":""},{"location":"troubleshooting/#node-configuration","title":"Node Configuration","text":"<p>Problem: Remote instances not appearing or cannot be managed  </p> <p>Solutions: 1. 
Verify node configuration: <pre><code>local_node: \"main\"  # Must match a key in nodes map\nnodes:\n  main:\n    address: \"\"     # Empty for local node\n  worker1:\n    address: \"http://worker1.internal:8080\"\n    api_key: \"secure-key\"  # Must match worker1's management key\n</code></pre></p> <ol> <li>Check node name consistency: </li> <li><code>local_node</code> on each node must match what other nodes call it  </li> <li> <p>Node names are case-sensitive  </p> </li> <li> <p>Test remote node connectivity: <pre><code>curl -H \"Authorization: Bearer remote-node-key\" \\\n  http://remote-node:8080/api/v1/instances\n</code></pre></p> </li> </ol>"},{"location":"troubleshooting/#debugging-and-logs","title":"Debugging and Logs","text":""},{"location":"troubleshooting/#viewing-instance-logs","title":"Viewing Instance Logs","text":"<pre><code># Get instance logs via API\ncurl http://localhost:8080/api/v1/instances/{name}/logs\n\n# Or check log files directly\ntail -f ~/.local/share/llamactl/logs/{instance-name}.log\n</code></pre>"},{"location":"troubleshooting/#enable-debug-logging","title":"Enable Debug Logging","text":"<pre><code>export LLAMACTL_LOG_LEVEL=debug\nllamactl\n</code></pre>"},{"location":"troubleshooting/#getting-help","title":"Getting Help","text":"<p>When reporting issues, include:  </p> <ol> <li> <p>System information: <pre><code>llamactl --version\n</code></pre></p> </li> <li> <p>Configuration file (remove sensitive keys)  </p> </li> <li> <p>Relevant log output </p> </li> <li> <p>Steps to reproduce the issue </p> </li> </ol>"}]}
\ No newline at end of file
diff --git a/dev/swagger.json b/dev/swagger.json
index 25cf87d..e8d6e49 100644
--- a/dev/swagger.json
+++ b/dev/swagger.json
@@ -2056,20 +2056,19 @@
         "server.CreateKeyRequest": {
             "type": "object",
             "properties": {
-                "expiresAt": {
-                    "type": "integer",
-                    "format": "int64"
+                "expires_at": {
+                    "type": "integer"
                 },
-                "instancePermissions": {
+                "instance_ids": {
                     "type": "array",
                     "items": {
-                        "$ref": "#/definitions/server.InstancePermission"
+                        "type": "integer"
                     }
                 },
                 "name": {
                     "type": "string"
                 },
-                "permissionMode": {
+                "permission_mode": {
                     "$ref": "#/definitions/auth.PermissionMode"
                 }
             }
@@ -2080,9 +2079,6 @@
                 "created_at": {
                     "type": "integer"
                 },
-                "enabled": {
-                    "type": "boolean"
-                },
                 "expires_at": {
                     "type": "integer"
                 },
@@ -2109,29 +2105,9 @@
                 }
             }
         },
-        "server.InstancePermission": {
-            "type": "object",
-            "properties": {
-                "can_infer": {
-                    "type": "boolean"
-                },
-                "can_view_logs": {
-                    "type": "boolean"
-                },
-                "instance_id": {
-                    "type": "integer"
-                }
-            }
-        },
         "server.KeyPermissionResponse": {
             "type": "object",
             "properties": {
-                "can_infer": {
-                    "type": "boolean"
-                },
-                "can_view_logs": {
-                    "type": "boolean"
-                },
                 "instance_id": {
                     "type": "integer"
                 },
@@ -2146,9 +2122,6 @@
                 "created_at": {
                     "type": "integer"
                 },
-                "enabled": {
-                    "type": "boolean"
-                },
                 "expires_at": {
                     "type": "integer"
                 },
diff --git a/dev/swagger.yaml b/dev/swagger.yaml
index 8143bc3..f464bee 100644
--- a/dev/swagger.yaml
+++ b/dev/swagger.yaml
@@ -232,24 +232,21 @@ definitions:
     type: object
   server.CreateKeyRequest:
     properties:
-      expiresAt:
-        format: int64
+      expires_at:
         type: integer
-      instancePermissions:
+      instance_ids:
         items:
-          $ref: '#/definitions/server.InstancePermission'
+          type: integer
         type: array
       name:
         type: string
-      permissionMode:
+      permission_mode:
         $ref: '#/definitions/auth.PermissionMode'
     type: object
   server.CreateKeyResponse:
     properties:
       created_at:
         type: integer
-      enabled:
-        type: boolean
       expires_at:
         type: integer
       id:
@@ -267,21 +264,8 @@ definitions:
       user_id:
         type: string
     type: object
-  server.InstancePermission:
-    properties:
-      can_infer:
-        type: boolean
-      can_view_logs:
-        type: boolean
-      instance_id:
-        type: integer
-    type: object
   server.KeyPermissionResponse:
     properties:
-      can_infer:
-        type: boolean
-      can_view_logs:
-        type: boolean
       instance_id:
         type: integer
       instance_name:
@@ -291,8 +275,6 @@ definitions:
     properties:
       created_at:
         type: integer
-      enabled:
-        type: boolean
       expires_at:
         type: integer
       id:
diff --git a/dev/troubleshooting/index.html b/dev/troubleshooting/index.html
index 3086f28..11485b5 100644
--- a/dev/troubleshooting/index.html
+++ b/dev/troubleshooting/index.html
@@ -880,21 +880,18 @@
 <a id="__codelineno-5-3" name="__codelineno-5-3" href="#__codelineno-5-3"></a><span class="w">  </span><span class="nt">require_inference_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">false</span>
 </code></pre></div></p>
 <ol>
-<li>
-<p><strong>Configure API keys:</strong><br />
+<li><strong>Configure management API keys:</strong><br />
    <div class="highlight"><pre><span></span><code><a id="__codelineno-6-1" name="__codelineno-6-1" href="#__codelineno-6-1"></a><span class="nt">auth</span><span class="p">:</span>
 <a id="__codelineno-6-2" name="__codelineno-6-2" href="#__codelineno-6-2"></a><span class="w">  </span><span class="nt">management_keys</span><span class="p">:</span>
 <a id="__codelineno-6-3" name="__codelineno-6-3" href="#__codelineno-6-3"></a><span class="w">    </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="s">&quot;your-management-key&quot;</span>
-<a id="__codelineno-6-4" name="__codelineno-6-4" href="#__codelineno-6-4"></a><span class="w">  </span><span class="nt">inference_keys</span><span class="p">:</span>
-<a id="__codelineno-6-5" name="__codelineno-6-5" href="#__codelineno-6-5"></a><span class="w">    </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="s">&quot;your-inference-key&quot;</span>
-</code></pre></div></p>
-</li>
-<li>
-<p><strong>Use correct Authorization header:</strong><br />
+</code></pre></div></li>
+</ol>
+<p>For inference API keys, create them via the web UI (Settings → API Keys) after logging in with your management key.  </p>
+<ol>
+<li><strong>Use correct Authorization header:</strong><br />
    <div class="highlight"><pre><span></span><code><a id="__codelineno-7-1" name="__codelineno-7-1" href="#__codelineno-7-1"></a>curl<span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer your-api-key&quot;</span><span class="w"> </span><span class="se">\</span>
 <a id="__codelineno-7-2" name="__codelineno-7-2" href="#__codelineno-7-2"></a><span class="w">  </span>http://localhost:8080/api/v1/instances
-</code></pre></div></p>
-</li>
+</code></pre></div></li>
 </ol>
 <h2 id="remote-node-issues">Remote Node Issues<a class="headerlink" href="#remote-node-issues" title="Permanent link">&para;</a></h2>
 <h3 id="node-configuration">Node Configuration<a class="headerlink" href="#node-configuration" title="Permanent link">&para;</a></h3>
@@ -972,7 +969,7 @@
     <span class="md-icon" title="Last update">
       <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1zM12.5 7v5.2l4 2.4-1 1L11 13V7zM11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2z"/></svg>
     </span>
-    <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="October 26, 2025 16:19:53 UTC">October 26, 2025</span>
+    <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="December 8, 2025 18:15:42 UTC">December 8, 2025</span>
   </span>