Deployed ebc82c3 to dev with MkDocs 1.5.3 and mike 2.0.0
(mirror of https://github.com/lordmathis/llamactl.git)
[Theme navigation updated in two places: new sidebar entries for Backend-Specific Endpoints → Parse Commands (Parse Llama.cpp Command, Parse MLX-LM Command, Parse vLLM Command) and Auto-Generated Documentation.]
**Response** (start instance):

```json
{
  "name": "llama2-7b",
  "status": "running",
  "created": 1705312200
}
```
**Response** (stop instance):

```json
{
  "name": "llama2-7b",
  "status": "stopped",
  "created": 1705312200
}
```
**Response** (restart instance):

```json
{
  "name": "llama2-7b",
  "status": "running",
  "created": 1705312200
}
```
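These responses come from the instance lifecycle endpoints. A minimal sketch of driving them with curl, assuming routes of the form `/api/v1/instances/{name}/<action>` (the exact paths are not shown in this excerpt; the prefix is assumed from the `/api/v1/...` pattern used by the parse endpoints below):

```bash
# Start, stop, and restart an instance; each call returns the
# corresponding JSON shown above. Paths are assumed, not confirmed here.
curl -X POST http://localhost:8080/api/v1/instances/llama2-7b/start
curl -X POST http://localhost:8080/api/v1/instances/llama2-7b/stop
curl -X POST http://localhost:8080/api/v1/instances/llama2-7b/restart
```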
- `503 Service Unavailable`: Instance is not running and on-demand start is disabled
- `409 Conflict`: Cannot start instance due to maximum instances limit

## Instance Status Values

Instances can have the following status values:

- `stopped`: Instance is not running
- `running`: Instance is running and ready to accept requests
- `failed`: Instance failed to start or crashed
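The current status can be read back from the instance resource after any lifecycle call; a minimal sketch, assuming a GET on the same instance route (hypothetical, not shown in this excerpt):

```bash
# Read back instance details; the response includes the "status" field
# with one of the values listed above. Route assumed, not confirmed here.
curl http://localhost:8080/api/v1/instances/llama2-7b
```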
<h2 id="error-responses">Error Responses<a class="headerlink" href="#error-responses" title="Permanent link">¶</a></h2>
|
||||
<p>All endpoints may return error responses in the following format:</p>
|
||||
@@ -1515,9 +1647,76 @@
|
||||
<a id="__codelineno-32-7" name="__codelineno-32-7" href="#__codelineno-32-7"></a><span class="s1"> "n_predict": 50</span>
|
||||
<a id="__codelineno-32-8" name="__codelineno-32-8" href="#__codelineno-32-8"></a><span class="s1"> }'</span>
|
||||
</code></pre></div>
|
||||
<h2 id="backend-specific-endpoints">Backend-Specific Endpoints<a class="headerlink" href="#backend-specific-endpoints" title="Permanent link">¶</a></h2>
|
||||
<h3 id="parse-commands">Parse Commands<a class="headerlink" href="#parse-commands" title="Permanent link">¶</a></h3>
|
||||
<p>Llamactl provides endpoints to parse command strings from different backends into instance configuration options.</p>
|
||||
<h4 id="parse-llamacpp-command">Parse Llama.cpp Command<a class="headerlink" href="#parse-llamacpp-command" title="Permanent link">¶</a></h4>
|
||||
<p>Parse a llama-server command string into instance options.</p>
|
||||
<div class="highlight"><pre><span></span><code><a id="__codelineno-33-1" name="__codelineno-33-1" href="#__codelineno-33-1"></a><span class="err">POST /api/v1/backends/llama-cpp/parse-command</span>
|
||||
</code></pre></div>
|
||||
<p><strong>Request Body:</strong>
|
||||
<div class="highlight"><pre><span></span><code><a id="__codelineno-34-1" name="__codelineno-34-1" href="#__codelineno-34-1"></a><span class="p">{</span>
|
||||
<a id="__codelineno-34-2" name="__codelineno-34-2" href="#__codelineno-34-2"></a><span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"llama-server -m /path/to/model.gguf -c 2048 --port 8080"</span>
|
||||
<a id="__codelineno-34-3" name="__codelineno-34-3" href="#__codelineno-34-3"></a><span class="p">}</span>
|
||||
</code></pre></div></p>
|
||||
<p><strong>Response:</strong>
|
||||
<div class="highlight"><pre><span></span><code><a id="__codelineno-35-1" name="__codelineno-35-1" href="#__codelineno-35-1"></a><span class="p">{</span>
|
||||
<a id="__codelineno-35-2" name="__codelineno-35-2" href="#__codelineno-35-2"></a><span class="w"> </span><span class="nt">"backend_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"llama_cpp"</span><span class="p">,</span>
|
||||
<a id="__codelineno-35-3" name="__codelineno-35-3" href="#__codelineno-35-3"></a><span class="w"> </span><span class="nt">"llama_server_options"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||
<a id="__codelineno-35-4" name="__codelineno-35-4" href="#__codelineno-35-4"></a><span class="w"> </span><span class="nt">"model"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/model.gguf"</span><span class="p">,</span>
|
||||
<a id="__codelineno-35-5" name="__codelineno-35-5" href="#__codelineno-35-5"></a><span class="w"> </span><span class="nt">"ctx_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
|
||||
<a id="__codelineno-35-6" name="__codelineno-35-6" href="#__codelineno-35-6"></a><span class="w"> </span><span class="nt">"port"</span><span class="p">:</span><span class="w"> </span><span class="mi">8080</span>
|
||||
<a id="__codelineno-35-7" name="__codelineno-35-7" href="#__codelineno-35-7"></a><span class="w"> </span><span class="p">}</span>
|
||||
<a id="__codelineno-35-8" name="__codelineno-35-8" href="#__codelineno-35-8"></a><span class="p">}</span>
|
||||
</code></pre></div></p>
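As a usage sketch, the parse endpoint can be exercised directly with curl (host and port taken from the examples in this document):

```bash
# Ask llamactl to translate a llama-server command line into
# instance configuration options.
curl -X POST http://localhost:8080/api/v1/backends/llama-cpp/parse-command \
  -H "Content-Type: application/json" \
  -d '{
    "command": "llama-server -m /path/to/model.gguf -c 2048 --port 8080"
  }'
```

The MLX and vLLM endpoints below accept the same `{"command": "..."}` request shape.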
#### Parse MLX-LM Command

Parse an MLX-LM server command string into instance options.

```
POST /api/v1/backends/mlx/parse-command
```

**Request Body:**

```json
{
  "command": "mlx_lm.server --model /path/to/model --port 8080"
}
```

**Response:**

```json
{
  "backend_type": "mlx_lm",
  "mlx_server_options": {
    "model": "/path/to/model",
    "port": 8080
  }
}
```
#### Parse vLLM Command

Parse a vLLM serve command string into instance options.

```
POST /api/v1/backends/vllm/parse-command
```

**Request Body:**

```json
{
  "command": "vllm serve /path/to/model --port 8080"
}
```

**Response:**

```json
{
  "backend_type": "vllm",
  "vllm_server_options": {
    "model": "/path/to/model",
    "port": 8080
  }
}
```
**Error Responses for Parse Commands:**

- `400 Bad Request`: Invalid request body, empty command, or parse error
- `500 Internal Server Error`: Encoding error
<h2 id="auto-generated-documentation">Auto-Generated Documentation<a class="headerlink" href="#auto-generated-documentation" title="Permanent link">¶</a></h2>
|
||||
<p>The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:</p>
|
||||
<ol>
|
||||
<li>Install the swag tool: <code>go install github.com/swaggo/swag/cmd/swag@latest</code></li>
|
||||
<li>Generate docs: <code>swag init -g cmd/server/main.go -o apidocs</code></li>
|
||||
</ol>
|
||||
<h2 id="swagger-documentation">Swagger Documentation<a class="headerlink" href="#swagger-documentation" title="Permanent link">¶</a></h2>
|
||||
<p>If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:</p>
|
||||
<div class="highlight"><pre><span></span><code><a id="__codelineno-33-1" name="__codelineno-33-1" href="#__codelineno-33-1"></a>http://localhost:8080/swagger/
|
||||
<div class="highlight"><pre><span></span><code><a id="__codelineno-42-1" name="__codelineno-42-1" href="#__codelineno-42-1"></a>http://localhost:8080/swagger/
|
||||
</code></pre></div>
|
||||
<p>This provides a complete interactive interface for testing all API endpoints.</p>
|
||||
|
||||
# Managing Instances

Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.

## Overview

Llamactl provides two ways to manage instances: the Web UI and the API.
Creating an instance via the Web UI walks through these choices:

- **Choose Backend Type**:
    - **llama.cpp**: For GGUF models using llama-server
    - **MLX**: For MLX-optimized models (macOS only)
    - **vLLM**: For distributed serving and high-throughput inference
- Configure model source:
    - **For llama.cpp**: GGUF model path or HuggingFace repo
    - **For MLX**: MLX model path or identifier (e.g., `mlx-community/Mistral-7B-Instruct-v0.3-4bit`)
    - **For vLLM**: HuggingFace model identifier (e.g., `microsoft/DialoGPT-medium`)
- Configure optional instance management settings such as auto restart, max restarts, and on-demand start
- Configure backend-specific options:
    - **llama.cpp**: Threads, context size, GPU layers, port, etc.
    - **MLX**: Temperature, top-p, adapter path, Python environment, etc.
    - **vLLM**: Tensor parallel size, GPU memory utilization, quantization, etc.
- Click **"Create"** to save the instance
```bash
# Create vLLM instance
curl -X POST http://localhost:8080/api/instances/my-vllm-instance \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
      "model": "microsoft/DialoGPT-medium",
      "tensor_parallel_size": 2,
      "gpu_memory_utilization": 0.9
    },
    "auto_restart": true,
    "on_demand_start": true
  }'

# Create llama.cpp instance with HuggingFace model
curl -X POST http://localhost:8080/api/instances/gemma-3-27b \
  -H "Content-Type: application/json" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "hf_repo": "unsloth/gemma-3-27b-it-GGUF",
      "hf_file": "gemma-3-27b-it-GGUF.gguf",
      "gpu_layers": 32
    }
  }'
```
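To confirm creation, the collection can be read back; a minimal sketch, assuming a list route alongside the per-instance routes above (hypothetical, not shown in this excerpt):

```bash
# List all managed instances. Route assumed by analogy with
# /api/instances/{name}; adjust if your deployment differs.
curl http://localhost:8080/api/instances
```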
## Start Instance

### Via Web UI
## Delete Instance

```bash
curl -X DELETE http://localhost:8080/api/instances/{name}
```

## Instance Proxy

Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).

```bash
# Get instance details
curl http://localhost:8080/api/instances/{name}/proxy/
```

All backends provide OpenAI-compatible endpoints. Check the respective documentation:

- [llama-server docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
- [MLX-LM docs](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md)
- [vLLM docs](https://docs.vllm.ai/en/latest/)
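Because the proxy forwards OpenAI-compatible traffic, a chat completion can be addressed to an instance by name; a minimal sketch, assuming an instance named `my-vllm-instance` and the backend's standard `/v1/chat/completions` route behind the proxy path shown above:

```bash
# Send an OpenAI-style chat completion through the llamactl proxy.
# The route below the /proxy/ prefix is the backend's own API.
curl -X POST http://localhost:8080/api/instances/my-vllm-instance/proxy/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```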
<h3 id="instance-health">Instance Health<a class="headerlink" href="#instance-health" title="Permanent link">¶</a></h3>
|
||||
<h4 id="via-web-ui_6">Via Web UI<a class="headerlink" href="#via-web-ui_6" title="Permanent link">¶</a></h4>
|
||||
<ol>
|
||||