Deployed ebc82c3 to dev with MkDocs 1.5.3 and mike 2.0.0

This commit is contained in:
lordmathis
2025-09-22 19:58:42 +00:00
parent 7296e304e7
commit a3dc1ca05a
8 changed files with 395 additions and 135 deletions

View File

@@ -853,28 +853,29 @@
<a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a><span class="nt">backends</span><span class="p">:</span> <a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a><span class="nt">backends</span><span class="p">:</span>
<a id="__codelineno-1-8" name="__codelineno-1-8" href="#__codelineno-1-8"></a><span class="w"> </span><span class="nt">llama_executable</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">llama-server</span><span class="w"> </span><span class="c1"># Path to llama-server executable</span> <a id="__codelineno-1-8" name="__codelineno-1-8" href="#__codelineno-1-8"></a><span class="w"> </span><span class="nt">llama_executable</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">llama-server</span><span class="w"> </span><span class="c1"># Path to llama-server executable</span>
<a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a><span class="w"> </span><span class="nt">mlx_lm_executable</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">mlx_lm.server</span><span class="w"> </span><span class="c1"># Path to mlx_lm.server executable</span> <a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a><span class="w"> </span><span class="nt">mlx_lm_executable</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">mlx_lm.server</span><span class="w"> </span><span class="c1"># Path to mlx_lm.server executable</span>
<a id="__codelineno-1-10" name="__codelineno-1-10" href="#__codelineno-1-10"></a> <a id="__codelineno-1-10" name="__codelineno-1-10" href="#__codelineno-1-10"></a><span class="w"> </span><span class="nt">vllm_executable</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">vllm</span><span class="w"> </span><span class="c1"># Path to vllm executable</span>
<a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a><span class="nt">instances</span><span class="p">:</span> <a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a>
<a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a><span class="w"> </span><span class="nt">port_range</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[</span><span class="nv">8000</span><span class="p p-Indicator">,</span><span class="w"> </span><span class="nv">9000</span><span class="p p-Indicator">]</span><span class="w"> </span><span class="c1"># Port range for instances</span> <a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a><span class="nt">instances</span><span class="p">:</span>
<a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a><span class="w"> </span><span class="nt">data_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">~/.local/share/llamactl</span><span class="w"> </span><span class="c1"># Data directory (platform-specific, see below)</span> <a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a><span class="w"> </span><span class="nt">port_range</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[</span><span class="nv">8000</span><span class="p p-Indicator">,</span><span class="w"> </span><span class="nv">9000</span><span class="p p-Indicator">]</span><span class="w"> </span><span class="c1"># Port range for instances</span>
<a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a><span class="w"> </span><span class="nt">configs_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">~/.local/share/llamactl/instances</span><span class="w"> </span><span class="c1"># Instance configs directory</span> <a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a><span class="w"> </span><span class="nt">data_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">~/.local/share/llamactl</span><span class="w"> </span><span class="c1"># Data directory (platform-specific, see below)</span>
<a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a><span class="w"> </span><span class="nt">logs_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">~/.local/share/llamactl/logs</span><span class="w"> </span><span class="c1"># Logs directory</span> <a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a><span class="w"> </span><span class="nt">configs_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">~/.local/share/llamactl/instances</span><span class="w"> </span><span class="c1"># Instance configs directory</span>
<a id="__codelineno-1-16" name="__codelineno-1-16" href="#__codelineno-1-16"></a><span class="w"> </span><span class="nt">auto_create_dirs</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Auto-create data/config/logs dirs if missing</span> <a id="__codelineno-1-16" name="__codelineno-1-16" href="#__codelineno-1-16"></a><span class="w"> </span><span class="nt">logs_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">~/.local/share/llamactl/logs</span><span class="w"> </span><span class="c1"># Logs directory</span>
<a id="__codelineno-1-17" name="__codelineno-1-17" href="#__codelineno-1-17"></a><span class="w"> </span><span class="nt">max_instances</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">-1</span><span class="w"> </span><span class="c1"># Max instances (-1 = unlimited)</span> <a id="__codelineno-1-17" name="__codelineno-1-17" href="#__codelineno-1-17"></a><span class="w"> </span><span class="nt">auto_create_dirs</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Auto-create data/config/logs dirs if missing</span>
<a id="__codelineno-1-18" name="__codelineno-1-18" href="#__codelineno-1-18"></a><span class="w"> </span><span class="nt">max_running_instances</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">-1</span><span class="w"> </span><span class="c1"># Max running instances (-1 = unlimited)</span> <a id="__codelineno-1-18" name="__codelineno-1-18" href="#__codelineno-1-18"></a><span class="w"> </span><span class="nt">max_instances</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">-1</span><span class="w"> </span><span class="c1"># Max instances (-1 = unlimited)</span>
<a id="__codelineno-1-19" name="__codelineno-1-19" href="#__codelineno-1-19"></a><span class="w"> </span><span class="nt">enable_lru_eviction</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Enable LRU eviction for idle instances</span> <a id="__codelineno-1-19" name="__codelineno-1-19" href="#__codelineno-1-19"></a><span class="w"> </span><span class="nt">max_running_instances</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">-1</span><span class="w"> </span><span class="c1"># Max running instances (-1 = unlimited)</span>
<a id="__codelineno-1-20" name="__codelineno-1-20" href="#__codelineno-1-20"></a><span class="w"> </span><span class="nt">default_auto_restart</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Auto-restart new instances by default</span> <a id="__codelineno-1-20" name="__codelineno-1-20" href="#__codelineno-1-20"></a><span class="w"> </span><span class="nt">enable_lru_eviction</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Enable LRU eviction for idle instances</span>
<a id="__codelineno-1-21" name="__codelineno-1-21" href="#__codelineno-1-21"></a><span class="w"> </span><span class="nt">default_max_restarts</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">3</span><span class="w"> </span><span class="c1"># Max restarts for new instances</span> <a id="__codelineno-1-21" name="__codelineno-1-21" href="#__codelineno-1-21"></a><span class="w"> </span><span class="nt">default_auto_restart</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Auto-restart new instances by default</span>
<a id="__codelineno-1-22" name="__codelineno-1-22" href="#__codelineno-1-22"></a><span class="w"> </span><span class="nt">default_restart_delay</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">5</span><span class="w"> </span><span class="c1"># Restart delay (seconds) for new instances</span> <a id="__codelineno-1-22" name="__codelineno-1-22" href="#__codelineno-1-22"></a><span class="w"> </span><span class="nt">default_max_restarts</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">3</span><span class="w"> </span><span class="c1"># Max restarts for new instances</span>
<a id="__codelineno-1-23" name="__codelineno-1-23" href="#__codelineno-1-23"></a><span class="w"> </span><span class="nt">default_on_demand_start</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Default on-demand start setting</span> <a id="__codelineno-1-23" name="__codelineno-1-23" href="#__codelineno-1-23"></a><span class="w"> </span><span class="nt">default_restart_delay</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">5</span><span class="w"> </span><span class="c1"># Restart delay (seconds) for new instances</span>
<a id="__codelineno-1-24" name="__codelineno-1-24" href="#__codelineno-1-24"></a><span class="w"> </span><span class="nt">on_demand_start_timeout</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">120</span><span class="w"> </span><span class="c1"># Default on-demand start timeout in seconds</span> <a id="__codelineno-1-24" name="__codelineno-1-24" href="#__codelineno-1-24"></a><span class="w"> </span><span class="nt">default_on_demand_start</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Default on-demand start setting</span>
<a id="__codelineno-1-25" name="__codelineno-1-25" href="#__codelineno-1-25"></a><span class="w"> </span><span class="nt">timeout_check_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">5</span><span class="w"> </span><span class="c1"># Idle instance timeout check in minutes</span> <a id="__codelineno-1-25" name="__codelineno-1-25" href="#__codelineno-1-25"></a><span class="w"> </span><span class="nt">on_demand_start_timeout</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">120</span><span class="w"> </span><span class="c1"># Default on-demand start timeout in seconds</span>
<a id="__codelineno-1-26" name="__codelineno-1-26" href="#__codelineno-1-26"></a> <a id="__codelineno-1-26" name="__codelineno-1-26" href="#__codelineno-1-26"></a><span class="w"> </span><span class="nt">timeout_check_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">5</span><span class="w"> </span><span class="c1"># Idle instance timeout check in minutes</span>
<a id="__codelineno-1-27" name="__codelineno-1-27" href="#__codelineno-1-27"></a><span class="nt">auth</span><span class="p">:</span> <a id="__codelineno-1-27" name="__codelineno-1-27" href="#__codelineno-1-27"></a>
<a id="__codelineno-1-28" name="__codelineno-1-28" href="#__codelineno-1-28"></a><span class="w"> </span><span class="nt">require_inference_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Require auth for inference endpoints</span> <a id="__codelineno-1-28" name="__codelineno-1-28" href="#__codelineno-1-28"></a><span class="nt">auth</span><span class="p">:</span>
<a id="__codelineno-1-29" name="__codelineno-1-29" href="#__codelineno-1-29"></a><span class="w"> </span><span class="nt">inference_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w"> </span><span class="c1"># Keys for inference endpoints</span> <a id="__codelineno-1-29" name="__codelineno-1-29" href="#__codelineno-1-29"></a><span class="w"> </span><span class="nt">require_inference_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Require auth for inference endpoints</span>
<a id="__codelineno-1-30" name="__codelineno-1-30" href="#__codelineno-1-30"></a><span class="w"> </span><span class="nt">require_management_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Require auth for management endpoints</span> <a id="__codelineno-1-30" name="__codelineno-1-30" href="#__codelineno-1-30"></a><span class="w"> </span><span class="nt">inference_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w"> </span><span class="c1"># Keys for inference endpoints</span>
<a id="__codelineno-1-31" name="__codelineno-1-31" href="#__codelineno-1-31"></a><span class="w"> </span><span class="nt">management_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w"> </span><span class="c1"># Keys for management endpoints</span> <a id="__codelineno-1-31" name="__codelineno-1-31" href="#__codelineno-1-31"></a><span class="w"> </span><span class="nt">require_management_auth</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span><span class="w"> </span><span class="c1"># Require auth for management endpoints</span>
<a id="__codelineno-1-32" name="__codelineno-1-32" href="#__codelineno-1-32"></a><span class="w"> </span><span class="nt">management_keys</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[]</span><span class="w"> </span><span class="c1"># Keys for management endpoints</span>
</code></pre></div> </code></pre></div>
<h2 id="configuration-files">Configuration Files<a class="headerlink" href="#configuration-files" title="Permanent link">&para;</a></h2> <h2 id="configuration-files">Configuration Files<a class="headerlink" href="#configuration-files" title="Permanent link">&para;</a></h2>
<h3 id="configuration-file-locations">Configuration File Locations<a class="headerlink" href="#configuration-file-locations" title="Permanent link">&para;</a></h3> <h3 id="configuration-file-locations">Configuration File Locations<a class="headerlink" href="#configuration-file-locations" title="Permanent link">&para;</a></h3>
@@ -910,10 +911,12 @@
<div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="nt">backends</span><span class="p">:</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="nt">backends</span><span class="p">:</span>
<a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a><span class="w"> </span><span class="nt">llama_executable</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;llama-server&quot;</span><span class="w"> </span><span class="c1"># Path to llama-server executable (default: &quot;llama-server&quot;)</span> <a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a><span class="w"> </span><span class="nt">llama_executable</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;llama-server&quot;</span><span class="w"> </span><span class="c1"># Path to llama-server executable (default: &quot;llama-server&quot;)</span>
<a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a><span class="w"> </span><span class="nt">mlx_lm_executable</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;mlx_lm.server&quot;</span><span class="w"> </span><span class="c1"># Path to mlx_lm.server executable (default: &quot;mlx_lm.server&quot;)</span> <a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a><span class="w"> </span><span class="nt">mlx_lm_executable</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;mlx_lm.server&quot;</span><span class="w"> </span><span class="c1"># Path to mlx_lm.server executable (default: &quot;mlx_lm.server&quot;)</span>
<a id="__codelineno-3-4" name="__codelineno-3-4" href="#__codelineno-3-4"></a><span class="w"> </span><span class="nt">vllm_executable</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;vllm&quot;</span><span class="w"> </span><span class="c1"># Path to vllm executable (default: &quot;vllm&quot;)</span>
</code></pre></div> </code></pre></div>
<p><strong>Environment Variables:</strong> <p><strong>Environment Variables:</strong>
- <code>LLAMACTL_LLAMA_EXECUTABLE</code> - Path to llama-server executable - <code>LLAMACTL_LLAMA_EXECUTABLE</code> - Path to llama-server executable
- <code>LLAMACTL_MLX_LM_EXECUTABLE</code> - Path to mlx_lm.server executable</p> - <code>LLAMACTL_MLX_LM_EXECUTABLE</code> - Path to mlx_lm.server executable
- <code>LLAMACTL_VLLM_EXECUTABLE</code> - Path to vllm executable</p>
<h3 id="instance-configuration">Instance Configuration<a class="headerlink" href="#instance-configuration" title="Permanent link">&para;</a></h3> <h3 id="instance-configuration">Instance Configuration<a class="headerlink" href="#instance-configuration" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a><span class="nt">instances</span><span class="p">:</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a><span class="nt">instances</span><span class="p">:</span>
<a id="__codelineno-4-2" name="__codelineno-4-2" href="#__codelineno-4-2"></a><span class="w"> </span><span class="nt">port_range</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[</span><span class="nv">8000</span><span class="p p-Indicator">,</span><span class="w"> </span><span class="nv">9000</span><span class="p p-Indicator">]</span><span class="w"> </span><span class="c1"># Port range for instances (default: [8000, 9000])</span> <a id="__codelineno-4-2" name="__codelineno-4-2" href="#__codelineno-4-2"></a><span class="w"> </span><span class="nt">port_range</span><span class="p">:</span><span class="w"> </span><span class="p p-Indicator">[</span><span class="nv">8000</span><span class="p p-Indicator">,</span><span class="w"> </span><span class="nv">9000</span><span class="p p-Indicator">]</span><span class="w"> </span><span class="c1"># Port range for instances (default: [8000, 9000])</span>
@@ -983,7 +986,7 @@
<span class="md-icon" title="Last update"> <span class="md-icon" title="Last update">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg>
</span> </span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 18, 2025</span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 21, 2025</span>
</span> </span>

View File

@@ -825,18 +825,30 @@
<a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a>pip<span class="w"> </span>install<span class="w"> </span>mlx-lm <a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a>pip<span class="w"> </span>install<span class="w"> </span>mlx-lm
</code></pre></div> </code></pre></div>
<p>Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)</p> <p>Note: MLX backend is only available on macOS with Apple Silicon (M1, M2, M3, etc.)</p>
<p><strong>For vLLM backend:</strong></p>
<p>vLLM provides high-throughput distributed serving for LLMs. Install vLLM:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a><span class="c1"># Install via pip (requires Python 3.8+, GPU required)</span>
<a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a>pip<span class="w"> </span>install<span class="w"> </span>vllm
<a id="__codelineno-2-3" name="__codelineno-2-3" href="#__codelineno-2-3"></a>
<a id="__codelineno-2-4" name="__codelineno-2-4" href="#__codelineno-2-4"></a><span class="c1"># Or in a virtual environment (recommended)</span>
<a id="__codelineno-2-5" name="__codelineno-2-5" href="#__codelineno-2-5"></a>python<span class="w"> </span>-m<span class="w"> </span>venv<span class="w"> </span>vllm-env
<a id="__codelineno-2-6" name="__codelineno-2-6" href="#__codelineno-2-6"></a><span class="nb">source</span><span class="w"> </span>vllm-env/bin/activate
<a id="__codelineno-2-7" name="__codelineno-2-7" href="#__codelineno-2-7"></a>pip<span class="w"> </span>install<span class="w"> </span>vllm
<a id="__codelineno-2-8" name="__codelineno-2-8" href="#__codelineno-2-8"></a>
<a id="__codelineno-2-9" name="__codelineno-2-9" href="#__codelineno-2-9"></a><span class="c1"># For production deployments, consider container-based installation</span>
</code></pre></div>
<h2 id="installation-methods">Installation Methods<a class="headerlink" href="#installation-methods" title="Permanent link">&para;</a></h2> <h2 id="installation-methods">Installation Methods<a class="headerlink" href="#installation-methods" title="Permanent link">&para;</a></h2>
<h3 id="option-1-download-binary-recommended">Option 1: Download Binary (Recommended)<a class="headerlink" href="#option-1-download-binary-recommended" title="Permanent link">&para;</a></h3> <h3 id="option-1-download-binary-recommended">Option 1: Download Binary (Recommended)<a class="headerlink" href="#option-1-download-binary-recommended" title="Permanent link">&para;</a></h3>
<p>Download the latest release from the <a href="https://github.com/lordmathis/llamactl/releases">GitHub releases page</a>:</p> <p>Download the latest release from the <a href="https://github.com/lordmathis/llamactl/releases">GitHub releases page</a>:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a><span class="c1"># Linux/macOS - Get latest version and download</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="c1"># Linux/macOS - Get latest version and download</span>
<a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a><span class="nv">LATEST_VERSION</span><span class="o">=</span><span class="k">$(</span>curl<span class="w"> </span>-s<span class="w"> </span>https://api.github.com/repos/lordmathis/llamactl/releases/latest<span class="w"> </span><span class="p">|</span><span class="w"> </span>grep<span class="w"> </span><span class="s1">&#39;&quot;tag_name&quot;:&#39;</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>sed<span class="w"> </span>-E<span class="w"> </span><span class="s1">&#39;s/.*&quot;([^&quot;]+)&quot;.*/\1/&#39;</span><span class="k">)</span> <a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a><span class="nv">LATEST_VERSION</span><span class="o">=</span><span class="k">$(</span>curl<span class="w"> </span>-s<span class="w"> </span>https://api.github.com/repos/lordmathis/llamactl/releases/latest<span class="w"> </span><span class="p">|</span><span class="w"> </span>grep<span class="w"> </span><span class="s1">&#39;&quot;tag_name&quot;:&#39;</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>sed<span class="w"> </span>-E<span class="w"> </span><span class="s1">&#39;s/.*&quot;([^&quot;]+)&quot;.*/\1/&#39;</span><span class="k">)</span>
<a id="__codelineno-2-3" name="__codelineno-2-3" href="#__codelineno-2-3"></a>curl<span class="w"> </span>-L<span class="w"> </span>https://github.com/lordmathis/llamactl/releases/download/<span class="si">${</span><span class="nv">LATEST_VERSION</span><span class="si">}</span>/llamactl-<span class="si">${</span><span class="nv">LATEST_VERSION</span><span class="si">}</span>-<span class="k">$(</span>uname<span class="w"> </span>-s<span class="w"> </span><span class="p">|</span><span class="w"> </span>tr<span class="w"> </span><span class="s1">&#39;[:upper:]&#39;</span><span class="w"> </span><span class="s1">&#39;[:lower:]&#39;</span><span class="k">)</span>-<span class="k">$(</span>uname<span class="w"> </span>-m<span class="k">)</span>.tar.gz<span class="w"> </span><span class="p">|</span><span class="w"> </span>tar<span class="w"> </span>-xz <a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a>curl<span class="w"> </span>-L<span class="w"> </span>https://github.com/lordmathis/llamactl/releases/download/<span class="si">${</span><span class="nv">LATEST_VERSION</span><span class="si">}</span>/llamactl-<span class="si">${</span><span class="nv">LATEST_VERSION</span><span class="si">}</span>-<span class="k">$(</span>uname<span class="w"> </span>-s<span class="w"> </span><span class="p">|</span><span class="w"> </span>tr<span class="w"> </span><span class="s1">&#39;[:upper:]&#39;</span><span class="w"> </span><span class="s1">&#39;[:lower:]&#39;</span><span class="k">)</span>-<span class="k">$(</span>uname<span class="w"> </span>-m<span class="k">)</span>.tar.gz<span class="w"> </span><span class="p">|</span><span class="w"> </span>tar<span class="w"> </span>-xz
<a id="__codelineno-2-4" name="__codelineno-2-4" href="#__codelineno-2-4"></a>sudo<span class="w"> </span>mv<span class="w"> </span>llamactl<span class="w"> </span>/usr/local/bin/ <a id="__codelineno-3-4" name="__codelineno-3-4" href="#__codelineno-3-4"></a>sudo<span class="w"> </span>mv<span class="w"> </span>llamactl<span class="w"> </span>/usr/local/bin/
<a id="__codelineno-2-5" name="__codelineno-2-5" href="#__codelineno-2-5"></a> <a id="__codelineno-3-5" name="__codelineno-3-5" href="#__codelineno-3-5"></a>
<a id="__codelineno-2-6" name="__codelineno-2-6" href="#__codelineno-2-6"></a><span class="c1"># Or download manually from:</span> <a id="__codelineno-3-6" name="__codelineno-3-6" href="#__codelineno-3-6"></a><span class="c1"># Or download manually from:</span>
<a id="__codelineno-2-7" name="__codelineno-2-7" href="#__codelineno-2-7"></a><span class="c1"># https://github.com/lordmathis/llamactl/releases/latest</span> <a id="__codelineno-3-7" name="__codelineno-3-7" href="#__codelineno-3-7"></a><span class="c1"># https://github.com/lordmathis/llamactl/releases/latest</span>
<a id="__codelineno-2-8" name="__codelineno-2-8" href="#__codelineno-2-8"></a> <a id="__codelineno-3-8" name="__codelineno-3-8" href="#__codelineno-3-8"></a>
<a id="__codelineno-2-9" name="__codelineno-2-9" href="#__codelineno-2-9"></a><span class="c1"># Windows - Download from releases page</span> <a id="__codelineno-3-9" name="__codelineno-3-9" href="#__codelineno-3-9"></a><span class="c1"># Windows - Download from releases page</span>
</code></pre></div> </code></pre></div>
<h3 id="option-2-build-from-source">Option 2: Build from Source<a class="headerlink" href="#option-2-build-from-source" title="Permanent link">&para;</a></h3> <h3 id="option-2-build-from-source">Option 2: Build from Source<a class="headerlink" href="#option-2-build-from-source" title="Permanent link">&para;</a></h3>
<p>Requirements: <p>Requirements:
@@ -844,19 +856,19 @@
- Node.js 22 or later - Node.js 22 or later
- Git</p> - Git</p>
<p>If you prefer to build from source:</p> <p>If you prefer to build from source:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="c1"># Clone the repository</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a><span class="c1"># Clone the repository</span>
<a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/lordmathis/llamactl.git <a id="__codelineno-4-2" name="__codelineno-4-2" href="#__codelineno-4-2"></a>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/lordmathis/llamactl.git
<a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a><span class="nb">cd</span><span class="w"> </span>llamactl <a id="__codelineno-4-3" name="__codelineno-4-3" href="#__codelineno-4-3"></a><span class="nb">cd</span><span class="w"> </span>llamactl
<a id="__codelineno-3-4" name="__codelineno-3-4" href="#__codelineno-3-4"></a> <a id="__codelineno-4-4" name="__codelineno-4-4" href="#__codelineno-4-4"></a>
<a id="__codelineno-3-5" name="__codelineno-3-5" href="#__codelineno-3-5"></a><span class="c1"># Build the web UI</span> <a id="__codelineno-4-5" name="__codelineno-4-5" href="#__codelineno-4-5"></a><span class="c1"># Build the web UI</span>
<a id="__codelineno-3-6" name="__codelineno-3-6" href="#__codelineno-3-6"></a><span class="nb">cd</span><span class="w"> </span>webui<span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span>npm<span class="w"> </span>ci<span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span>npm<span class="w"> </span>run<span class="w"> </span>build<span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span><span class="nb">cd</span><span class="w"> </span>.. <a id="__codelineno-4-6" name="__codelineno-4-6" href="#__codelineno-4-6"></a><span class="nb">cd</span><span class="w"> </span>webui<span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span>npm<span class="w"> </span>ci<span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span>npm<span class="w"> </span>run<span class="w"> </span>build<span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span><span class="nb">cd</span><span class="w"> </span>..
<a id="__codelineno-3-7" name="__codelineno-3-7" href="#__codelineno-3-7"></a> <a id="__codelineno-4-7" name="__codelineno-4-7" href="#__codelineno-4-7"></a>
<a id="__codelineno-3-8" name="__codelineno-3-8" href="#__codelineno-3-8"></a><span class="c1"># Build the application</span> <a id="__codelineno-4-8" name="__codelineno-4-8" href="#__codelineno-4-8"></a><span class="c1"># Build the application</span>
<a id="__codelineno-3-9" name="__codelineno-3-9" href="#__codelineno-3-9"></a>go<span class="w"> </span>build<span class="w"> </span>-o<span class="w"> </span>llamactl<span class="w"> </span>./cmd/server <a id="__codelineno-4-9" name="__codelineno-4-9" href="#__codelineno-4-9"></a>go<span class="w"> </span>build<span class="w"> </span>-o<span class="w"> </span>llamactl<span class="w"> </span>./cmd/server
</code></pre></div> </code></pre></div>
<h2 id="verification">Verification<a class="headerlink" href="#verification" title="Permanent link">&para;</a></h2> <h2 id="verification">Verification<a class="headerlink" href="#verification" title="Permanent link">&para;</a></h2>
<p>Verify your installation by checking the version:</p> <p>Verify your installation by checking the version:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a>llamactl<span class="w"> </span>--version <div class="highlight"><pre><span></span><code><a id="__codelineno-5-1" name="__codelineno-5-1" href="#__codelineno-5-1"></a>llamactl<span class="w"> </span>--version
</code></pre></div> </code></pre></div>
<h2 id="next-steps">Next Steps<a class="headerlink" href="#next-steps" title="Permanent link">&para;</a></h2> <h2 id="next-steps">Next Steps<a class="headerlink" href="#next-steps" title="Permanent link">&para;</a></h2>
<p>Now that Llamactl is installed, continue to the <a href="../quick-start/">Quick Start</a> guide to get your first instance running!</p> <p>Now that Llamactl is installed, continue to the <a href="../quick-start/">Quick Start</a> guide to get your first instance running!</p>
@@ -880,7 +892,7 @@
<span class="md-icon" title="Last update"> <span class="md-icon" title="Last update">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg>
</span> </span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 18, 2025</span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 21, 2025</span>
</span> </span>

View File

@@ -495,9 +495,9 @@
</li> </li>
<li class="md-nav__item"> <li class="md-nav__item">
<a href="#example-configuration" class="md-nav__link"> <a href="#example-configurations" class="md-nav__link">
<span class="md-ellipsis"> <span class="md-ellipsis">
Example Configuration Example Configurations
</span> </span>
</a> </a>
@@ -775,9 +775,9 @@
</li> </li>
<li class="md-nav__item"> <li class="md-nav__item">
<a href="#example-configuration" class="md-nav__link"> <a href="#example-configurations" class="md-nav__link">
<span class="md-ellipsis"> <span class="md-ellipsis">
Example Configuration Example Configurations
</span> </span>
</a> </a>
@@ -879,9 +879,10 @@
<li>Click the "Add Instance" button</li> <li>Click the "Add Instance" button</li>
<li>Fill in the instance configuration:</li> <li>Fill in the instance configuration:</li>
<li><strong>Name</strong>: Give your instance a descriptive name</li> <li><strong>Name</strong>: Give your instance a descriptive name</li>
<li><strong>Model Path</strong>: Path to your Llama.cpp model file</li> <li><strong>Backend Type</strong>: Choose from llama.cpp, MLX, or vLLM</li>
<li><strong>Model</strong>: Model path or identifier for your chosen backend</li>
<li> <li>
<p><strong>Additional Options</strong>: Any extra Llama.cpp parameters</p> <p><strong>Additional Options</strong>: Backend-specific parameters</p>
</li> </li>
<li> <li>
<p>Click "Create Instance"</p> <p>Click "Create Instance"</p>
@@ -895,76 +896,103 @@
<li><strong>View logs</strong> by clicking the logs button</li> <li><strong>View logs</strong> by clicking the logs button</li>
<li><strong>Stop</strong> the instance when needed</li> <li><strong>Stop</strong> the instance when needed</li>
</ul> </ul>
<h2 id="example-configuration">Example Configuration<a class="headerlink" href="#example-configuration" title="Permanent link">&para;</a></h2> <h2 id="example-configurations">Example Configurations<a class="headerlink" href="#example-configurations" title="Permanent link">&para;</a></h2>
<p>Here's a basic example configuration for a Llama 2 model:</p> <p>Here are basic example configurations for each backend:</p>
<p><strong>llama.cpp backend:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a><span class="p">{</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a><span class="p">{</span>
<a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span> <a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span>
<a id="__codelineno-2-3" name="__codelineno-2-3" href="#__codelineno-2-3"></a><span class="w"> </span><span class="nt">&quot;model_path&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;/path/to/llama-2-7b-chat.gguf&quot;</span><span class="p">,</span> <a id="__codelineno-2-3" name="__codelineno-2-3" href="#__codelineno-2-3"></a><span class="w"> </span><span class="nt">&quot;backend_type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama_cpp&quot;</span><span class="p">,</span>
<a id="__codelineno-2-4" name="__codelineno-2-4" href="#__codelineno-2-4"></a><span class="w"> </span><span class="nt">&quot;options&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span> <a id="__codelineno-2-4" name="__codelineno-2-4" href="#__codelineno-2-4"></a><span class="w"> </span><span class="nt">&quot;backend_options&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<a id="__codelineno-2-5" name="__codelineno-2-5" href="#__codelineno-2-5"></a><span class="w"> </span><span class="nt">&quot;threads&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span> <a id="__codelineno-2-5" name="__codelineno-2-5" href="#__codelineno-2-5"></a><span class="w"> </span><span class="nt">&quot;model&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;/path/to/llama-2-7b-chat.gguf&quot;</span><span class="p">,</span>
<a id="__codelineno-2-6" name="__codelineno-2-6" href="#__codelineno-2-6"></a><span class="w"> </span><span class="nt">&quot;context_size&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span> <a id="__codelineno-2-6" name="__codelineno-2-6" href="#__codelineno-2-6"></a><span class="w"> </span><span class="nt">&quot;threads&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
<a id="__codelineno-2-7" name="__codelineno-2-7" href="#__codelineno-2-7"></a><span class="w"> </span><span class="p">}</span> <a id="__codelineno-2-7" name="__codelineno-2-7" href="#__codelineno-2-7"></a><span class="w"> </span><span class="nt">&quot;ctx_size&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
<a id="__codelineno-2-8" name="__codelineno-2-8" href="#__codelineno-2-8"></a><span class="p">}</span> <a id="__codelineno-2-8" name="__codelineno-2-8" href="#__codelineno-2-8"></a><span class="w"> </span><span class="nt">&quot;gpu_layers&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span>
</code></pre></div> <a id="__codelineno-2-9" name="__codelineno-2-9" href="#__codelineno-2-9"></a><span class="w"> </span><span class="p">}</span>
<a id="__codelineno-2-10" name="__codelineno-2-10" href="#__codelineno-2-10"></a><span class="p">}</span>
</code></pre></div></p>
<p><strong>MLX backend (macOS only):</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="p">{</span>
<a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;mistral-mlx&quot;</span><span class="p">,</span>
<a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a><span class="w"> </span><span class="nt">&quot;backend_type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;mlx_lm&quot;</span><span class="p">,</span>
<a id="__codelineno-3-4" name="__codelineno-3-4" href="#__codelineno-3-4"></a><span class="w"> </span><span class="nt">&quot;backend_options&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<a id="__codelineno-3-5" name="__codelineno-3-5" href="#__codelineno-3-5"></a><span class="w"> </span><span class="nt">&quot;model&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;mlx-community/Mistral-7B-Instruct-v0.3-4bit&quot;</span><span class="p">,</span>
<a id="__codelineno-3-6" name="__codelineno-3-6" href="#__codelineno-3-6"></a><span class="w"> </span><span class="nt">&quot;temp&quot;</span><span class="p">:</span><span class="w"> </span><span class="mf">0.7</span><span class="p">,</span>
<a id="__codelineno-3-7" name="__codelineno-3-7" href="#__codelineno-3-7"></a><span class="w"> </span><span class="nt">&quot;max_tokens&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span>
<a id="__codelineno-3-8" name="__codelineno-3-8" href="#__codelineno-3-8"></a><span class="w"> </span><span class="p">}</span>
<a id="__codelineno-3-9" name="__codelineno-3-9" href="#__codelineno-3-9"></a><span class="p">}</span>
</code></pre></div></p>
<p><strong>vLLM backend:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a><span class="p">{</span>
<a id="__codelineno-4-2" name="__codelineno-4-2" href="#__codelineno-4-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;dialogpt-vllm&quot;</span><span class="p">,</span>
<a id="__codelineno-4-3" name="__codelineno-4-3" href="#__codelineno-4-3"></a><span class="w"> </span><span class="nt">&quot;backend_type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;vllm&quot;</span><span class="p">,</span>
<a id="__codelineno-4-4" name="__codelineno-4-4" href="#__codelineno-4-4"></a><span class="w"> </span><span class="nt">&quot;backend_options&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<a id="__codelineno-4-5" name="__codelineno-4-5" href="#__codelineno-4-5"></a><span class="w"> </span><span class="nt">&quot;model&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;microsoft/DialoGPT-medium&quot;</span><span class="p">,</span>
<a id="__codelineno-4-6" name="__codelineno-4-6" href="#__codelineno-4-6"></a><span class="w"> </span><span class="nt">&quot;tensor_parallel_size&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span>
<a id="__codelineno-4-7" name="__codelineno-4-7" href="#__codelineno-4-7"></a><span class="w"> </span><span class="nt">&quot;gpu_memory_utilization&quot;</span><span class="p">:</span><span class="w"> </span><span class="mf">0.9</span>
<a id="__codelineno-4-8" name="__codelineno-4-8" href="#__codelineno-4-8"></a><span class="w"> </span><span class="p">}</span>
<a id="__codelineno-4-9" name="__codelineno-4-9" href="#__codelineno-4-9"></a><span class="p">}</span>
</code></pre></div></p>
<h2 id="using-the-api">Using the API<a class="headerlink" href="#using-the-api" title="Permanent link">&para;</a></h2> <h2 id="using-the-api">Using the API<a class="headerlink" href="#using-the-api" title="Permanent link">&para;</a></h2>
<p>You can also manage instances via the REST API:</p> <p>You can also manage instances via the REST API:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="c1"># List all instances</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-5-1" name="__codelineno-5-1" href="#__codelineno-5-1"></a><span class="c1"># List all instances</span>
<a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a>curl<span class="w"> </span>http://localhost:8080/api/instances <a id="__codelineno-5-2" name="__codelineno-5-2" href="#__codelineno-5-2"></a>curl<span class="w"> </span>http://localhost:8080/api/instances
<a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a> <a id="__codelineno-5-3" name="__codelineno-5-3" href="#__codelineno-5-3"></a>
<a id="__codelineno-3-4" name="__codelineno-3-4" href="#__codelineno-3-4"></a><span class="c1"># Create a new instance</span> <a id="__codelineno-5-4" name="__codelineno-5-4" href="#__codelineno-5-4"></a><span class="c1"># Create a new llama.cpp instance</span>
<a id="__codelineno-3-5" name="__codelineno-3-5" href="#__codelineno-3-5"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/instances<span class="w"> </span><span class="se">\</span> <a id="__codelineno-5-5" name="__codelineno-5-5" href="#__codelineno-5-5"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/instances/my-model<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-3-6" name="__codelineno-3-6" href="#__codelineno-3-6"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span> <a id="__codelineno-5-6" name="__codelineno-5-6" href="#__codelineno-5-6"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-3-7" name="__codelineno-3-7" href="#__codelineno-3-7"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span> <a id="__codelineno-5-7" name="__codelineno-5-7" href="#__codelineno-5-7"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-3-8" name="__codelineno-3-8" href="#__codelineno-3-8"></a><span class="s1"> &quot;name&quot;: &quot;my-model&quot;,</span> <a id="__codelineno-5-8" name="__codelineno-5-8" href="#__codelineno-5-8"></a><span class="s1"> &quot;backend_type&quot;: &quot;llama_cpp&quot;,</span>
<a id="__codelineno-3-9" name="__codelineno-3-9" href="#__codelineno-3-9"></a><span class="s1"> &quot;model_path&quot;: &quot;/path/to/model.gguf&quot;,</span> <a id="__codelineno-5-9" name="__codelineno-5-9" href="#__codelineno-5-9"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-3-10" name="__codelineno-3-10" href="#__codelineno-3-10"></a><span class="s1"> }&#39;</span> <a id="__codelineno-5-10" name="__codelineno-5-10" href="#__codelineno-5-10"></a><span class="s1"> &quot;model&quot;: &quot;/path/to/model.gguf&quot;</span>
<a id="__codelineno-3-11" name="__codelineno-3-11" href="#__codelineno-3-11"></a> <a id="__codelineno-5-11" name="__codelineno-5-11" href="#__codelineno-5-11"></a><span class="s1"> }</span>
<a id="__codelineno-3-12" name="__codelineno-3-12" href="#__codelineno-3-12"></a><span class="c1"># Start an instance</span> <a id="__codelineno-5-12" name="__codelineno-5-12" href="#__codelineno-5-12"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-3-13" name="__codelineno-3-13" href="#__codelineno-3-13"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/instances/my-model/start <a id="__codelineno-5-13" name="__codelineno-5-13" href="#__codelineno-5-13"></a>
<a id="__codelineno-5-14" name="__codelineno-5-14" href="#__codelineno-5-14"></a><span class="c1"># Start an instance</span>
<a id="__codelineno-5-15" name="__codelineno-5-15" href="#__codelineno-5-15"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/instances/my-model/start
</code></pre></div> </code></pre></div>
<h2 id="openai-compatible-api">OpenAI Compatible API<a class="headerlink" href="#openai-compatible-api" title="Permanent link">&para;</a></h2> <h2 id="openai-compatible-api">OpenAI Compatible API<a class="headerlink" href="#openai-compatible-api" title="Permanent link">&para;</a></h2>
<p>Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.</p> <p>Llamactl provides OpenAI-compatible endpoints, making it easy to integrate with existing OpenAI client libraries and tools.</p>
<h3 id="chat-completions">Chat Completions<a class="headerlink" href="#chat-completions" title="Permanent link">&para;</a></h3> <h3 id="chat-completions">Chat Completions<a class="headerlink" href="#chat-completions" title="Permanent link">&para;</a></h3>
<p>Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:</p> <p>Once you have an instance running, you can use it with the OpenAI-compatible chat completions endpoint:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/v1/chat/completions<span class="w"> </span><span class="se">\</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-6-1" name="__codelineno-6-1" href="#__codelineno-6-1"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/v1/chat/completions<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-4-2" name="__codelineno-4-2" href="#__codelineno-4-2"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span> <a id="__codelineno-6-2" name="__codelineno-6-2" href="#__codelineno-6-2"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-4-3" name="__codelineno-4-3" href="#__codelineno-4-3"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span> <a id="__codelineno-6-3" name="__codelineno-6-3" href="#__codelineno-6-3"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-4-4" name="__codelineno-4-4" href="#__codelineno-4-4"></a><span class="s1"> &quot;model&quot;: &quot;my-model&quot;,</span> <a id="__codelineno-6-4" name="__codelineno-6-4" href="#__codelineno-6-4"></a><span class="s1"> &quot;model&quot;: &quot;my-model&quot;,</span>
<a id="__codelineno-4-5" name="__codelineno-4-5" href="#__codelineno-4-5"></a><span class="s1"> &quot;messages&quot;: [</span> <a id="__codelineno-6-5" name="__codelineno-6-5" href="#__codelineno-6-5"></a><span class="s1"> &quot;messages&quot;: [</span>
<a id="__codelineno-4-6" name="__codelineno-4-6" href="#__codelineno-4-6"></a><span class="s1"> {</span> <a id="__codelineno-6-6" name="__codelineno-6-6" href="#__codelineno-6-6"></a><span class="s1"> {</span>
<a id="__codelineno-4-7" name="__codelineno-4-7" href="#__codelineno-4-7"></a><span class="s1"> &quot;role&quot;: &quot;user&quot;,</span> <a id="__codelineno-6-7" name="__codelineno-6-7" href="#__codelineno-6-7"></a><span class="s1"> &quot;role&quot;: &quot;user&quot;,</span>
<a id="__codelineno-4-8" name="__codelineno-4-8" href="#__codelineno-4-8"></a><span class="s1"> &quot;content&quot;: &quot;Hello! Can you help me write a Python function?&quot;</span> <a id="__codelineno-6-8" name="__codelineno-6-8" href="#__codelineno-6-8"></a><span class="s1"> &quot;content&quot;: &quot;Hello! Can you help me write a Python function?&quot;</span>
<a id="__codelineno-4-9" name="__codelineno-4-9" href="#__codelineno-4-9"></a><span class="s1"> }</span> <a id="__codelineno-6-9" name="__codelineno-6-9" href="#__codelineno-6-9"></a><span class="s1"> }</span>
<a id="__codelineno-4-10" name="__codelineno-4-10" href="#__codelineno-4-10"></a><span class="s1"> ],</span> <a id="__codelineno-6-10" name="__codelineno-6-10" href="#__codelineno-6-10"></a><span class="s1"> ],</span>
<a id="__codelineno-4-11" name="__codelineno-4-11" href="#__codelineno-4-11"></a><span class="s1"> &quot;max_tokens&quot;: 150,</span> <a id="__codelineno-6-11" name="__codelineno-6-11" href="#__codelineno-6-11"></a><span class="s1"> &quot;max_tokens&quot;: 150,</span>
<a id="__codelineno-4-12" name="__codelineno-4-12" href="#__codelineno-4-12"></a><span class="s1"> &quot;temperature&quot;: 0.7</span> <a id="__codelineno-6-12" name="__codelineno-6-12" href="#__codelineno-6-12"></a><span class="s1"> &quot;temperature&quot;: 0.7</span>
<a id="__codelineno-4-13" name="__codelineno-4-13" href="#__codelineno-4-13"></a><span class="s1"> }&#39;</span> <a id="__codelineno-6-13" name="__codelineno-6-13" href="#__codelineno-6-13"></a><span class="s1"> }&#39;</span>
</code></pre></div> </code></pre></div>
<h3 id="using-with-python-openai-client">Using with Python OpenAI Client<a class="headerlink" href="#using-with-python-openai-client" title="Permanent link">&para;</a></h3> <h3 id="using-with-python-openai-client">Using with Python OpenAI Client<a class="headerlink" href="#using-with-python-openai-client" title="Permanent link">&para;</a></h3>
<p>You can also use the official OpenAI Python client:</p> <p>You can also use the official OpenAI Python client:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-5-1" name="__codelineno-5-1" href="#__codelineno-5-1"></a><span class="kn">from</span><span class="w"> </span><span class="nn">openai</span><span class="w"> </span><span class="kn">import</span> <span class="n">OpenAI</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-7-1" name="__codelineno-7-1" href="#__codelineno-7-1"></a><span class="kn">from</span><span class="w"> </span><span class="nn">openai</span><span class="w"> </span><span class="kn">import</span> <span class="n">OpenAI</span>
<a id="__codelineno-5-2" name="__codelineno-5-2" href="#__codelineno-5-2"></a> <a id="__codelineno-7-2" name="__codelineno-7-2" href="#__codelineno-7-2"></a>
<a id="__codelineno-5-3" name="__codelineno-5-3" href="#__codelineno-5-3"></a><span class="c1"># Point the client to your Llamactl server</span> <a id="__codelineno-7-3" name="__codelineno-7-3" href="#__codelineno-7-3"></a><span class="c1"># Point the client to your Llamactl server</span>
<a id="__codelineno-5-4" name="__codelineno-5-4" href="#__codelineno-5-4"></a><span class="n">client</span> <span class="o">=</span> <span class="n">OpenAI</span><span class="p">(</span> <a id="__codelineno-7-4" name="__codelineno-7-4" href="#__codelineno-7-4"></a><span class="n">client</span> <span class="o">=</span> <span class="n">OpenAI</span><span class="p">(</span>
<a id="__codelineno-5-5" name="__codelineno-5-5" href="#__codelineno-5-5"></a> <span class="n">base_url</span><span class="o">=</span><span class="s2">&quot;http://localhost:8080/v1&quot;</span><span class="p">,</span> <a id="__codelineno-7-5" name="__codelineno-7-5" href="#__codelineno-7-5"></a> <span class="n">base_url</span><span class="o">=</span><span class="s2">&quot;http://localhost:8080/v1&quot;</span><span class="p">,</span>
<a id="__codelineno-5-6" name="__codelineno-5-6" href="#__codelineno-5-6"></a> <span class="n">api_key</span><span class="o">=</span><span class="s2">&quot;not-needed&quot;</span> <span class="c1"># Llamactl doesn&#39;t require API keys by default</span> <a id="__codelineno-7-6" name="__codelineno-7-6" href="#__codelineno-7-6"></a> <span class="n">api_key</span><span class="o">=</span><span class="s2">&quot;not-needed&quot;</span> <span class="c1"># Llamactl doesn&#39;t require API keys by default</span>
<a id="__codelineno-5-7" name="__codelineno-5-7" href="#__codelineno-5-7"></a><span class="p">)</span> <a id="__codelineno-7-7" name="__codelineno-7-7" href="#__codelineno-7-7"></a><span class="p">)</span>
<a id="__codelineno-5-8" name="__codelineno-5-8" href="#__codelineno-5-8"></a> <a id="__codelineno-7-8" name="__codelineno-7-8" href="#__codelineno-7-8"></a>
<a id="__codelineno-5-9" name="__codelineno-5-9" href="#__codelineno-5-9"></a><span class="c1"># Create a chat completion</span> <a id="__codelineno-7-9" name="__codelineno-7-9" href="#__codelineno-7-9"></a><span class="c1"># Create a chat completion</span>
<a id="__codelineno-5-10" name="__codelineno-5-10" href="#__codelineno-5-10"></a><span class="n">response</span> <span class="o">=</span> <span class="n">client</span><span class="o">.</span><span class="n">chat</span><span class="o">.</span><span class="n">completions</span><span class="o">.</span><span class="n">create</span><span class="p">(</span> <a id="__codelineno-7-10" name="__codelineno-7-10" href="#__codelineno-7-10"></a><span class="n">response</span> <span class="o">=</span> <span class="n">client</span><span class="o">.</span><span class="n">chat</span><span class="o">.</span><span class="n">completions</span><span class="o">.</span><span class="n">create</span><span class="p">(</span>
<a id="__codelineno-5-11" name="__codelineno-5-11" href="#__codelineno-5-11"></a> <span class="n">model</span><span class="o">=</span><span class="s2">&quot;my-model&quot;</span><span class="p">,</span> <span class="c1"># Use the name of your instance</span> <a id="__codelineno-7-11" name="__codelineno-7-11" href="#__codelineno-7-11"></a> <span class="n">model</span><span class="o">=</span><span class="s2">&quot;my-model&quot;</span><span class="p">,</span> <span class="c1"># Use the name of your instance</span>
<a id="__codelineno-5-12" name="__codelineno-5-12" href="#__codelineno-5-12"></a> <span class="n">messages</span><span class="o">=</span><span class="p">[</span> <a id="__codelineno-7-12" name="__codelineno-7-12" href="#__codelineno-7-12"></a> <span class="n">messages</span><span class="o">=</span><span class="p">[</span>
<a id="__codelineno-5-13" name="__codelineno-5-13" href="#__codelineno-5-13"></a> <span class="p">{</span><span class="s2">&quot;role&quot;</span><span class="p">:</span> <span class="s2">&quot;user&quot;</span><span class="p">,</span> <span class="s2">&quot;content&quot;</span><span class="p">:</span> <span class="s2">&quot;Explain quantum computing in simple terms&quot;</span><span class="p">}</span> <a id="__codelineno-7-13" name="__codelineno-7-13" href="#__codelineno-7-13"></a> <span class="p">{</span><span class="s2">&quot;role&quot;</span><span class="p">:</span> <span class="s2">&quot;user&quot;</span><span class="p">,</span> <span class="s2">&quot;content&quot;</span><span class="p">:</span> <span class="s2">&quot;Explain quantum computing in simple terms&quot;</span><span class="p">}</span>
<a id="__codelineno-5-14" name="__codelineno-5-14" href="#__codelineno-5-14"></a> <span class="p">],</span> <a id="__codelineno-7-14" name="__codelineno-7-14" href="#__codelineno-7-14"></a> <span class="p">],</span>
<a id="__codelineno-5-15" name="__codelineno-5-15" href="#__codelineno-5-15"></a> <span class="n">max_tokens</span><span class="o">=</span><span class="mi">200</span><span class="p">,</span> <a id="__codelineno-7-15" name="__codelineno-7-15" href="#__codelineno-7-15"></a> <span class="n">max_tokens</span><span class="o">=</span><span class="mi">200</span><span class="p">,</span>
<a id="__codelineno-5-16" name="__codelineno-5-16" href="#__codelineno-5-16"></a> <span class="n">temperature</span><span class="o">=</span><span class="mf">0.7</span> <a id="__codelineno-7-16" name="__codelineno-7-16" href="#__codelineno-7-16"></a> <span class="n">temperature</span><span class="o">=</span><span class="mf">0.7</span>
<a id="__codelineno-5-17" name="__codelineno-5-17" href="#__codelineno-5-17"></a><span class="p">)</span> <a id="__codelineno-7-17" name="__codelineno-7-17" href="#__codelineno-7-17"></a><span class="p">)</span>
<a id="__codelineno-5-18" name="__codelineno-5-18" href="#__codelineno-5-18"></a> <a id="__codelineno-7-18" name="__codelineno-7-18" href="#__codelineno-7-18"></a>
<a id="__codelineno-5-19" name="__codelineno-5-19" href="#__codelineno-5-19"></a><span class="nb">print</span><span class="p">(</span><span class="n">response</span><span class="o">.</span><span class="n">choices</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">message</span><span class="o">.</span><span class="n">content</span><span class="p">)</span> <a id="__codelineno-7-19" name="__codelineno-7-19" href="#__codelineno-7-19"></a><span class="nb">print</span><span class="p">(</span><span class="n">response</span><span class="o">.</span><span class="n">choices</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">message</span><span class="o">.</span><span class="n">content</span><span class="p">)</span>
</code></pre></div> </code></pre></div>
<h3 id="list-available-models">List Available Models<a class="headerlink" href="#list-available-models" title="Permanent link">&para;</a></h3> <h3 id="list-available-models">List Available Models<a class="headerlink" href="#list-available-models" title="Permanent link">&para;</a></h3>
<p>Get a list of running instances (models) in OpenAI-compatible format:</p> <p>Get a list of running instances (models) in OpenAI-compatible format:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-6-1" name="__codelineno-6-1" href="#__codelineno-6-1"></a>curl<span class="w"> </span>http://localhost:8080/v1/models <div class="highlight"><pre><span></span><code><a id="__codelineno-8-1" name="__codelineno-8-1" href="#__codelineno-8-1"></a>curl<span class="w"> </span>http://localhost:8080/v1/models
</code></pre></div> </code></pre></div>
<h2 id="next-steps">Next Steps<a class="headerlink" href="#next-steps" title="Permanent link">&para;</a></h2> <h2 id="next-steps">Next Steps<a class="headerlink" href="#next-steps" title="Permanent link">&para;</a></h2>
<ul> <ul>
@@ -992,7 +1020,7 @@
<span class="md-icon" title="Last update"> <span class="md-icon" title="Last update">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg>
</span> </span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 3, 2025</span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 21, 2025</span>
</span> </span>

File diff suppressed because one or more lines are too long

View File

@@ -2,37 +2,37 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url> <url>
<loc>https://llamactl.org/dev/</loc> <loc>https://llamactl.org/dev/</loc>
<lastmod>2025-09-18</lastmod> <lastmod>2025-09-22</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
</url> </url>
<url> <url>
<loc>https://llamactl.org/dev/getting-started/configuration/</loc> <loc>https://llamactl.org/dev/getting-started/configuration/</loc>
<lastmod>2025-09-18</lastmod> <lastmod>2025-09-22</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
</url> </url>
<url> <url>
<loc>https://llamactl.org/dev/getting-started/installation/</loc> <loc>https://llamactl.org/dev/getting-started/installation/</loc>
<lastmod>2025-09-18</lastmod> <lastmod>2025-09-22</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
</url> </url>
<url> <url>
<loc>https://llamactl.org/dev/getting-started/quick-start/</loc> <loc>https://llamactl.org/dev/getting-started/quick-start/</loc>
<lastmod>2025-09-18</lastmod> <lastmod>2025-09-22</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
</url> </url>
<url> <url>
<loc>https://llamactl.org/dev/user-guide/api-reference/</loc> <loc>https://llamactl.org/dev/user-guide/api-reference/</loc>
<lastmod>2025-09-18</lastmod> <lastmod>2025-09-22</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
</url> </url>
<url> <url>
<loc>https://llamactl.org/dev/user-guide/managing-instances/</loc> <loc>https://llamactl.org/dev/user-guide/managing-instances/</loc>
<lastmod>2025-09-18</lastmod> <lastmod>2025-09-22</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
</url> </url>
<url> <url>
<loc>https://llamactl.org/dev/user-guide/troubleshooting/</loc> <loc>https://llamactl.org/dev/user-guide/troubleshooting/</loc>
<lastmod>2025-09-18</lastmod> <lastmod>2025-09-22</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
</url> </url>
</urlset> </urlset>

Binary file not shown.

View File

@@ -856,6 +856,72 @@
</ul> </ul>
</nav> </nav>
</li>
<li class="md-nav__item">
<a href="#backend-specific-endpoints" class="md-nav__link">
<span class="md-ellipsis">
Backend-Specific Endpoints
</span>
</a>
<nav class="md-nav" aria-label="Backend-Specific Endpoints">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#parse-commands" class="md-nav__link">
<span class="md-ellipsis">
Parse Commands
</span>
</a>
<nav class="md-nav" aria-label="Parse Commands">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#parse-llamacpp-command" class="md-nav__link">
<span class="md-ellipsis">
Parse Llama.cpp Command
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#parse-mlx-lm-command" class="md-nav__link">
<span class="md-ellipsis">
Parse MLX-LM Command
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#parse-vllm-command" class="md-nav__link">
<span class="md-ellipsis">
Parse vLLM Command
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#auto-generated-documentation" class="md-nav__link">
<span class="md-ellipsis">
Auto-Generated Documentation
</span>
</a>
</li> </li>
<li class="md-nav__item"> <li class="md-nav__item">
@@ -1216,6 +1282,72 @@
</ul> </ul>
</nav> </nav>
</li>
<li class="md-nav__item">
<a href="#backend-specific-endpoints" class="md-nav__link">
<span class="md-ellipsis">
Backend-Specific Endpoints
</span>
</a>
<nav class="md-nav" aria-label="Backend-Specific Endpoints">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#parse-commands" class="md-nav__link">
<span class="md-ellipsis">
Parse Commands
</span>
</a>
<nav class="md-nav" aria-label="Parse Commands">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#parse-llamacpp-command" class="md-nav__link">
<span class="md-ellipsis">
Parse Llama.cpp Command
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#parse-mlx-lm-command" class="md-nav__link">
<span class="md-ellipsis">
Parse MLX-LM Command
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#parse-vllm-command" class="md-nav__link">
<span class="md-ellipsis">
Parse vLLM Command
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#auto-generated-documentation" class="md-nav__link">
<span class="md-ellipsis">
Auto-Generated Documentation
</span>
</a>
</li> </li>
<li class="md-nav__item"> <li class="md-nav__item">
@@ -1346,7 +1478,7 @@
<p><strong>Response:</strong> <p><strong>Response:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-17-1" name="__codelineno-17-1" href="#__codelineno-17-1"></a><span class="p">{</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-17-1" name="__codelineno-17-1" href="#__codelineno-17-1"></a><span class="p">{</span>
<a id="__codelineno-17-2" name="__codelineno-17-2" href="#__codelineno-17-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span> <a id="__codelineno-17-2" name="__codelineno-17-2" href="#__codelineno-17-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span>
<a id="__codelineno-17-3" name="__codelineno-17-3" href="#__codelineno-17-3"></a><span class="w"> </span><span class="nt">&quot;status&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;starting&quot;</span><span class="p">,</span> <a id="__codelineno-17-3" name="__codelineno-17-3" href="#__codelineno-17-3"></a><span class="w"> </span><span class="nt">&quot;status&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;running&quot;</span><span class="p">,</span>
<a id="__codelineno-17-4" name="__codelineno-17-4" href="#__codelineno-17-4"></a><span class="w"> </span><span class="nt">&quot;created&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1705312200</span> <a id="__codelineno-17-4" name="__codelineno-17-4" href="#__codelineno-17-4"></a><span class="w"> </span><span class="nt">&quot;created&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1705312200</span>
<a id="__codelineno-17-5" name="__codelineno-17-5" href="#__codelineno-17-5"></a><span class="p">}</span> <a id="__codelineno-17-5" name="__codelineno-17-5" href="#__codelineno-17-5"></a><span class="p">}</span>
</code></pre></div></p> </code></pre></div></p>
@@ -1360,7 +1492,7 @@
<p><strong>Response:</strong> <p><strong>Response:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-19-1" name="__codelineno-19-1" href="#__codelineno-19-1"></a><span class="p">{</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-19-1" name="__codelineno-19-1" href="#__codelineno-19-1"></a><span class="p">{</span>
<a id="__codelineno-19-2" name="__codelineno-19-2" href="#__codelineno-19-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span> <a id="__codelineno-19-2" name="__codelineno-19-2" href="#__codelineno-19-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span>
<a id="__codelineno-19-3" name="__codelineno-19-3" href="#__codelineno-19-3"></a><span class="w"> </span><span class="nt">&quot;status&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;stopping&quot;</span><span class="p">,</span> <a id="__codelineno-19-3" name="__codelineno-19-3" href="#__codelineno-19-3"></a><span class="w"> </span><span class="nt">&quot;status&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;stopped&quot;</span><span class="p">,</span>
<a id="__codelineno-19-4" name="__codelineno-19-4" href="#__codelineno-19-4"></a><span class="w"> </span><span class="nt">&quot;created&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1705312200</span> <a id="__codelineno-19-4" name="__codelineno-19-4" href="#__codelineno-19-4"></a><span class="w"> </span><span class="nt">&quot;created&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1705312200</span>
<a id="__codelineno-19-5" name="__codelineno-19-5" href="#__codelineno-19-5"></a><span class="p">}</span> <a id="__codelineno-19-5" name="__codelineno-19-5" href="#__codelineno-19-5"></a><span class="p">}</span>
</code></pre></div></p> </code></pre></div></p>
@@ -1371,7 +1503,7 @@
<p><strong>Response:</strong> <p><strong>Response:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-21-1" name="__codelineno-21-1" href="#__codelineno-21-1"></a><span class="p">{</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-21-1" name="__codelineno-21-1" href="#__codelineno-21-1"></a><span class="p">{</span>
<a id="__codelineno-21-2" name="__codelineno-21-2" href="#__codelineno-21-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span> <a id="__codelineno-21-2" name="__codelineno-21-2" href="#__codelineno-21-2"></a><span class="w"> </span><span class="nt">&quot;name&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama2-7b&quot;</span><span class="p">,</span>
<a id="__codelineno-21-3" name="__codelineno-21-3" href="#__codelineno-21-3"></a><span class="w"> </span><span class="nt">&quot;status&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;restarting&quot;</span><span class="p">,</span> <a id="__codelineno-21-3" name="__codelineno-21-3" href="#__codelineno-21-3"></a><span class="w"> </span><span class="nt">&quot;status&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;running&quot;</span><span class="p">,</span>
<a id="__codelineno-21-4" name="__codelineno-21-4" href="#__codelineno-21-4"></a><span class="w"> </span><span class="nt">&quot;created&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1705312200</span> <a id="__codelineno-21-4" name="__codelineno-21-4" href="#__codelineno-21-4"></a><span class="w"> </span><span class="nt">&quot;created&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1705312200</span>
<a id="__codelineno-21-5" name="__codelineno-21-5" href="#__codelineno-21-5"></a><span class="p">}</span> <a id="__codelineno-21-5" name="__codelineno-21-5" href="#__codelineno-21-5"></a><span class="p">}</span>
</code></pre></div></p> </code></pre></div></p>
@@ -1443,9 +1575,9 @@
- <code>503 Service Unavailable</code>: Instance is not running and on-demand start is disabled - <code>503 Service Unavailable</code>: Instance is not running and on-demand start is disabled
- <code>409 Conflict</code>: Cannot start instance due to maximum instances limit</p> - <code>409 Conflict</code>: Cannot start instance due to maximum instances limit</p>
<h2 id="instance-status-values">Instance Status Values<a class="headerlink" href="#instance-status-values" title="Permanent link">&para;</a></h2> <h2 id="instance-status-values">Instance Status Values<a class="headerlink" href="#instance-status-values" title="Permanent link">&para;</a></h2>
<p>Instances can have the following status values:<br /> <p>Instances can have the following status values:
- <code>stopped</code>: Instance is not running<br /> - <code>stopped</code>: Instance is not running
- <code>running</code>: Instance is running and ready to accept requests<br /> - <code>running</code>: Instance is running and ready to accept requests
- <code>failed</code>: Instance failed to start or crashed </p> - <code>failed</code>: Instance failed to start or crashed </p>
<h2 id="error-responses">Error Responses<a class="headerlink" href="#error-responses" title="Permanent link">&para;</a></h2> <h2 id="error-responses">Error Responses<a class="headerlink" href="#error-responses" title="Permanent link">&para;</a></h2>
<p>All endpoints may return error responses in the following format:</p> <p>All endpoints may return error responses in the following format:</p>
@@ -1515,9 +1647,76 @@
<a id="__codelineno-32-7" name="__codelineno-32-7" href="#__codelineno-32-7"></a><span class="s1"> &quot;n_predict&quot;: 50</span> <a id="__codelineno-32-7" name="__codelineno-32-7" href="#__codelineno-32-7"></a><span class="s1"> &quot;n_predict&quot;: 50</span>
<a id="__codelineno-32-8" name="__codelineno-32-8" href="#__codelineno-32-8"></a><span class="s1"> }&#39;</span> <a id="__codelineno-32-8" name="__codelineno-32-8" href="#__codelineno-32-8"></a><span class="s1"> }&#39;</span>
</code></pre></div> </code></pre></div>
<h2 id="backend-specific-endpoints">Backend-Specific Endpoints<a class="headerlink" href="#backend-specific-endpoints" title="Permanent link">&para;</a></h2>
<h3 id="parse-commands">Parse Commands<a class="headerlink" href="#parse-commands" title="Permanent link">&para;</a></h3>
<p>Llamactl provides endpoints to parse command strings from different backends into instance configuration options.</p>
<h4 id="parse-llamacpp-command">Parse Llama.cpp Command<a class="headerlink" href="#parse-llamacpp-command" title="Permanent link">&para;</a></h4>
<p>Parse a llama-server command string into instance options.</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-33-1" name="__codelineno-33-1" href="#__codelineno-33-1"></a><span class="err">POST /api/v1/backends/llama-cpp/parse-command</span>
</code></pre></div>
<p><strong>Request Body:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-34-1" name="__codelineno-34-1" href="#__codelineno-34-1"></a><span class="p">{</span>
<a id="__codelineno-34-2" name="__codelineno-34-2" href="#__codelineno-34-2"></a><span class="w"> </span><span class="nt">&quot;command&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama-server -m /path/to/model.gguf -c 2048 --port 8080&quot;</span>
<a id="__codelineno-34-3" name="__codelineno-34-3" href="#__codelineno-34-3"></a><span class="p">}</span>
</code></pre></div></p>
<p><strong>Response:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-35-1" name="__codelineno-35-1" href="#__codelineno-35-1"></a><span class="p">{</span>
<a id="__codelineno-35-2" name="__codelineno-35-2" href="#__codelineno-35-2"></a><span class="w"> </span><span class="nt">&quot;backend_type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llama_cpp&quot;</span><span class="p">,</span>
<a id="__codelineno-35-3" name="__codelineno-35-3" href="#__codelineno-35-3"></a><span class="w"> </span><span class="nt">&quot;llama_server_options&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<a id="__codelineno-35-4" name="__codelineno-35-4" href="#__codelineno-35-4"></a><span class="w"> </span><span class="nt">&quot;model&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;/path/to/model.gguf&quot;</span><span class="p">,</span>
<a id="__codelineno-35-5" name="__codelineno-35-5" href="#__codelineno-35-5"></a><span class="w"> </span><span class="nt">&quot;ctx_size&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
<a id="__codelineno-35-6" name="__codelineno-35-6" href="#__codelineno-35-6"></a><span class="w"> </span><span class="nt">&quot;port&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">8080</span>
<a id="__codelineno-35-7" name="__codelineno-35-7" href="#__codelineno-35-7"></a><span class="w"> </span><span class="p">}</span>
<a id="__codelineno-35-8" name="__codelineno-35-8" href="#__codelineno-35-8"></a><span class="p">}</span>
</code></pre></div></p>
<h4 id="parse-mlx-lm-command">Parse MLX-LM Command<a class="headerlink" href="#parse-mlx-lm-command" title="Permanent link">&para;</a></h4>
<p>Parse an MLX-LM server command string into instance options.</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-36-1" name="__codelineno-36-1" href="#__codelineno-36-1"></a><span class="err">POST /api/v1/backends/mlx/parse-command</span>
</code></pre></div>
<p><strong>Request Body:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-37-1" name="__codelineno-37-1" href="#__codelineno-37-1"></a><span class="p">{</span>
<a id="__codelineno-37-2" name="__codelineno-37-2" href="#__codelineno-37-2"></a><span class="w"> </span><span class="nt">&quot;command&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;mlx_lm.server --model /path/to/model --port 8080&quot;</span>
<a id="__codelineno-37-3" name="__codelineno-37-3" href="#__codelineno-37-3"></a><span class="p">}</span>
</code></pre></div></p>
<p><strong>Response:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-38-1" name="__codelineno-38-1" href="#__codelineno-38-1"></a><span class="p">{</span>
<a id="__codelineno-38-2" name="__codelineno-38-2" href="#__codelineno-38-2"></a><span class="w"> </span><span class="nt">&quot;backend_type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;mlx_lm&quot;</span><span class="p">,</span>
<a id="__codelineno-38-3" name="__codelineno-38-3" href="#__codelineno-38-3"></a><span class="w"> </span><span class="nt">&quot;mlx_server_options&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<a id="__codelineno-38-4" name="__codelineno-38-4" href="#__codelineno-38-4"></a><span class="w"> </span><span class="nt">&quot;model&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;/path/to/model&quot;</span><span class="p">,</span>
<a id="__codelineno-38-5" name="__codelineno-38-5" href="#__codelineno-38-5"></a><span class="w"> </span><span class="nt">&quot;port&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">8080</span>
<a id="__codelineno-38-6" name="__codelineno-38-6" href="#__codelineno-38-6"></a><span class="w"> </span><span class="p">}</span>
<a id="__codelineno-38-7" name="__codelineno-38-7" href="#__codelineno-38-7"></a><span class="p">}</span>
</code></pre></div></p>
<h4 id="parse-vllm-command">Parse vLLM Command<a class="headerlink" href="#parse-vllm-command" title="Permanent link">&para;</a></h4>
<p>Parse a vLLM serve command string into instance options.</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-39-1" name="__codelineno-39-1" href="#__codelineno-39-1"></a><span class="err">POST /api/v1/backends/vllm/parse-command</span>
</code></pre></div>
<p><strong>Request Body:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-40-1" name="__codelineno-40-1" href="#__codelineno-40-1"></a><span class="p">{</span>
<a id="__codelineno-40-2" name="__codelineno-40-2" href="#__codelineno-40-2"></a><span class="w"> </span><span class="nt">&quot;command&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;vllm serve /path/to/model --port 8080&quot;</span>
<a id="__codelineno-40-3" name="__codelineno-40-3" href="#__codelineno-40-3"></a><span class="p">}</span>
</code></pre></div></p>
<p><strong>Response:</strong>
<div class="highlight"><pre><span></span><code><a id="__codelineno-41-1" name="__codelineno-41-1" href="#__codelineno-41-1"></a><span class="p">{</span>
<a id="__codelineno-41-2" name="__codelineno-41-2" href="#__codelineno-41-2"></a><span class="w"> </span><span class="nt">&quot;backend_type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;vllm&quot;</span><span class="p">,</span>
<a id="__codelineno-41-3" name="__codelineno-41-3" href="#__codelineno-41-3"></a><span class="w"> </span><span class="nt">&quot;vllm_server_options&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<a id="__codelineno-41-4" name="__codelineno-41-4" href="#__codelineno-41-4"></a><span class="w"> </span><span class="nt">&quot;model&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;/path/to/model&quot;</span><span class="p">,</span>
<a id="__codelineno-41-5" name="__codelineno-41-5" href="#__codelineno-41-5"></a><span class="w"> </span><span class="nt">&quot;port&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">8080</span>
<a id="__codelineno-41-6" name="__codelineno-41-6" href="#__codelineno-41-6"></a><span class="w"> </span><span class="p">}</span>
<a id="__codelineno-41-7" name="__codelineno-41-7" href="#__codelineno-41-7"></a><span class="p">}</span>
</code></pre></div></p>
<p><strong>Error Responses for Parse Commands:</strong>
- <code>400 Bad Request</code>: Invalid request body, empty command, or parse error
- <code>500 Internal Server Error</code>: Encoding error</p>
<h2 id="auto-generated-documentation">Auto-Generated Documentation<a class="headerlink" href="#auto-generated-documentation" title="Permanent link">&para;</a></h2>
<p>The API documentation is automatically generated from code annotations using Swagger/OpenAPI. To regenerate the documentation:</p>
<ol>
<li>Install the swag tool: <code>go install github.com/swaggo/swag/cmd/swag@latest</code></li>
<li>Generate docs: <code>swag init -g cmd/server/main.go -o apidocs</code></li>
</ol>
<h2 id="swagger-documentation">Swagger Documentation<a class="headerlink" href="#swagger-documentation" title="Permanent link">&para;</a></h2> <h2 id="swagger-documentation">Swagger Documentation<a class="headerlink" href="#swagger-documentation" title="Permanent link">&para;</a></h2>
<p>If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:</p> <p>If swagger documentation is enabled in the server configuration, you can access the interactive API documentation at:</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-33-1" name="__codelineno-33-1" href="#__codelineno-33-1"></a>http://localhost:8080/swagger/ <div class="highlight"><pre><span></span><code><a id="__codelineno-42-1" name="__codelineno-42-1" href="#__codelineno-42-1"></a>http://localhost:8080/swagger/
</code></pre></div> </code></pre></div>
<p>This provides a complete interactive interface for testing all API endpoints.</p> <p>This provides a complete interactive interface for testing all API endpoints.</p>
@@ -1540,7 +1739,7 @@
<span class="md-icon" title="Last update"> <span class="md-icon" title="Last update">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg>
</span> </span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 3, 2025</span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 21, 2025</span>
</span> </span>

View File

@@ -1228,7 +1228,7 @@
<h1 id="managing-instances">Managing Instances<a class="headerlink" href="#managing-instances" title="Permanent link">&para;</a></h1> <h1 id="managing-instances">Managing Instances<a class="headerlink" href="#managing-instances" title="Permanent link">&para;</a></h1>
<p>Learn how to effectively manage your llama.cpp and MLX instances with Llamactl through both the Web UI and API.</p> <p>Learn how to effectively manage your llama.cpp, MLX, and vLLM instances with Llamactl through both the Web UI and API.</p>
<h2 id="overview">Overview<a class="headerlink" href="#overview" title="Permanent link">&para;</a></h2> <h2 id="overview">Overview<a class="headerlink" href="#overview" title="Permanent link">&para;</a></h2>
<p>Llamactl provides two ways to manage instances:</p> <p>Llamactl provides two ways to manage instances:</p>
<ul> <ul>
@@ -1262,11 +1262,13 @@
<li><strong>Choose Backend Type</strong>:<ul> <li><strong>Choose Backend Type</strong>:<ul>
<li><strong>llama.cpp</strong>: For GGUF models using llama-server</li> <li><strong>llama.cpp</strong>: For GGUF models using llama-server</li>
<li><strong>MLX</strong>: For MLX-optimized models (macOS only)</li> <li><strong>MLX</strong>: For MLX-optimized models (macOS only)</li>
<li><strong>vLLM</strong>: For distributed serving and high-throughput inference</li>
</ul> </ul>
</li> </li>
<li>Configure model source:<ul> <li>Configure model source:<ul>
<li><strong>For llama.cpp</strong>: GGUF model path or HuggingFace repo</li> <li><strong>For llama.cpp</strong>: GGUF model path or HuggingFace repo</li>
<li><strong>For MLX</strong>: MLX model path or identifier (e.g., <code>mlx-community/Mistral-7B-Instruct-v0.3-4bit</code>)</li> <li><strong>For MLX</strong>: MLX model path or identifier (e.g., <code>mlx-community/Mistral-7B-Instruct-v0.3-4bit</code>)</li>
<li><strong>For vLLM</strong>: HuggingFace model identifier (e.g., <code>microsoft/DialoGPT-medium</code>)</li>
</ul> </ul>
</li> </li>
<li>Configure optional instance management settings:<ul> <li>Configure optional instance management settings:<ul>
@@ -1280,6 +1282,7 @@
<li>Configure backend-specific options:<ul> <li>Configure backend-specific options:<ul>
<li><strong>llama.cpp</strong>: Threads, context size, GPU layers, port, etc.</li> <li><strong>llama.cpp</strong>: Threads, context size, GPU layers, port, etc.</li>
<li><strong>MLX</strong>: Temperature, top-p, adapter path, Python environment, etc.</li> <li><strong>MLX</strong>: Temperature, top-p, adapter path, Python environment, etc.</li>
<li><strong>vLLM</strong>: Tensor parallel size, GPU memory utilization, quantization, etc.</li>
</ul> </ul>
</li> </li>
<li>Click <strong>"Create"</strong> to save the instance </li> <li>Click <strong>"Create"</strong> to save the instance </li>
@@ -1313,17 +1316,31 @@
<a id="__codelineno-0-26" name="__codelineno-0-26" href="#__codelineno-0-26"></a><span class="s1"> &quot;max_restarts&quot;: 3</span> <a id="__codelineno-0-26" name="__codelineno-0-26" href="#__codelineno-0-26"></a><span class="s1"> &quot;max_restarts&quot;: 3</span>
<a id="__codelineno-0-27" name="__codelineno-0-27" href="#__codelineno-0-27"></a><span class="s1"> }&#39;</span> <a id="__codelineno-0-27" name="__codelineno-0-27" href="#__codelineno-0-27"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-28" name="__codelineno-0-28" href="#__codelineno-0-28"></a> <a id="__codelineno-0-28" name="__codelineno-0-28" href="#__codelineno-0-28"></a>
<a id="__codelineno-0-29" name="__codelineno-0-29" href="#__codelineno-0-29"></a><span class="c1"># Create llama.cpp instance with HuggingFace model</span> <a id="__codelineno-0-29" name="__codelineno-0-29" href="#__codelineno-0-29"></a><span class="c1"># Create vLLM instance</span>
<a id="__codelineno-0-30" name="__codelineno-0-30" href="#__codelineno-0-30"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/instances/gemma-3-27b<span class="w"> </span><span class="se">\</span> <a id="__codelineno-0-30" name="__codelineno-0-30" href="#__codelineno-0-30"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/instances/my-vllm-instance<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-31" name="__codelineno-0-31" href="#__codelineno-0-31"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span> <a id="__codelineno-0-31" name="__codelineno-0-31" href="#__codelineno-0-31"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-32" name="__codelineno-0-32" href="#__codelineno-0-32"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span> <a id="__codelineno-0-32" name="__codelineno-0-32" href="#__codelineno-0-32"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-33" name="__codelineno-0-33" href="#__codelineno-0-33"></a><span class="s1"> &quot;backend_type&quot;: &quot;llama_cpp&quot;,</span> <a id="__codelineno-0-33" name="__codelineno-0-33" href="#__codelineno-0-33"></a><span class="s1"> &quot;backend_type&quot;: &quot;vllm&quot;,</span>
<a id="__codelineno-0-34" name="__codelineno-0-34" href="#__codelineno-0-34"></a><span class="s1"> &quot;backend_options&quot;: {</span> <a id="__codelineno-0-34" name="__codelineno-0-34" href="#__codelineno-0-34"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-35" name="__codelineno-0-35" href="#__codelineno-0-35"></a><span class="s1"> &quot;hf_repo&quot;: &quot;unsloth/gemma-3-27b-it-GGUF&quot;,</span> <a id="__codelineno-0-35" name="__codelineno-0-35" href="#__codelineno-0-35"></a><span class="s1"> &quot;model&quot;: &quot;microsoft/DialoGPT-medium&quot;,</span>
<a id="__codelineno-0-36" name="__codelineno-0-36" href="#__codelineno-0-36"></a><span class="s1"> &quot;hf_file&quot;: &quot;gemma-3-27b-it-GGUF.gguf&quot;,</span> <a id="__codelineno-0-36" name="__codelineno-0-36" href="#__codelineno-0-36"></a><span class="s1"> &quot;tensor_parallel_size&quot;: 2,</span>
<a id="__codelineno-0-37" name="__codelineno-0-37" href="#__codelineno-0-37"></a><span class="s1"> &quot;gpu_layers&quot;: 32</span> <a id="__codelineno-0-37" name="__codelineno-0-37" href="#__codelineno-0-37"></a><span class="s1"> &quot;gpu_memory_utilization&quot;: 0.9</span>
<a id="__codelineno-0-38" name="__codelineno-0-38" href="#__codelineno-0-38"></a><span class="s1"> }</span> <a id="__codelineno-0-38" name="__codelineno-0-38" href="#__codelineno-0-38"></a><span class="s1"> },</span>
<a id="__codelineno-0-39" name="__codelineno-0-39" href="#__codelineno-0-39"></a><span class="s1"> }&#39;</span> <a id="__codelineno-0-39" name="__codelineno-0-39" href="#__codelineno-0-39"></a><span class="s1"> &quot;auto_restart&quot;: true,</span>
<a id="__codelineno-0-40" name="__codelineno-0-40" href="#__codelineno-0-40"></a><span class="s1"> &quot;on_demand_start&quot;: true</span>
<a id="__codelineno-0-41" name="__codelineno-0-41" href="#__codelineno-0-41"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-42" name="__codelineno-0-42" href="#__codelineno-0-42"></a>
<a id="__codelineno-0-43" name="__codelineno-0-43" href="#__codelineno-0-43"></a><span class="c1"># Create llama.cpp instance with HuggingFace model</span>
<a id="__codelineno-0-44" name="__codelineno-0-44" href="#__codelineno-0-44"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/instances/gemma-3-27b<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-45" name="__codelineno-0-45" href="#__codelineno-0-45"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-46" name="__codelineno-0-46" href="#__codelineno-0-46"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-47" name="__codelineno-0-47" href="#__codelineno-0-47"></a><span class="s1"> &quot;backend_type&quot;: &quot;llama_cpp&quot;,</span>
<a id="__codelineno-0-48" name="__codelineno-0-48" href="#__codelineno-0-48"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-49" name="__codelineno-0-49" href="#__codelineno-0-49"></a><span class="s1"> &quot;hf_repo&quot;: &quot;unsloth/gemma-3-27b-it-GGUF&quot;,</span>
<a id="__codelineno-0-50" name="__codelineno-0-50" href="#__codelineno-0-50"></a><span class="s1"> &quot;hf_file&quot;: &quot;gemma-3-27b-it-GGUF.gguf&quot;,</span>
<a id="__codelineno-0-51" name="__codelineno-0-51" href="#__codelineno-0-51"></a><span class="s1"> &quot;gpu_layers&quot;: 32</span>
<a id="__codelineno-0-52" name="__codelineno-0-52" href="#__codelineno-0-52"></a><span class="s1"> }</span>
<a id="__codelineno-0-53" name="__codelineno-0-53" href="#__codelineno-0-53"></a><span class="s1"> }&#39;</span>
</code></pre></div> </code></pre></div>
<h2 id="start-instance">Start Instance<a class="headerlink" href="#start-instance" title="Permanent link">&para;</a></h2> <h2 id="start-instance">Start Instance<a class="headerlink" href="#start-instance" title="Permanent link">&para;</a></h2>
<h3 id="via-web-ui_1">Via Web UI<a class="headerlink" href="#via-web-ui_1" title="Permanent link">&para;</a></h3> <h3 id="via-web-ui_1">Via Web UI<a class="headerlink" href="#via-web-ui_1" title="Permanent link">&para;</a></h3>
@@ -1390,13 +1407,14 @@
<div class="highlight"><pre><span></span><code><a id="__codelineno-5-1" name="__codelineno-5-1" href="#__codelineno-5-1"></a>curl<span class="w"> </span>-X<span class="w"> </span>DELETE<span class="w"> </span>http://localhost:8080/api/instances/<span class="o">{</span>name<span class="o">}</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-5-1" name="__codelineno-5-1" href="#__codelineno-5-1"></a>curl<span class="w"> </span>-X<span class="w"> </span>DELETE<span class="w"> </span>http://localhost:8080/api/instances/<span class="o">{</span>name<span class="o">}</span>
</code></pre></div> </code></pre></div>
<h2 id="instance-proxy">Instance Proxy<a class="headerlink" href="#instance-proxy" title="Permanent link">&para;</a></h2> <h2 id="instance-proxy">Instance Proxy<a class="headerlink" href="#instance-proxy" title="Permanent link">&para;</a></h2>
<p>Llamactl proxies all requests to the underlying backend instances (llama-server or MLX).</p> <p>Llamactl proxies all requests to the underlying backend instances (llama-server, MLX, or vLLM).</p>
<div class="highlight"><pre><span></span><code><a id="__codelineno-6-1" name="__codelineno-6-1" href="#__codelineno-6-1"></a><span class="c1"># Get instance details</span> <div class="highlight"><pre><span></span><code><a id="__codelineno-6-1" name="__codelineno-6-1" href="#__codelineno-6-1"></a><span class="c1"># Get instance details</span>
<a id="__codelineno-6-2" name="__codelineno-6-2" href="#__codelineno-6-2"></a>curl<span class="w"> </span>http://localhost:8080/api/instances/<span class="o">{</span>name<span class="o">}</span>/proxy/ <a id="__codelineno-6-2" name="__codelineno-6-2" href="#__codelineno-6-2"></a>curl<span class="w"> </span>http://localhost:8080/api/instances/<span class="o">{</span>name<span class="o">}</span>/proxy/
</code></pre></div> </code></pre></div>
<p>Both backends provide OpenAI-compatible endpoints. Check the respective documentation: <p>All backends provide OpenAI-compatible endpoints. Check the respective documentation:
- <a href="https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md">llama-server docs</a> - <a href="https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md">llama-server docs</a>
- <a href="https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md">MLX-LM docs</a></p> - <a href="https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md">MLX-LM docs</a>
- <a href="https://docs.vllm.ai/en/latest/">vLLM docs</a></p>
<h3 id="instance-health">Instance Health<a class="headerlink" href="#instance-health" title="Permanent link">&para;</a></h3> <h3 id="instance-health">Instance Health<a class="headerlink" href="#instance-health" title="Permanent link">&para;</a></h3>
<h4 id="via-web-ui_6">Via Web UI<a class="headerlink" href="#via-web-ui_6" title="Permanent link">&para;</a></h4> <h4 id="via-web-ui_6">Via Web UI<a class="headerlink" href="#via-web-ui_6" title="Permanent link">&para;</a></h4>
<ol> <ol>
@@ -1426,7 +1444,7 @@
<span class="md-icon" title="Last update"> <span class="md-icon" title="Last update">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg>
</span> </span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 18, 2025</span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">September 21, 2025</span>
</span> </span>