Deployed 514b1b0 to dev with MkDocs 1.6.1 and mike 2.1.3

lordmathis
2025-11-15 00:04:23 +00:00
parent 985351643c
commit b21cc41e42
10 changed files with 941 additions and 181 deletions

View File

@@ -645,6 +645,39 @@
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#system" class="md-nav__link">
<span class="md-ellipsis">
System
</span>
</a>
<nav class="md-nav" aria-label="System">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#get-apiv1config" class="md-nav__link">
<span class="md-ellipsis">
GET /api/v1/config
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#get-apiv1version" class="md-nav__link">
<span class="md-ellipsis">
GET /api/v1/version
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
@@ -792,30 +825,6 @@
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#system" class="md-nav__link">
<span class="md-ellipsis">
System
</span>
</a>
<nav class="md-nav" aria-label="System">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#get-apiv1version" class="md-nav__link">
<span class="md-ellipsis">
GET /api/v1/version
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
@@ -1306,6 +1315,87 @@ Most likely, it is not desirable to edit this file by hand!
<strong>Response <span class="response-code code-400">400</span>&nbsp;<span class="status-phrase">Bad Request</span></strong>
</p>
<h2 id="system"><span class="api-tag">System</span><a class="headerlink" href="#system" title="Permanent link">&para;</a></h2>
<hr class="operation-separator" />
<h3 id="get-apiv1config"><span class="http-get">GET</span> /api/v1/config<a class="headerlink" href="#get-apiv1config" title="Permanent link">&para;</a></h3>
<p>Get server configuration </p>
<details class="note">
<summary>Description</summary>
<p>Returns the current server configuration (sanitized) </p>
</details>
<p><strong>Input parameters</strong> </p>
<table>
<thead>
<tr>
<th>Parameter</th>
<th>In</th>
<th>Type</th>
<th>Default</th>
<th>Nullable</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td class="parameter-name"><code>ApiKeyAuth</code></td>
<td>header</td>
<td>string</td>
<td>N/A</td>
<td>No</td>
<td></td>
</tr>
</tbody>
</table>
<p class="response-title">
<strong>Response <span class="response-code code-200">200</span>&nbsp;<span class="status-phrase">OK</span></strong>
</p>
<p class="response-title">
<strong>Response <span class="response-code code-500">500</span>&nbsp;<span class="status-phrase">Internal Server Error</span></strong>
</p>
<hr class="operation-separator" />
<h3 id="get-apiv1version"><span class="http-get">GET</span> /api/v1/version<a class="headerlink" href="#get-apiv1version" title="Permanent link">&para;</a></h3>
<p>Get llamactl version </p>
<details class="note">
<summary>Description</summary>
<p>Returns the version of the llamactl command </p>
</details>
<p><strong>Input parameters</strong> </p>
<table>
<thead>
<tr>
<th>Parameter</th>
<th>In</th>
<th>Type</th>
<th>Default</th>
<th>Nullable</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td class="parameter-name"><code>ApiKeyAuth</code></td>
<td>header</td>
<td>string</td>
<td>N/A</td>
<td>No</td>
<td></td>
</tr>
</tbody>
</table>
<p class="response-title">
<strong>Response <span class="response-code code-200">200</span>&nbsp;<span class="status-phrase">OK</span></strong>
</p>
<p class="response-title">
<strong>Response <span class="response-code code-500">500</span>&nbsp;<span class="status-phrase">Internal Server Error</span></strong>
</p>
<h2 id="instances"><span class="api-tag">Instances</span><a class="headerlink" href="#instances" title="Permanent link">&para;</a></h2>
<hr class="operation-separator" />
@@ -1999,47 +2089,6 @@ config) </p>
<strong>Response <span class="response-code code-500">500</span>&nbsp;<span class="status-phrase">Internal Server Error</span></strong>
</p>
<h2 id="system"><span class="api-tag">System</span><a class="headerlink" href="#system" title="Permanent link">&para;</a></h2>
<hr class="operation-separator" />
<h3 id="get-apiv1version"><span class="http-get">GET</span> /api/v1/version<a class="headerlink" href="#get-apiv1version" title="Permanent link">&para;</a></h3>
<p>Get llamactl version </p>
<details class="note">
<summary>Description</summary>
<p>Returns the version of the llamactl command </p>
</details>
<p><strong>Input parameters</strong> </p>
<table>
<thead>
<tr>
<th>Parameter</th>
<th>In</th>
<th>Type</th>
<th>Default</th>
<th>Nullable</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td class="parameter-name"><code>ApiKeyAuth</code></td>
<td>header</td>
<td>string</td>
<td>N/A</td>
<td>No</td>
<td></td>
</tr>
</tbody>
</table>
<p class="response-title">
<strong>Response <span class="response-code code-200">200</span>&nbsp;<span class="status-phrase">OK</span></strong>
</p>
<p class="response-title">
<strong>Response <span class="response-code code-500">500</span>&nbsp;<span class="status-phrase">Internal Server Error</span></strong>
</p>
<h2 id="llamacpp"><span class="api-tag">Llama.cpp</span><a class="headerlink" href="#llamacpp" title="Permanent link">&para;</a></h2>
<hr class="operation-separator" />
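The two System endpoints documented above can be exercised directly once this deploy is live. A minimal sketch, assuming the server listens on localhost:8080 and that the management key is sent as a Bearer token, as in the managing-instances examples later in this commit:

# Query the new System endpoints (host, port and auth header are assumptions)
curl -H "Authorization: Bearer <management-key>" \
  http://localhost:8080/api/v1/config

curl -H "Authorization: Bearer <management-key>" \
  http://localhost:8080/api/v1/version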

View File

@@ -256,6 +256,34 @@ const docTemplate = `{
}
}
},
"/api/v1/config": {
"get": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "Returns the current server configuration (sanitized)",
"tags": [
"System"
],
"summary": "Get server configuration",
"responses": {
"200": {
"description": "Sanitized configuration",
"schema": {
"$ref": "#/definitions/config.AppConfig"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"type": "string"
}
}
}
}
},
"/api/v1/instances": {
"get": {
"security": [
@@ -1475,6 +1503,247 @@ const docTemplate = `{
}
},
"definitions": {
"config.AppConfig": {
"type": "object",
"properties": {
"auth": {
"$ref": "#/definitions/config.AuthConfig"
},
"backends": {
"$ref": "#/definitions/config.BackendConfig"
},
"build_time": {
"type": "string"
},
"commit_hash": {
"type": "string"
},
"instances": {
"$ref": "#/definitions/config.InstancesConfig"
},
"local_node": {
"type": "string"
},
"nodes": {
"type": "object",
"additionalProperties": {
"$ref": "#/definitions/config.NodeConfig"
}
},
"server": {
"$ref": "#/definitions/config.ServerConfig"
},
"version": {
"type": "string"
}
}
},
"config.AuthConfig": {
"type": "object",
"properties": {
"inference_keys": {
"description": "List of keys for OpenAI compatible inference endpoints",
"type": "array",
"items": {
"type": "string"
}
},
"management_keys": {
"description": "List of keys for management endpoints",
"type": "array",
"items": {
"type": "string"
}
},
"require_inference_auth": {
"description": "Require authentication for OpenAI compatible inference endpoints",
"type": "boolean"
},
"require_management_auth": {
"description": "Require authentication for management endpoints",
"type": "boolean"
}
}
},
"config.BackendConfig": {
"type": "object",
"properties": {
"llama-cpp": {
"$ref": "#/definitions/config.BackendSettings"
},
"mlx": {
"$ref": "#/definitions/config.BackendSettings"
},
"vllm": {
"$ref": "#/definitions/config.BackendSettings"
}
}
},
"config.BackendSettings": {
"type": "object",
"properties": {
"args": {
"type": "array",
"items": {
"type": "string"
}
},
"command": {
"type": "string"
},
"docker": {
"$ref": "#/definitions/config.DockerSettings"
},
"environment": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"response_headers": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
},
"config.DockerSettings": {
"type": "object",
"properties": {
"args": {
"type": "array",
"items": {
"type": "string"
}
},
"enabled": {
"type": "boolean"
},
"environment": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"image": {
"type": "string"
}
}
},
"config.InstancesConfig": {
"type": "object",
"properties": {
"auto_create_dirs": {
"description": "Automatically create the data directory if it doesn't exist",
"type": "boolean"
},
"configs_dir": {
"description": "Instance config directory override",
"type": "string"
},
"data_dir": {
"description": "Directory where all llamactl data will be stored (instances.json, logs, etc.)",
"type": "string"
},
"default_auto_restart": {
"description": "Default auto-restart setting for new instances",
"type": "boolean"
},
"default_max_restarts": {
"description": "Default max restarts for new instances",
"type": "integer"
},
"default_on_demand_start": {
"description": "Default on-demand start setting for new instances",
"type": "boolean"
},
"default_restart_delay": {
"description": "Default restart delay for new instances (in seconds)",
"type": "integer"
},
"enable_lru_eviction": {
"description": "Enable LRU eviction for instance logs",
"type": "boolean"
},
"logs_dir": {
"description": "Logs directory override",
"type": "string"
},
"max_instances": {
"description": "Maximum number of instances that can be created",
"type": "integer"
},
"max_running_instances": {
"description": "Maximum number of instances that can be running at the same time",
"type": "integer"
},
"on_demand_start_timeout": {
"description": "How long to wait for an instance to start on demand (in seconds)",
"type": "integer"
},
"port_range": {
"description": "Port range for instances (e.g., 8000,9000)",
"type": "array",
"items": {
"type": "integer"
}
},
"timeout_check_interval": {
"description": "Interval for checking instance timeouts (in minutes)",
"type": "integer"
}
}
},
"config.NodeConfig": {
"type": "object",
"properties": {
"address": {
"type": "string"
},
"api_key": {
"type": "string"
}
}
},
"config.ServerConfig": {
"type": "object",
"properties": {
"allowed_headers": {
"description": "Allowed headers for CORS (e.g., \"Accept\", \"Authorization\", \"Content-Type\", \"X-CSRF-Token\")",
"type": "array",
"items": {
"type": "string"
}
},
"allowed_origins": {
"description": "Allowed origins for CORS (e.g., \"http://localhost:3000\")",
"type": "array",
"items": {
"type": "string"
}
},
"enable_swagger": {
"description": "Enable Swagger UI for API documentation",
"type": "boolean"
},
"host": {
"description": "Server host to bind to",
"type": "string"
},
"port": {
"description": "Server port to bind to",
"type": "integer"
},
"response_headers": {
"description": "Response headers to send with responses",
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
},
"instance.Instance": {
"type": "object",
"properties": {
@@ -1494,6 +1763,13 @@ const docTemplate = `{
"description": "Auto restart",
"type": "boolean"
},
"command_override": {
"type": "string"
},
"docker_enabled": {
"description": "Execution context overrides",
"type": "boolean"
},
"environment": {
"description": "Environment variables",
"type": "object",

View File

@@ -789,34 +789,36 @@
<p><img alt="Create Instance Screenshot" src="../images/create_instance.png" /> </p>
<ol>
<li>Click the <strong>"Create Instance"</strong> button on the dashboard </li>
<li><em>Optional</em>: Click <strong>"Import"</strong> in the dialog header to load a previously exported configuration </li>
<li>Enter a unique <strong>Name</strong> for your instance (only required field) </li>
<li><strong>Select Target Node</strong>: Choose which node to deploy the instance to from the dropdown </li>
<li><strong>Choose Backend Type</strong>: <ul>
<li><strong>llama.cpp</strong>: For GGUF models using llama-server </li>
<li><strong>MLX</strong>: For MLX-optimized models (macOS only) </li>
<li><em>Optional</em>: Click <strong>"Import"</strong> to load a previously exported configuration </li>
</ol>
<p><strong>Instance Settings:</strong> </p>
<ol>
<li>Enter a unique <strong>Instance Name</strong> (required) </li>
<li><strong>Select Node</strong>: Choose which node to deploy the instance to </li>
<li>Configure <strong>Auto Restart</strong> settings: <ul>
<li>Enable automatic restart on failure </li>
<li>Set max restarts and delay between attempts </li>
</ul>
</li>
<li>Configure basic instance options: <ul>
<li><strong>Idle Timeout</strong>: Minutes before stopping idle instance </li>
<li><strong>On Demand Start</strong>: Start instance only when needed </li>
</ul>
</li>
</ol>
<p><strong>Backend Configuration:</strong> </p>
<ol>
<li><strong>Select Backend Type</strong>: <ul>
<li><strong>Llama Server</strong>: For GGUF models using llama-server </li>
<li><strong>MLX LM</strong>: For MLX-optimized models (macOS only) </li>
<li><strong>vLLM</strong>: For distributed serving and high-throughput inference </li>
</ul>
</li>
<li>Configure model source: <ul>
<li><strong>For llama.cpp</strong>: GGUF model path or HuggingFace repo </li>
<li><strong>For MLX</strong>: MLX model path or identifier (e.g., <code>mlx-community/Mistral-7B-Instruct-v0.3-4bit</code>) </li>
<li><strong>For vLLM</strong>: HuggingFace model identifier (e.g., <code>microsoft/DialoGPT-medium</code>) </li>
</ul>
</li>
<li>Configure optional instance management settings: <ul>
<li><strong>Auto Restart</strong>: Automatically restart instance on failure </li>
<li><strong>Max Restarts</strong>: Maximum number of restart attempts </li>
<li><strong>Restart Delay</strong>: Delay in seconds between restart attempts </li>
<li><strong>On Demand Start</strong>: Start instance when receiving a request to the OpenAI compatible endpoint </li>
<li><strong>Idle Timeout</strong>: Minutes before stopping idle instance (set to 0 to disable) </li>
<li><strong>Environment Variables</strong>: Set custom environment variables for the instance process </li>
</ul>
</li>
<li>Configure backend-specific options: <ul>
<li><strong>llama.cpp</strong>: Threads, context size, GPU layers, port, etc. </li>
<li><strong>MLX</strong>: Temperature, top-p, adapter path, Python environment, etc. </li>
<li><strong>vLLM</strong>: Tensor parallel size, GPU memory utilization, quantization, etc. </li>
<li><em>Optional</em>: Click <strong>"Parse Command"</strong> to import settings from an existing backend command </li>
<li>Configure <strong>Execution Context</strong>: <ul>
<li><strong>Enable Docker</strong>: Run backend in Docker container </li>
<li><strong>Command Override</strong>: Custom path to backend executable </li>
<li><strong>Environment Variables</strong>: Custom environment variables </li>
</ul>
</li>
</ol>
@@ -825,6 +827,14 @@
<p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values. </p>
</div>
<ol>
<li>Configure <strong>Basic Backend Options</strong> (varies by backend): <ul>
<li><strong>llama.cpp</strong>: Model path, threads, context size, GPU layers, etc. </li>
<li><strong>MLX</strong>: Model identifier, temperature, max tokens, etc. </li>
<li><strong>vLLM</strong>: Model identifier, tensor parallel size, GPU memory utilization, etc. </li>
</ul>
</li>
<li><em>Optional</em>: Expand <strong>Advanced Backend Options</strong> for additional settings </li>
<li><em>Optional</em>: Add <strong>Extra Args</strong> as key-value pairs for custom command-line arguments </li>
<li>Click <strong>"Create"</strong> to save the instance </li>
</ol>
<p><strong>Via API</strong> </p>
@@ -838,88 +848,47 @@
<a id="__codelineno-0-8" name="__codelineno-0-8" href="#__codelineno-0-8"></a><span class="s1"> &quot;model&quot;: &quot;/path/to/model.gguf&quot;,</span>
<a id="__codelineno-0-9" name="__codelineno-0-9" href="#__codelineno-0-9"></a><span class="s1"> &quot;threads&quot;: 8,</span>
<a id="__codelineno-0-10" name="__codelineno-0-10" href="#__codelineno-0-10"></a><span class="s1"> &quot;ctx_size&quot;: 4096,</span>
<a id="__codelineno-0-11" name="__codelineno-0-11" href="#__codelineno-0-11"></a><span class="s1"> &quot;gpu_layers&quot;: 32</span>
<a id="__codelineno-0-12" name="__codelineno-0-12" href="#__codelineno-0-12"></a><span class="s1"> },</span>
<a id="__codelineno-0-13" name="__codelineno-0-13" href="#__codelineno-0-13"></a><span class="s1"> &quot;nodes&quot;: [&quot;main&quot;]</span>
<a id="__codelineno-0-14" name="__codelineno-0-14" href="#__codelineno-0-14"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-15" name="__codelineno-0-15" href="#__codelineno-0-15"></a>
<a id="__codelineno-0-16" name="__codelineno-0-16" href="#__codelineno-0-16"></a><span class="c1"># Create MLX instance (macOS only)</span>
<a id="__codelineno-0-17" name="__codelineno-0-17" href="#__codelineno-0-17"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/v1/instances/my-mlx-instance<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-18" name="__codelineno-0-18" href="#__codelineno-0-18"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-19" name="__codelineno-0-19" href="#__codelineno-0-19"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer &lt;token&gt;&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-20" name="__codelineno-0-20" href="#__codelineno-0-20"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-21" name="__codelineno-0-21" href="#__codelineno-0-21"></a><span class="s1"> &quot;backend_type&quot;: &quot;mlx_lm&quot;,</span>
<a id="__codelineno-0-22" name="__codelineno-0-22" href="#__codelineno-0-22"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-23" name="__codelineno-0-23" href="#__codelineno-0-23"></a><span class="s1"> &quot;model&quot;: &quot;mlx-community/Mistral-7B-Instruct-v0.3-4bit&quot;,</span>
<a id="__codelineno-0-24" name="__codelineno-0-24" href="#__codelineno-0-24"></a><span class="s1"> &quot;temp&quot;: 0.7,</span>
<a id="__codelineno-0-25" name="__codelineno-0-25" href="#__codelineno-0-25"></a><span class="s1"> &quot;top_p&quot;: 0.9,</span>
<a id="__codelineno-0-26" name="__codelineno-0-26" href="#__codelineno-0-26"></a><span class="s1"> &quot;max_tokens&quot;: 2048</span>
<a id="__codelineno-0-27" name="__codelineno-0-27" href="#__codelineno-0-27"></a><span class="s1"> },</span>
<a id="__codelineno-0-28" name="__codelineno-0-28" href="#__codelineno-0-28"></a><span class="s1"> &quot;auto_restart&quot;: true,</span>
<a id="__codelineno-0-29" name="__codelineno-0-29" href="#__codelineno-0-29"></a><span class="s1"> &quot;max_restarts&quot;: 3,</span>
<a id="__codelineno-0-30" name="__codelineno-0-30" href="#__codelineno-0-30"></a><span class="s1"> &quot;nodes&quot;: [&quot;main&quot;]</span>
<a id="__codelineno-0-31" name="__codelineno-0-31" href="#__codelineno-0-31"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-32" name="__codelineno-0-32" href="#__codelineno-0-32"></a>
<a id="__codelineno-0-33" name="__codelineno-0-33" href="#__codelineno-0-33"></a><span class="c1"># Create vLLM instance</span>
<a id="__codelineno-0-34" name="__codelineno-0-34" href="#__codelineno-0-34"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/v1/instances/my-vllm-instance<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-35" name="__codelineno-0-35" href="#__codelineno-0-35"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-36" name="__codelineno-0-36" href="#__codelineno-0-36"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer &lt;token&gt;&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-37" name="__codelineno-0-37" href="#__codelineno-0-37"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-38" name="__codelineno-0-38" href="#__codelineno-0-38"></a><span class="s1"> &quot;backend_type&quot;: &quot;vllm&quot;,</span>
<a id="__codelineno-0-39" name="__codelineno-0-39" href="#__codelineno-0-39"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-40" name="__codelineno-0-40" href="#__codelineno-0-40"></a><span class="s1"> &quot;model&quot;: &quot;microsoft/DialoGPT-medium&quot;,</span>
<a id="__codelineno-0-41" name="__codelineno-0-41" href="#__codelineno-0-41"></a><span class="s1"> &quot;tensor_parallel_size&quot;: 2,</span>
<a id="__codelineno-0-42" name="__codelineno-0-42" href="#__codelineno-0-42"></a><span class="s1"> &quot;gpu_memory_utilization&quot;: 0.9</span>
<a id="__codelineno-0-43" name="__codelineno-0-43" href="#__codelineno-0-43"></a><span class="s1"> },</span>
<a id="__codelineno-0-44" name="__codelineno-0-44" href="#__codelineno-0-44"></a><span class="s1"> &quot;auto_restart&quot;: true,</span>
<a id="__codelineno-0-45" name="__codelineno-0-45" href="#__codelineno-0-45"></a><span class="s1"> &quot;on_demand_start&quot;: true,</span>
<a id="__codelineno-0-46" name="__codelineno-0-46" href="#__codelineno-0-46"></a><span class="s1"> &quot;environment&quot;: {</span>
<a id="__codelineno-0-47" name="__codelineno-0-47" href="#__codelineno-0-47"></a><span class="s1"> &quot;CUDA_VISIBLE_DEVICES&quot;: &quot;0,1&quot;,</span>
<a id="__codelineno-0-48" name="__codelineno-0-48" href="#__codelineno-0-48"></a><span class="s1"> &quot;NCCL_DEBUG&quot;: &quot;INFO&quot;,</span>
<a id="__codelineno-0-49" name="__codelineno-0-49" href="#__codelineno-0-49"></a><span class="s1"> &quot;PYTHONPATH&quot;: &quot;/custom/path&quot;</span>
<a id="__codelineno-0-50" name="__codelineno-0-50" href="#__codelineno-0-50"></a><span class="s1"> },</span>
<a id="__codelineno-0-51" name="__codelineno-0-51" href="#__codelineno-0-51"></a><span class="s1"> &quot;nodes&quot;: [&quot;main&quot;]</span>
<a id="__codelineno-0-52" name="__codelineno-0-52" href="#__codelineno-0-52"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-53" name="__codelineno-0-53" href="#__codelineno-0-53"></a>
<a id="__codelineno-0-54" name="__codelineno-0-54" href="#__codelineno-0-54"></a><span class="c1"># Create llama.cpp instance with HuggingFace model</span>
<a id="__codelineno-0-55" name="__codelineno-0-55" href="#__codelineno-0-55"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/v1/instances/gemma-3-27b<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-56" name="__codelineno-0-56" href="#__codelineno-0-56"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-57" name="__codelineno-0-57" href="#__codelineno-0-57"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer &lt;token&gt;&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-58" name="__codelineno-0-58" href="#__codelineno-0-58"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-59" name="__codelineno-0-59" href="#__codelineno-0-59"></a><span class="s1"> &quot;backend_type&quot;: &quot;llama_cpp&quot;,</span>
<a id="__codelineno-0-60" name="__codelineno-0-60" href="#__codelineno-0-60"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-61" name="__codelineno-0-61" href="#__codelineno-0-61"></a><span class="s1"> &quot;hf_repo&quot;: &quot;unsloth/gemma-3-27b-it-GGUF&quot;,</span>
<a id="__codelineno-0-62" name="__codelineno-0-62" href="#__codelineno-0-62"></a><span class="s1"> &quot;hf_file&quot;: &quot;gemma-3-27b-it-GGUF.gguf&quot;,</span>
<a id="__codelineno-0-63" name="__codelineno-0-63" href="#__codelineno-0-63"></a><span class="s1"> &quot;gpu_layers&quot;: 32</span>
<a id="__codelineno-0-64" name="__codelineno-0-64" href="#__codelineno-0-64"></a><span class="s1"> },</span>
<a id="__codelineno-0-65" name="__codelineno-0-65" href="#__codelineno-0-65"></a><span class="s1"> &quot;nodes&quot;: [&quot;main&quot;]</span>
<a id="__codelineno-0-66" name="__codelineno-0-66" href="#__codelineno-0-66"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-67" name="__codelineno-0-67" href="#__codelineno-0-67"></a>
<a id="__codelineno-0-68" name="__codelineno-0-68" href="#__codelineno-0-68"></a><span class="c1"># Create instance on specific remote node</span>
<a id="__codelineno-0-69" name="__codelineno-0-69" href="#__codelineno-0-69"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/v1/instances/remote-llama<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-70" name="__codelineno-0-70" href="#__codelineno-0-70"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-71" name="__codelineno-0-71" href="#__codelineno-0-71"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer &lt;token&gt;&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-72" name="__codelineno-0-72" href="#__codelineno-0-72"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-73" name="__codelineno-0-73" href="#__codelineno-0-73"></a><span class="s1"> &quot;backend_type&quot;: &quot;llama_cpp&quot;,</span>
<a id="__codelineno-0-74" name="__codelineno-0-74" href="#__codelineno-0-74"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-75" name="__codelineno-0-75" href="#__codelineno-0-75"></a><span class="s1"> &quot;model&quot;: &quot;/models/llama-7b.gguf&quot;,</span>
<a id="__codelineno-0-76" name="__codelineno-0-76" href="#__codelineno-0-76"></a><span class="s1"> &quot;gpu_layers&quot;: 32</span>
<a id="__codelineno-0-77" name="__codelineno-0-77" href="#__codelineno-0-77"></a><span class="s1"> },</span>
<a id="__codelineno-0-78" name="__codelineno-0-78" href="#__codelineno-0-78"></a><span class="s1"> &quot;nodes&quot;: [&quot;worker1&quot;]</span>
<a id="__codelineno-0-79" name="__codelineno-0-79" href="#__codelineno-0-79"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-80" name="__codelineno-0-80" href="#__codelineno-0-80"></a>
<a id="__codelineno-0-81" name="__codelineno-0-81" href="#__codelineno-0-81"></a><span class="c1"># Create instance on multiple nodes for high availability</span>
<a id="__codelineno-0-82" name="__codelineno-0-82" href="#__codelineno-0-82"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/v1/instances/multi-node-llama<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-83" name="__codelineno-0-83" href="#__codelineno-0-83"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-84" name="__codelineno-0-84" href="#__codelineno-0-84"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer &lt;token&gt;&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-85" name="__codelineno-0-85" href="#__codelineno-0-85"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-86" name="__codelineno-0-86" href="#__codelineno-0-86"></a><span class="s1"> &quot;backend_type&quot;: &quot;llama_cpp&quot;,</span>
<a id="__codelineno-0-87" name="__codelineno-0-87" href="#__codelineno-0-87"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-88" name="__codelineno-0-88" href="#__codelineno-0-88"></a><span class="s1"> &quot;model&quot;: &quot;/models/llama-7b.gguf&quot;,</span>
<a id="__codelineno-0-89" name="__codelineno-0-89" href="#__codelineno-0-89"></a><span class="s1"> &quot;gpu_layers&quot;: 32</span>
<a id="__codelineno-0-90" name="__codelineno-0-90" href="#__codelineno-0-90"></a><span class="s1"> },</span>
<a id="__codelineno-0-91" name="__codelineno-0-91" href="#__codelineno-0-91"></a><span class="s1"> &quot;nodes&quot;: [&quot;worker1&quot;, &quot;worker2&quot;, &quot;worker3&quot;]</span>
<a id="__codelineno-0-92" name="__codelineno-0-92" href="#__codelineno-0-92"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-11" name="__codelineno-0-11" href="#__codelineno-0-11"></a><span class="s1"> &quot;gpu_layers&quot;: 32,</span>
<a id="__codelineno-0-12" name="__codelineno-0-12" href="#__codelineno-0-12"></a><span class="s1"> &quot;flash_attn&quot;: &quot;on&quot;</span>
<a id="__codelineno-0-13" name="__codelineno-0-13" href="#__codelineno-0-13"></a><span class="s1"> },</span>
<a id="__codelineno-0-14" name="__codelineno-0-14" href="#__codelineno-0-14"></a><span class="s1"> &quot;auto_restart&quot;: true,</span>
<a id="__codelineno-0-15" name="__codelineno-0-15" href="#__codelineno-0-15"></a><span class="s1"> &quot;max_restarts&quot;: 3,</span>
<a id="__codelineno-0-16" name="__codelineno-0-16" href="#__codelineno-0-16"></a><span class="s1"> &quot;docker_enabled&quot;: false,</span>
<a id="__codelineno-0-17" name="__codelineno-0-17" href="#__codelineno-0-17"></a><span class="s1"> &quot;command_override&quot;: &quot;/opt/llama-server-dev&quot;,</span>
<a id="__codelineno-0-18" name="__codelineno-0-18" href="#__codelineno-0-18"></a><span class="s1"> &quot;nodes&quot;: [&quot;main&quot;]</span>
<a id="__codelineno-0-19" name="__codelineno-0-19" href="#__codelineno-0-19"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-20" name="__codelineno-0-20" href="#__codelineno-0-20"></a>
<a id="__codelineno-0-21" name="__codelineno-0-21" href="#__codelineno-0-21"></a><span class="c1"># Create vLLM instance with environment variables</span>
<a id="__codelineno-0-22" name="__codelineno-0-22" href="#__codelineno-0-22"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/v1/instances/my-vllm-instance<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-23" name="__codelineno-0-23" href="#__codelineno-0-23"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-24" name="__codelineno-0-24" href="#__codelineno-0-24"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer &lt;token&gt;&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-25" name="__codelineno-0-25" href="#__codelineno-0-25"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-26" name="__codelineno-0-26" href="#__codelineno-0-26"></a><span class="s1"> &quot;backend_type&quot;: &quot;vllm&quot;,</span>
<a id="__codelineno-0-27" name="__codelineno-0-27" href="#__codelineno-0-27"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-28" name="__codelineno-0-28" href="#__codelineno-0-28"></a><span class="s1"> &quot;model&quot;: &quot;microsoft/DialoGPT-medium&quot;,</span>
<a id="__codelineno-0-29" name="__codelineno-0-29" href="#__codelineno-0-29"></a><span class="s1"> &quot;tensor_parallel_size&quot;: 2,</span>
<a id="__codelineno-0-30" name="__codelineno-0-30" href="#__codelineno-0-30"></a><span class="s1"> &quot;gpu_memory_utilization&quot;: 0.9</span>
<a id="__codelineno-0-31" name="__codelineno-0-31" href="#__codelineno-0-31"></a><span class="s1"> },</span>
<a id="__codelineno-0-32" name="__codelineno-0-32" href="#__codelineno-0-32"></a><span class="s1"> &quot;on_demand_start&quot;: true,</span>
<a id="__codelineno-0-33" name="__codelineno-0-33" href="#__codelineno-0-33"></a><span class="s1"> &quot;environment&quot;: {</span>
<a id="__codelineno-0-34" name="__codelineno-0-34" href="#__codelineno-0-34"></a><span class="s1"> &quot;CUDA_VISIBLE_DEVICES&quot;: &quot;0,1&quot;</span>
<a id="__codelineno-0-35" name="__codelineno-0-35" href="#__codelineno-0-35"></a><span class="s1"> },</span>
<a id="__codelineno-0-36" name="__codelineno-0-36" href="#__codelineno-0-36"></a><span class="s1"> &quot;nodes&quot;: [&quot;worker1&quot;, &quot;worker2&quot;]</span>
<a id="__codelineno-0-37" name="__codelineno-0-37" href="#__codelineno-0-37"></a><span class="s1"> }&#39;</span>
<a id="__codelineno-0-38" name="__codelineno-0-38" href="#__codelineno-0-38"></a>
<a id="__codelineno-0-39" name="__codelineno-0-39" href="#__codelineno-0-39"></a><span class="c1"># Create MLX instance (macOS only)</span>
<a id="__codelineno-0-40" name="__codelineno-0-40" href="#__codelineno-0-40"></a>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://localhost:8080/api/v1/instances/my-mlx-instance<span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-41" name="__codelineno-0-41" href="#__codelineno-0-41"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-42" name="__codelineno-0-42" href="#__codelineno-0-42"></a><span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Authorization: Bearer &lt;token&gt;&quot;</span><span class="w"> </span><span class="se">\</span>
<a id="__codelineno-0-43" name="__codelineno-0-43" href="#__codelineno-0-43"></a><span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<a id="__codelineno-0-44" name="__codelineno-0-44" href="#__codelineno-0-44"></a><span class="s1"> &quot;backend_type&quot;: &quot;mlx_lm&quot;,</span>
<a id="__codelineno-0-45" name="__codelineno-0-45" href="#__codelineno-0-45"></a><span class="s1"> &quot;backend_options&quot;: {</span>
<a id="__codelineno-0-46" name="__codelineno-0-46" href="#__codelineno-0-46"></a><span class="s1"> &quot;model&quot;: &quot;mlx-community/Mistral-7B-Instruct-v0.3-4bit&quot;,</span>
<a id="__codelineno-0-47" name="__codelineno-0-47" href="#__codelineno-0-47"></a><span class="s1"> &quot;temp&quot;: 0.7,</span>
<a id="__codelineno-0-48" name="__codelineno-0-48" href="#__codelineno-0-48"></a><span class="s1"> &quot;max_tokens&quot;: 2048</span>
<a id="__codelineno-0-49" name="__codelineno-0-49" href="#__codelineno-0-49"></a><span class="s1"> },</span>
<a id="__codelineno-0-50" name="__codelineno-0-50" href="#__codelineno-0-50"></a><span class="s1"> &quot;nodes&quot;: [&quot;main&quot;]</span>
<a id="__codelineno-0-51" name="__codelineno-0-51" href="#__codelineno-0-51"></a><span class="s1"> }&#39;</span>
</code></pre></div>
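To confirm the instances created above were registered, the list endpoint from the same API can be queried; a short sketch under the same host and auth assumptions as the examples above:

# List all instances managed by the server
curl -H "Authorization: Bearer <token>" \
  http://localhost:8080/api/v1/instances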
<h2 id="start-instance">Start Instance<a class="headerlink" href="#start-instance" title="Permanent link">&para;</a></h2>
<p><strong>Via Web UI</strong><br />
@@ -1026,7 +995,7 @@ Check instance status in real-time: </p>
<span class="md-icon" title="Last update">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1zM12.5 7v5.2l4 2.4-1 1L11 13V7zM11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2z"/></svg>
</span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="October 27, 2025 19:44:28 UTC">October 27, 2025</span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="November 14, 2025 23:18:55 UTC">November 14, 2025</span>
</span>

File diff suppressed because one or more lines are too long

View File

@@ -2,30 +2,30 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://llamactl.org/dev/</loc>
<lastmod>2025-11-13</lastmod>
<lastmod>2025-11-15</lastmod>
</url>
<url>
<loc>https://llamactl.org/dev/api-reference/</loc>
<lastmod>2025-11-13</lastmod>
<lastmod>2025-11-15</lastmod>
</url>
<url>
<loc>https://llamactl.org/dev/configuration/</loc>
<lastmod>2025-11-13</lastmod>
<lastmod>2025-11-15</lastmod>
</url>
<url>
<loc>https://llamactl.org/dev/installation/</loc>
<lastmod>2025-11-13</lastmod>
<lastmod>2025-11-15</lastmod>
</url>
<url>
<loc>https://llamactl.org/dev/managing-instances/</loc>
<lastmod>2025-11-13</lastmod>
<lastmod>2025-11-15</lastmod>
</url>
<url>
<loc>https://llamactl.org/dev/quick-start/</loc>
<lastmod>2025-11-13</lastmod>
<lastmod>2025-11-15</lastmod>
</url>
<url>
<loc>https://llamactl.org/dev/troubleshooting/</loc>
<lastmod>2025-11-13</lastmod>
<lastmod>2025-11-15</lastmod>
</url>
</urlset>

Binary file not shown.

View File

@@ -249,6 +249,34 @@
}
}
},
"/api/v1/config": {
"get": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "Returns the current server configuration (sanitized)",
"tags": [
"System"
],
"summary": "Get server configuration",
"responses": {
"200": {
"description": "Sanitized configuration",
"schema": {
"$ref": "#/definitions/config.AppConfig"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"type": "string"
}
}
}
}
},
"/api/v1/instances": {
"get": {
"security": [
@@ -1468,6 +1496,247 @@
}
},
"definitions": {
"config.AppConfig": {
"type": "object",
"properties": {
"auth": {
"$ref": "#/definitions/config.AuthConfig"
},
"backends": {
"$ref": "#/definitions/config.BackendConfig"
},
"build_time": {
"type": "string"
},
"commit_hash": {
"type": "string"
},
"instances": {
"$ref": "#/definitions/config.InstancesConfig"
},
"local_node": {
"type": "string"
},
"nodes": {
"type": "object",
"additionalProperties": {
"$ref": "#/definitions/config.NodeConfig"
}
},
"server": {
"$ref": "#/definitions/config.ServerConfig"
},
"version": {
"type": "string"
}
}
},
"config.AuthConfig": {
"type": "object",
"properties": {
"inference_keys": {
"description": "List of keys for OpenAI compatible inference endpoints",
"type": "array",
"items": {
"type": "string"
}
},
"management_keys": {
"description": "List of keys for management endpoints",
"type": "array",
"items": {
"type": "string"
}
},
"require_inference_auth": {
"description": "Require authentication for OpenAI compatible inference endpoints",
"type": "boolean"
},
"require_management_auth": {
"description": "Require authentication for management endpoints",
"type": "boolean"
}
}
},
"config.BackendConfig": {
"type": "object",
"properties": {
"llama-cpp": {
"$ref": "#/definitions/config.BackendSettings"
},
"mlx": {
"$ref": "#/definitions/config.BackendSettings"
},
"vllm": {
"$ref": "#/definitions/config.BackendSettings"
}
}
},
"config.BackendSettings": {
"type": "object",
"properties": {
"args": {
"type": "array",
"items": {
"type": "string"
}
},
"command": {
"type": "string"
},
"docker": {
"$ref": "#/definitions/config.DockerSettings"
},
"environment": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"response_headers": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
},
"config.DockerSettings": {
"type": "object",
"properties": {
"args": {
"type": "array",
"items": {
"type": "string"
}
},
"enabled": {
"type": "boolean"
},
"environment": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"image": {
"type": "string"
}
}
},
"config.InstancesConfig": {
"type": "object",
"properties": {
"auto_create_dirs": {
"description": "Automatically create the data directory if it doesn't exist",
"type": "boolean"
},
"configs_dir": {
"description": "Instance config directory override",
"type": "string"
},
"data_dir": {
"description": "Directory where all llamactl data will be stored (instances.json, logs, etc.)",
"type": "string"
},
"default_auto_restart": {
"description": "Default auto-restart setting for new instances",
"type": "boolean"
},
"default_max_restarts": {
"description": "Default max restarts for new instances",
"type": "integer"
},
"default_on_demand_start": {
"description": "Default on-demand start setting for new instances",
"type": "boolean"
},
"default_restart_delay": {
"description": "Default restart delay for new instances (in seconds)",
"type": "integer"
},
"enable_lru_eviction": {
"description": "Enable LRU eviction for instance logs",
"type": "boolean"
},
"logs_dir": {
"description": "Logs directory override",
"type": "string"
},
"max_instances": {
"description": "Maximum number of instances that can be created",
"type": "integer"
},
"max_running_instances": {
"description": "Maximum number of instances that can be running at the same time",
"type": "integer"
},
"on_demand_start_timeout": {
"description": "How long to wait for an instance to start on demand (in seconds)",
"type": "integer"
},
"port_range": {
"description": "Port range for instances (e.g., 8000,9000)",
"type": "array",
"items": {
"type": "integer"
}
},
"timeout_check_interval": {
"description": "Interval for checking instance timeouts (in minutes)",
"type": "integer"
}
}
},
"config.NodeConfig": {
"type": "object",
"properties": {
"address": {
"type": "string"
},
"api_key": {
"type": "string"
}
}
},
"config.ServerConfig": {
"type": "object",
"properties": {
"allowed_headers": {
"description": "Allowed headers for CORS (e.g., \"Accept\", \"Authorization\", \"Content-Type\", \"X-CSRF-Token\")",
"type": "array",
"items": {
"type": "string"
}
},
"allowed_origins": {
"description": "Allowed origins for CORS (e.g., \"http://localhost:3000\")",
"type": "array",
"items": {
"type": "string"
}
},
"enable_swagger": {
"description": "Enable Swagger UI for API documentation",
"type": "boolean"
},
"host": {
"description": "Server host to bind to",
"type": "string"
},
"port": {
"description": "Server port to bind to",
"type": "integer"
},
"response_headers": {
"description": "Response headers to send with responses",
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
},
"instance.Instance": {
"type": "object",
"properties": {
@@ -1487,6 +1756,13 @@
"description": "Auto restart",
"type": "boolean"
},
"command_override": {
"type": "string"
},
"docker_enabled": {
"description": "Execution context overrides",
"type": "boolean"
},
"environment": {
"description": "Environment variables",
"type": "object",

View File

@@ -1,5 +1,173 @@
basePath: /api/v1
definitions:
config.AppConfig:
properties:
auth:
$ref: '#/definitions/config.AuthConfig'
backends:
$ref: '#/definitions/config.BackendConfig'
build_time:
type: string
commit_hash:
type: string
instances:
$ref: '#/definitions/config.InstancesConfig'
local_node:
type: string
nodes:
additionalProperties:
$ref: '#/definitions/config.NodeConfig'
type: object
server:
$ref: '#/definitions/config.ServerConfig'
version:
type: string
type: object
config.AuthConfig:
properties:
inference_keys:
description: List of keys for OpenAI compatible inference endpoints
items:
type: string
type: array
management_keys:
description: List of keys for management endpoints
items:
type: string
type: array
require_inference_auth:
description: Require authentication for OpenAI compatible inference endpoints
type: boolean
require_management_auth:
description: Require authentication for management endpoints
type: boolean
type: object
config.BackendConfig:
properties:
llama-cpp:
$ref: '#/definitions/config.BackendSettings'
mlx:
$ref: '#/definitions/config.BackendSettings'
vllm:
$ref: '#/definitions/config.BackendSettings'
type: object
config.BackendSettings:
properties:
args:
items:
type: string
type: array
command:
type: string
docker:
$ref: '#/definitions/config.DockerSettings'
environment:
additionalProperties:
type: string
type: object
response_headers:
additionalProperties:
type: string
type: object
type: object
config.DockerSettings:
properties:
args:
items:
type: string
type: array
enabled:
type: boolean
environment:
additionalProperties:
type: string
type: object
image:
type: string
type: object
config.InstancesConfig:
properties:
auto_create_dirs:
description: Automatically create the data directory if it doesn't exist
type: boolean
configs_dir:
description: Instance config directory override
type: string
data_dir:
description: Directory where all llamactl data will be stored (instances.json,
logs, etc.)
type: string
default_auto_restart:
description: Default auto-restart setting for new instances
type: boolean
default_max_restarts:
description: Default max restarts for new instances
type: integer
default_on_demand_start:
description: Default on-demand start setting for new instances
type: boolean
default_restart_delay:
description: Default restart delay for new instances (in seconds)
type: integer
enable_lru_eviction:
description: Enable LRU eviction for instance logs
type: boolean
logs_dir:
description: Logs directory override
type: string
max_instances:
description: Maximum number of instances that can be created
type: integer
max_running_instances:
description: Maximum number of instances that can be running at the same time
type: integer
on_demand_start_timeout:
description: How long to wait for an instance to start on demand (in seconds)
type: integer
port_range:
description: Port range for instances (e.g., 8000,9000)
items:
type: integer
type: array
timeout_check_interval:
description: Interval for checking instance timeouts (in minutes)
type: integer
type: object
config.NodeConfig:
properties:
address:
type: string
api_key:
type: string
type: object
config.ServerConfig:
properties:
allowed_headers:
description: Allowed headers for CORS (e.g., "Accept", "Authorization", "Content-Type",
"X-CSRF-Token")
items:
type: string
type: array
allowed_origins:
description: Allowed origins for CORS (e.g., "http://localhost:3000")
items:
type: string
type: array
enable_swagger:
description: Enable Swagger UI for API documentation
type: boolean
host:
description: Server host to bind to
type: string
port:
description: Server port to bind to
type: integer
response_headers:
additionalProperties:
type: string
description: Response headers to send with responses
type: object
type: object
instance.Instance:
properties:
created:
@@ -13,6 +181,11 @@ definitions:
auto_restart:
description: Auto restart
type: boolean
command_override:
type: string
docker_enabled:
description: Execution context overrides
type: boolean
environment:
additionalProperties:
type: string
@@ -216,6 +389,23 @@ paths:
summary: Parse vllm serve command
tags:
- Backends
/api/v1/config:
get:
description: Returns the current server configuration (sanitized)
responses:
"200":
description: Sanitized configuration
schema:
$ref: '#/definitions/config.AppConfig'
"500":
description: Internal Server Error
schema:
type: string
security:
- ApiKeyAuth: []
summary: Get server configuration
tags:
- System
/api/v1/instances:
get:
description: Returns a list of all instances managed by the server