mirror of https://github.com/lordmathis/llamactl.git
synced 2025-12-23 01:24:24 +00:00

Deployed 514b1b0 to dev with MkDocs 1.6.1 and mike 2.1.3
Binary file not shown.
Binary file not shown.
@@ -645,6 +645,39 @@
</ul>
</nav>

</li>

<li class="md-nav__item">
<a href="#system" class="md-nav__link">
<span class="md-ellipsis">
System
</span>
</a>

<nav class="md-nav" aria-label="System">
<ul class="md-nav__list">

<li class="md-nav__item">
<a href="#get-apiv1config" class="md-nav__link">
<span class="md-ellipsis">
GET /api/v1/config
</span>
</a>

</li>

<li class="md-nav__item">
<a href="#get-apiv1version" class="md-nav__link">
<span class="md-ellipsis">
GET /api/v1/version
</span>
</a>

</li>

</ul>
</nav>

</li>

<li class="md-nav__item">
@@ -792,30 +825,6 @@
</ul>
</nav>

</li>

<li class="md-nav__item">
<a href="#system" class="md-nav__link">
<span class="md-ellipsis">
System
</span>
</a>

<nav class="md-nav" aria-label="System">
<ul class="md-nav__list">

<li class="md-nav__item">
<a href="#get-apiv1version" class="md-nav__link">
<span class="md-ellipsis">
GET /api/v1/version
</span>
</a>

</li>

</ul>
</nav>

</li>

<li class="md-nav__item">
@@ -1306,6 +1315,87 @@ Most likely, it is not desirable to edit this file by hand!
<strong>Response <span class="response-code code-400">400</span> <span class="status-phrase">Bad Request</span></strong>
</p>

<h2 id="system"><span class="api-tag">System</span><a class="headerlink" href="#system" title="Permanent link">¶</a></h2>
<hr class="operation-separator" />

<h3 id="get-apiv1config"><span class="http-get">GET</span> /api/v1/config<a class="headerlink" href="#get-apiv1config" title="Permanent link">¶</a></h3>
<p>Get server configuration </p>
<details class="note">
<summary>Description</summary>
<p>Returns the current server configuration (sanitized) </p>
</details>
<p><strong>Input parameters</strong> </p>
<table>
<thead>
<tr>
<th>Parameter</th>
<th>In</th>
<th>Type</th>
<th>Default</th>
<th>Nullable</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td class="parameter-name"><code>ApiKeyAuth</code></td>
<td>header</td>
<td>string</td>
<td>N/A</td>
<td>No</td>
<td></td>
</tr>
</tbody>
</table>

<p class="response-title">
<strong>Response <span class="response-code code-200">200</span> <span class="status-phrase">OK</span></strong>
</p>

<p class="response-title">
<strong>Response <span class="response-code code-500">500</span> <span class="status-phrase">Internal Server Error</span></strong>
</p>

<hr class="operation-separator" />

<h3 id="get-apiv1version"><span class="http-get">GET</span> /api/v1/version<a class="headerlink" href="#get-apiv1version" title="Permanent link">¶</a></h3>
<p>Get llamactl version </p>
<details class="note">
<summary>Description</summary>
<p>Returns the version of the llamactl command </p>
</details>
<p><strong>Input parameters</strong> </p>
<table>
<thead>
<tr>
<th>Parameter</th>
<th>In</th>
<th>Type</th>
<th>Default</th>
<th>Nullable</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td class="parameter-name"><code>ApiKeyAuth</code></td>
<td>header</td>
<td>string</td>
<td>N/A</td>
<td>No</td>
<td></td>
</tr>
</tbody>
</table>

<p class="response-title">
<strong>Response <span class="response-code code-200">200</span> <span class="status-phrase">OK</span></strong>
</p>

<p class="response-title">
<strong>Response <span class="response-code code-500">500</span> <span class="status-phrase">Internal Server Error</span></strong>
</p>
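Together with the parameter tables above, the two new System endpoints can be exercised directly; a minimal sketch, assuming the ApiKeyAuth header is supplied as a Bearer token the way the other curl examples in these docs do, with an illustrative key placeholder:

# Fetch the sanitized server configuration
curl -H "Authorization: Bearer <management-key>" http://localhost:8080/api/v1/config

# Fetch the llamactl version
curl -H "Authorization: Bearer <management-key>" http://localhost:8080/api/v1/version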
<h2 id="instances"><span class="api-tag">Instances</span><a class="headerlink" href="#instances" title="Permanent link">¶</a></h2>
|
||||
<hr class="operation-separator" />
|
||||
|
||||
@@ -1999,47 +2089,6 @@ config) </p>
|
||||
<strong>Response <span class="response-code code-500">500</span> <span class="status-phrase">Internal Server Error</span></strong>
|
||||
</p>
|
||||
|
||||
<h2 id="system"><span class="api-tag">System</span><a class="headerlink" href="#system" title="Permanent link">¶</a></h2>
|
||||
<hr class="operation-separator" />
|
||||
|
||||
<h3 id="get-apiv1version"><span class="http-get">GET</span> /api/v1/version<a class="headerlink" href="#get-apiv1version" title="Permanent link">¶</a></h3>
|
||||
<p>Get llamactl version </p>
|
||||
<details class="note">
|
||||
<summary>Description</summary>
|
||||
<p>Returns the version of the llamactl command </p>
|
||||
</details>
|
||||
<p><strong>Input parameters</strong> </p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Parameter</th>
|
||||
<th>In</th>
|
||||
<th>Type</th>
|
||||
<th>Default</th>
|
||||
<th>Nullable</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="parameter-name"><code>ApiKeyAuth</code></td>
|
||||
<td>header</td>
|
||||
<td>string</td>
|
||||
<td>N/A</td>
|
||||
<td>No</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p class="response-title">
|
||||
<strong>Response <span class="response-code code-200">200</span> <span class="status-phrase">OK</span></strong>
|
||||
</p>
|
||||
|
||||
<p class="response-title">
|
||||
<strong>Response <span class="response-code code-500">500</span> <span class="status-phrase">Internal Server Error</span></strong>
|
||||
</p>
|
||||
|
||||
<h2 id="llamacpp"><span class="api-tag">Llama.cpp</span><a class="headerlink" href="#llamacpp" title="Permanent link">¶</a></h2>
|
||||
<hr class="operation-separator" />
|
||||
|
||||
|
||||
276
dev/docs.go
276
dev/docs.go
@@ -256,6 +256,34 @@ const docTemplate = `{
                }
            }
        },
        "/api/v1/config": {
            "get": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Returns the current server configuration (sanitized)",
                "tags": [
                    "System"
                ],
                "summary": "Get server configuration",
                "responses": {
                    "200": {
                        "description": "Sanitized configuration",
                        "schema": {
                            "$ref": "#/definitions/config.AppConfig"
                        }
                    },
                    "500": {
                        "description": "Internal Server Error",
                        "schema": {
                            "type": "string"
                        }
                    }
                }
            }
        },
        "/api/v1/instances": {
            "get": {
                "security": [
@@ -1475,6 +1503,247 @@ const docTemplate = `{
        }
    },
    "definitions": {
        "config.AppConfig": {
            "type": "object",
            "properties": {
                "auth": {
                    "$ref": "#/definitions/config.AuthConfig"
                },
                "backends": {
                    "$ref": "#/definitions/config.BackendConfig"
                },
                "build_time": {
                    "type": "string"
                },
                "commit_hash": {
                    "type": "string"
                },
                "instances": {
                    "$ref": "#/definitions/config.InstancesConfig"
                },
                "local_node": {
                    "type": "string"
                },
                "nodes": {
                    "type": "object",
                    "additionalProperties": {
                        "$ref": "#/definitions/config.NodeConfig"
                    }
                },
                "server": {
                    "$ref": "#/definitions/config.ServerConfig"
                },
                "version": {
                    "type": "string"
                }
            }
        },
        "config.AuthConfig": {
            "type": "object",
            "properties": {
                "inference_keys": {
                    "description": "List of keys for OpenAI compatible inference endpoints",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "management_keys": {
                    "description": "List of keys for management endpoints",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "require_inference_auth": {
                    "description": "Require authentication for OpenAI compatible inference endpoints",
                    "type": "boolean"
                },
                "require_management_auth": {
                    "description": "Require authentication for management endpoints",
                    "type": "boolean"
                }
            }
        },
        "config.BackendConfig": {
            "type": "object",
            "properties": {
                "llama-cpp": {
                    "$ref": "#/definitions/config.BackendSettings"
                },
                "mlx": {
                    "$ref": "#/definitions/config.BackendSettings"
                },
                "vllm": {
                    "$ref": "#/definitions/config.BackendSettings"
                }
            }
        },
        "config.BackendSettings": {
            "type": "object",
            "properties": {
                "args": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "command": {
                    "type": "string"
                },
                "docker": {
                    "$ref": "#/definitions/config.DockerSettings"
                },
                "environment": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                },
                "response_headers": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                }
            }
        },
        "config.DockerSettings": {
            "type": "object",
            "properties": {
                "args": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "enabled": {
                    "type": "boolean"
                },
                "environment": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                },
                "image": {
                    "type": "string"
                }
            }
        },
        "config.InstancesConfig": {
            "type": "object",
            "properties": {
                "auto_create_dirs": {
                    "description": "Automatically create the data directory if it doesn't exist",
                    "type": "boolean"
                },
                "configs_dir": {
                    "description": "Instance config directory override",
                    "type": "string"
                },
                "data_dir": {
                    "description": "Directory where all llamactl data will be stored (instances.json, logs, etc.)",
                    "type": "string"
                },
                "default_auto_restart": {
                    "description": "Default auto-restart setting for new instances",
                    "type": "boolean"
                },
                "default_max_restarts": {
                    "description": "Default max restarts for new instances",
                    "type": "integer"
                },
                "default_on_demand_start": {
                    "description": "Default on-demand start setting for new instances",
                    "type": "boolean"
                },
                "default_restart_delay": {
                    "description": "Default restart delay for new instances (in seconds)",
                    "type": "integer"
                },
                "enable_lru_eviction": {
                    "description": "Enable LRU eviction for instance logs",
                    "type": "boolean"
                },
                "logs_dir": {
                    "description": "Logs directory override",
                    "type": "string"
                },
                "max_instances": {
                    "description": "Maximum number of instances that can be created",
                    "type": "integer"
                },
                "max_running_instances": {
                    "description": "Maximum number of instances that can be running at the same time",
                    "type": "integer"
                },
                "on_demand_start_timeout": {
                    "description": "How long to wait for an instance to start on demand (in seconds)",
                    "type": "integer"
                },
                "port_range": {
                    "description": "Port range for instances (e.g., 8000,9000)",
                    "type": "array",
                    "items": {
                        "type": "integer"
                    }
                },
                "timeout_check_interval": {
                    "description": "Interval for checking instance timeouts (in minutes)",
                    "type": "integer"
                }
            }
        },
        "config.NodeConfig": {
            "type": "object",
            "properties": {
                "address": {
                    "type": "string"
                },
                "api_key": {
                    "type": "string"
                }
            }
        },
        "config.ServerConfig": {
            "type": "object",
            "properties": {
                "allowed_headers": {
                    "description": "Allowed headers for CORS (e.g., \"Accept\", \"Authorization\", \"Content-Type\", \"X-CSRF-Token\")",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "allowed_origins": {
                    "description": "Allowed origins for CORS (e.g., \"http://localhost:3000\")",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "enable_swagger": {
                    "description": "Enable Swagger UI for API documentation",
                    "type": "boolean"
                },
                "host": {
                    "description": "Server host to bind to",
                    "type": "string"
                },
                "port": {
                    "description": "Server port to bind to",
                    "type": "integer"
                },
                "response_headers": {
                    "description": "Response headers to send with responses",
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                }
            }
        },
        "instance.Instance": {
            "type": "object",
            "properties": {
@@ -1494,6 +1763,13 @@ const docTemplate = `{
                    "description": "Auto restart",
                    "type": "boolean"
                },
                "command_override": {
                    "type": "string"
                },
                "docker_enabled": {
                    "description": "Execution context overrides",
                    "type": "boolean"
                },
                "environment": {
                    "description": "Environment variables",
                    "type": "object",
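The three generated artifacts in this commit (dev/docs.go above, dev/swagger.json and dev/swagger.yaml below) stay in sync because they come from a single generator run over the handler annotations; a minimal sketch of the regeneration and deployment steps, where the swag entry point and output directory are assumptions rather than values taken from this repository's build scripts:

# Regenerate docs.go, swagger.json and swagger.yaml from swag annotations
swag init -g main.go -o docs

# Publish the rebuilt site to the "dev" version, as this commit does
mike deploy dev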
@@ -789,34 +789,36 @@
<p><img alt="Create Instance Screenshot" src="../images/create_instance.png" /> </p>
<ol>
<li>Click the <strong>"Create Instance"</strong> button on the dashboard </li>
<li><em>Optional</em>: Click <strong>"Import"</strong> in the dialog header to load a previously exported configuration </li>
<li>Enter a unique <strong>Name</strong> for your instance (only required field) </li>
<li><strong>Select Target Node</strong>: Choose which node to deploy the instance to from the dropdown </li>
<li><strong>Choose Backend Type</strong>: <ul>
<li><strong>llama.cpp</strong>: For GGUF models using llama-server </li>
<li><strong>MLX</strong>: For MLX-optimized models (macOS only) </li>
<li><em>Optional</em>: Click <strong>"Import"</strong> to load a previously exported configuration </li>
</ol>
<p><strong>Instance Settings:</strong> </p>
<ol>
<li>Enter a unique <strong>Instance Name</strong> (required) </li>
<li><strong>Select Node</strong>: Choose which node to deploy the instance to </li>
<li>Configure <strong>Auto Restart</strong> settings: <ul>
<li>Enable automatic restart on failure </li>
<li>Set max restarts and delay between attempts </li>
</ul>
</li>
<li>Configure basic instance options: <ul>
<li><strong>Idle Timeout</strong>: Minutes before stopping idle instance </li>
<li><strong>On Demand Start</strong>: Start instance only when needed </li>
</ul>
</li>
</ol>
<p><strong>Backend Configuration:</strong> </p>
<ol>
<li><strong>Select Backend Type</strong>: <ul>
<li><strong>Llama Server</strong>: For GGUF models using llama-server </li>
<li><strong>MLX LM</strong>: For MLX-optimized models (macOS only) </li>
<li><strong>vLLM</strong>: For distributed serving and high-throughput inference </li>
</ul>
</li>
<li>Configure model source: <ul>
<li><strong>For llama.cpp</strong>: GGUF model path or HuggingFace repo </li>
<li><strong>For MLX</strong>: MLX model path or identifier (e.g., <code>mlx-community/Mistral-7B-Instruct-v0.3-4bit</code>) </li>
<li><strong>For vLLM</strong>: HuggingFace model identifier (e.g., <code>microsoft/DialoGPT-medium</code>) </li>
</ul>
</li>
<li>Configure optional instance management settings: <ul>
<li><strong>Auto Restart</strong>: Automatically restart instance on failure </li>
<li><strong>Max Restarts</strong>: Maximum number of restart attempts </li>
<li><strong>Restart Delay</strong>: Delay in seconds between restart attempts </li>
<li><strong>On Demand Start</strong>: Start instance when receiving a request to the OpenAI compatible endpoint </li>
<li><strong>Idle Timeout</strong>: Minutes before stopping idle instance (set to 0 to disable) </li>
<li><strong>Environment Variables</strong>: Set custom environment variables for the instance process </li>
</ul>
</li>
<li>Configure backend-specific options: <ul>
<li><strong>llama.cpp</strong>: Threads, context size, GPU layers, port, etc. </li>
<li><strong>MLX</strong>: Temperature, top-p, adapter path, Python environment, etc. </li>
<li><strong>vLLM</strong>: Tensor parallel size, GPU memory utilization, quantization, etc. </li>
<li><em>Optional</em>: Click <strong>"Parse Command"</strong> to import settings from an existing backend command </li>
<li>Configure <strong>Execution Context</strong>: <ul>
<li><strong>Enable Docker</strong>: Run backend in Docker container </li>
<li><strong>Command Override</strong>: Custom path to backend executable </li>
<li><strong>Environment Variables</strong>: Custom environment variables </li>
</ul>
</li>
</ol>
@@ -825,6 +827,14 @@
<p>Llamactl automatically assigns ports from the configured port range (default: 8000-9000) and generates API keys if authentication is enabled. You typically don't need to manually specify these values. </p>
</div>
<ol>
<li>Configure <strong>Basic Backend Options</strong> (varies by backend): <ul>
<li><strong>llama.cpp</strong>: Model path, threads, context size, GPU layers, etc. </li>
<li><strong>MLX</strong>: Model identifier, temperature, max tokens, etc. </li>
<li><strong>vLLM</strong>: Model identifier, tensor parallel size, GPU memory utilization, etc. </li>
</ul>
</li>
<li><em>Optional</em>: Expand <strong>Advanced Backend Options</strong> for additional settings </li>
<li><em>Optional</em>: Add <strong>Extra Args</strong> as key-value pairs for custom command-line arguments </li>
<li>Click <strong>"Create"</strong> to save the instance </li>
</ol>
<p><strong>Via API</strong> </p>
@@ -838,88 +848,47 @@
    "model": "/path/to/model.gguf",
    "threads": 8,
    "ctx_size": 4096,
    "gpu_layers": 32
  },
  "nodes": ["main"]
}'

# Create MLX instance (macOS only)
curl -X POST http://localhost:8080/api/v1/instances/my-mlx-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "mlx_lm",
    "backend_options": {
      "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
      "temp": 0.7,
      "top_p": 0.9,
      "max_tokens": 2048
    },
    "auto_restart": true,
    "max_restarts": 3,
    "nodes": ["main"]
  }'

# Create vLLM instance
curl -X POST http://localhost:8080/api/v1/instances/my-vllm-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
      "model": "microsoft/DialoGPT-medium",
      "tensor_parallel_size": 2,
      "gpu_memory_utilization": 0.9
    },
    "auto_restart": true,
    "on_demand_start": true,
    "environment": {
      "CUDA_VISIBLE_DEVICES": "0,1",
      "NCCL_DEBUG": "INFO",
      "PYTHONPATH": "/custom/path"
    },
    "nodes": ["main"]
  }'

# Create llama.cpp instance with HuggingFace model
curl -X POST http://localhost:8080/api/v1/instances/gemma-3-27b \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "hf_repo": "unsloth/gemma-3-27b-it-GGUF",
      "hf_file": "gemma-3-27b-it-GGUF.gguf",
      "gpu_layers": 32
    },
    "nodes": ["main"]
  }'

# Create instance on specific remote node
curl -X POST http://localhost:8080/api/v1/instances/remote-llama \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/models/llama-7b.gguf",
      "gpu_layers": 32
    },
    "nodes": ["worker1"]
  }'

# Create instance on multiple nodes for high availability
curl -X POST http://localhost:8080/api/v1/instances/multi-node-llama \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "llama_cpp",
    "backend_options": {
      "model": "/models/llama-7b.gguf",
      "gpu_layers": 32
    },
    "nodes": ["worker1", "worker2", "worker3"]
  }'
    "gpu_layers": 32,
    "flash_attn": "on"
  },
  "auto_restart": true,
  "max_restarts": 3,
  "docker_enabled": false,
  "command_override": "/opt/llama-server-dev",
  "nodes": ["main"]
}'

# Create vLLM instance with environment variables
curl -X POST http://localhost:8080/api/v1/instances/my-vllm-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "vllm",
    "backend_options": {
      "model": "microsoft/DialoGPT-medium",
      "tensor_parallel_size": 2,
      "gpu_memory_utilization": 0.9
    },
    "on_demand_start": true,
    "environment": {
      "CUDA_VISIBLE_DEVICES": "0,1"
    },
    "nodes": ["worker1", "worker2"]
  }'

# Create MLX instance (macOS only)
curl -X POST http://localhost:8080/api/v1/instances/my-mlx-instance \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <token>" \
  -d '{
    "backend_type": "mlx_lm",
    "backend_options": {
      "model": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
      "temp": 0.7,
      "max_tokens": 2048
    },
    "nodes": ["main"]
  }'

<h2 id="start-instance">Start Instance<a class="headerlink" href="#start-instance" title="Permanent link">¶</a></h2>
<p><strong>Via Web UI</strong><br />
@@ -1026,7 +995,7 @@ Check instance status in real-time: </p>
<span class="md-icon" title="Last update">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1zM12.5 7v5.2l4 2.4-1 1L11 13V7zM11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2z"/></svg>
</span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="October 27, 2025 19:44:28 UTC">October 27, 2025</span>
<span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="November 14, 2025 23:18:55 UTC">November 14, 2025</span>
</span>
File diff suppressed because one or more lines are too long
@@ -2,30 +2,30 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>https://llamactl.org/dev/</loc>
        <lastmod>2025-11-13</lastmod>
        <lastmod>2025-11-15</lastmod>
    </url>
    <url>
        <loc>https://llamactl.org/dev/api-reference/</loc>
        <lastmod>2025-11-13</lastmod>
        <lastmod>2025-11-15</lastmod>
    </url>
    <url>
        <loc>https://llamactl.org/dev/configuration/</loc>
        <lastmod>2025-11-13</lastmod>
        <lastmod>2025-11-15</lastmod>
    </url>
    <url>
        <loc>https://llamactl.org/dev/installation/</loc>
        <lastmod>2025-11-13</lastmod>
        <lastmod>2025-11-15</lastmod>
    </url>
    <url>
        <loc>https://llamactl.org/dev/managing-instances/</loc>
        <lastmod>2025-11-13</lastmod>
        <lastmod>2025-11-15</lastmod>
    </url>
    <url>
        <loc>https://llamactl.org/dev/quick-start/</loc>
        <lastmod>2025-11-13</lastmod>
        <lastmod>2025-11-15</lastmod>
    </url>
    <url>
        <loc>https://llamactl.org/dev/troubleshooting/</loc>
        <lastmod>2025-11-13</lastmod>
        <lastmod>2025-11-15</lastmod>
    </url>
</urlset>
Binary file not shown.
276 dev/swagger.json
@@ -249,6 +249,34 @@
                }
            }
        },
        "/api/v1/config": {
            "get": {
                "security": [
                    {
                        "ApiKeyAuth": []
                    }
                ],
                "description": "Returns the current server configuration (sanitized)",
                "tags": [
                    "System"
                ],
                "summary": "Get server configuration",
                "responses": {
                    "200": {
                        "description": "Sanitized configuration",
                        "schema": {
                            "$ref": "#/definitions/config.AppConfig"
                        }
                    },
                    "500": {
                        "description": "Internal Server Error",
                        "schema": {
                            "type": "string"
                        }
                    }
                }
            }
        },
        "/api/v1/instances": {
            "get": {
                "security": [
@@ -1468,6 +1496,247 @@
        }
    },
    "definitions": {
        "config.AppConfig": {
            "type": "object",
            "properties": {
                "auth": {
                    "$ref": "#/definitions/config.AuthConfig"
                },
                "backends": {
                    "$ref": "#/definitions/config.BackendConfig"
                },
                "build_time": {
                    "type": "string"
                },
                "commit_hash": {
                    "type": "string"
                },
                "instances": {
                    "$ref": "#/definitions/config.InstancesConfig"
                },
                "local_node": {
                    "type": "string"
                },
                "nodes": {
                    "type": "object",
                    "additionalProperties": {
                        "$ref": "#/definitions/config.NodeConfig"
                    }
                },
                "server": {
                    "$ref": "#/definitions/config.ServerConfig"
                },
                "version": {
                    "type": "string"
                }
            }
        },
        "config.AuthConfig": {
            "type": "object",
            "properties": {
                "inference_keys": {
                    "description": "List of keys for OpenAI compatible inference endpoints",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "management_keys": {
                    "description": "List of keys for management endpoints",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "require_inference_auth": {
                    "description": "Require authentication for OpenAI compatible inference endpoints",
                    "type": "boolean"
                },
                "require_management_auth": {
                    "description": "Require authentication for management endpoints",
                    "type": "boolean"
                }
            }
        },
        "config.BackendConfig": {
            "type": "object",
            "properties": {
                "llama-cpp": {
                    "$ref": "#/definitions/config.BackendSettings"
                },
                "mlx": {
                    "$ref": "#/definitions/config.BackendSettings"
                },
                "vllm": {
                    "$ref": "#/definitions/config.BackendSettings"
                }
            }
        },
        "config.BackendSettings": {
            "type": "object",
            "properties": {
                "args": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "command": {
                    "type": "string"
                },
                "docker": {
                    "$ref": "#/definitions/config.DockerSettings"
                },
                "environment": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                },
                "response_headers": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                }
            }
        },
        "config.DockerSettings": {
            "type": "object",
            "properties": {
                "args": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "enabled": {
                    "type": "boolean"
                },
                "environment": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                },
                "image": {
                    "type": "string"
                }
            }
        },
        "config.InstancesConfig": {
            "type": "object",
            "properties": {
                "auto_create_dirs": {
                    "description": "Automatically create the data directory if it doesn't exist",
                    "type": "boolean"
                },
                "configs_dir": {
                    "description": "Instance config directory override",
                    "type": "string"
                },
                "data_dir": {
                    "description": "Directory where all llamactl data will be stored (instances.json, logs, etc.)",
                    "type": "string"
                },
                "default_auto_restart": {
                    "description": "Default auto-restart setting for new instances",
                    "type": "boolean"
                },
                "default_max_restarts": {
                    "description": "Default max restarts for new instances",
                    "type": "integer"
                },
                "default_on_demand_start": {
                    "description": "Default on-demand start setting for new instances",
                    "type": "boolean"
                },
                "default_restart_delay": {
                    "description": "Default restart delay for new instances (in seconds)",
                    "type": "integer"
                },
                "enable_lru_eviction": {
                    "description": "Enable LRU eviction for instance logs",
                    "type": "boolean"
                },
                "logs_dir": {
                    "description": "Logs directory override",
                    "type": "string"
                },
                "max_instances": {
                    "description": "Maximum number of instances that can be created",
                    "type": "integer"
                },
                "max_running_instances": {
                    "description": "Maximum number of instances that can be running at the same time",
                    "type": "integer"
                },
                "on_demand_start_timeout": {
                    "description": "How long to wait for an instance to start on demand (in seconds)",
                    "type": "integer"
                },
                "port_range": {
                    "description": "Port range for instances (e.g., 8000,9000)",
                    "type": "array",
                    "items": {
                        "type": "integer"
                    }
                },
                "timeout_check_interval": {
                    "description": "Interval for checking instance timeouts (in minutes)",
                    "type": "integer"
                }
            }
        },
        "config.NodeConfig": {
            "type": "object",
            "properties": {
                "address": {
                    "type": "string"
                },
                "api_key": {
                    "type": "string"
                }
            }
        },
        "config.ServerConfig": {
            "type": "object",
            "properties": {
                "allowed_headers": {
                    "description": "Allowed headers for CORS (e.g., \"Accept\", \"Authorization\", \"Content-Type\", \"X-CSRF-Token\")",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "allowed_origins": {
                    "description": "Allowed origins for CORS (e.g., \"http://localhost:3000\")",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "enable_swagger": {
                    "description": "Enable Swagger UI for API documentation",
                    "type": "boolean"
                },
                "host": {
                    "description": "Server host to bind to",
                    "type": "string"
                },
                "port": {
                    "description": "Server port to bind to",
                    "type": "integer"
                },
                "response_headers": {
                    "description": "Response headers to send with responses",
                    "type": "object",
                    "additionalProperties": {
                        "type": "string"
                    }
                }
            }
        },
        "instance.Instance": {
            "type": "object",
            "properties": {
@@ -1487,6 +1756,13 @@
                    "description": "Auto restart",
                    "type": "boolean"
                },
                "command_override": {
                    "type": "string"
                },
                "docker_enabled": {
                    "description": "Execution context overrides",
                    "type": "boolean"
                },
                "environment": {
                    "description": "Environment variables",
                    "type": "object",
190 dev/swagger.yaml
@@ -1,5 +1,173 @@
basePath: /api/v1
definitions:
  config.AppConfig:
    properties:
      auth:
        $ref: '#/definitions/config.AuthConfig'
      backends:
        $ref: '#/definitions/config.BackendConfig'
      build_time:
        type: string
      commit_hash:
        type: string
      instances:
        $ref: '#/definitions/config.InstancesConfig'
      local_node:
        type: string
      nodes:
        additionalProperties:
          $ref: '#/definitions/config.NodeConfig'
        type: object
      server:
        $ref: '#/definitions/config.ServerConfig'
      version:
        type: string
    type: object
  config.AuthConfig:
    properties:
      inference_keys:
        description: List of keys for OpenAI compatible inference endpoints
        items:
          type: string
        type: array
      management_keys:
        description: List of keys for management endpoints
        items:
          type: string
        type: array
      require_inference_auth:
        description: Require authentication for OpenAI compatible inference endpoints
        type: boolean
      require_management_auth:
        description: Require authentication for management endpoints
        type: boolean
    type: object
  config.BackendConfig:
    properties:
      llama-cpp:
        $ref: '#/definitions/config.BackendSettings'
      mlx:
        $ref: '#/definitions/config.BackendSettings'
      vllm:
        $ref: '#/definitions/config.BackendSettings'
    type: object
  config.BackendSettings:
    properties:
      args:
        items:
          type: string
        type: array
      command:
        type: string
      docker:
        $ref: '#/definitions/config.DockerSettings'
      environment:
        additionalProperties:
          type: string
        type: object
      response_headers:
        additionalProperties:
          type: string
        type: object
    type: object
  config.DockerSettings:
    properties:
      args:
        items:
          type: string
        type: array
      enabled:
        type: boolean
      environment:
        additionalProperties:
          type: string
        type: object
      image:
        type: string
    type: object
  config.InstancesConfig:
    properties:
      auto_create_dirs:
        description: Automatically create the data directory if it doesn't exist
        type: boolean
      configs_dir:
        description: Instance config directory override
        type: string
      data_dir:
        description: Directory where all llamactl data will be stored (instances.json,
          logs, etc.)
        type: string
      default_auto_restart:
        description: Default auto-restart setting for new instances
        type: boolean
      default_max_restarts:
        description: Default max restarts for new instances
        type: integer
      default_on_demand_start:
        description: Default on-demand start setting for new instances
        type: boolean
      default_restart_delay:
        description: Default restart delay for new instances (in seconds)
        type: integer
      enable_lru_eviction:
        description: Enable LRU eviction for instance logs
        type: boolean
      logs_dir:
        description: Logs directory override
        type: string
      max_instances:
        description: Maximum number of instances that can be created
        type: integer
      max_running_instances:
        description: Maximum number of instances that can be running at the same time
        type: integer
      on_demand_start_timeout:
        description: How long to wait for an instance to start on demand (in seconds)
        type: integer
      port_range:
        description: Port range for instances (e.g., 8000,9000)
        items:
          type: integer
        type: array
      timeout_check_interval:
        description: Interval for checking instance timeouts (in minutes)
        type: integer
    type: object
  config.NodeConfig:
    properties:
      address:
        type: string
      api_key:
        type: string
    type: object
  config.ServerConfig:
    properties:
      allowed_headers:
        description: Allowed headers for CORS (e.g., "Accept", "Authorization", "Content-Type",
          "X-CSRF-Token")
        items:
          type: string
        type: array
      allowed_origins:
        description: Allowed origins for CORS (e.g., "http://localhost:3000")
        items:
          type: string
        type: array
      enable_swagger:
        description: Enable Swagger UI for API documentation
        type: boolean
      host:
        description: Server host to bind to
        type: string
      port:
        description: Server port to bind to
        type: integer
      response_headers:
        additionalProperties:
          type: string
        description: Response headers to send with responses
        type: object
    type: object
  instance.Instance:
    properties:
      created:
@@ -13,6 +181,11 @@ definitions:
      auto_restart:
        description: Auto restart
        type: boolean
      command_override:
        type: string
      docker_enabled:
        description: Execution context overrides
        type: boolean
      environment:
        additionalProperties:
          type: string
@@ -216,6 +389,23 @@ paths:
      summary: Parse vllm serve command
      tags:
      - Backends
  /api/v1/config:
    get:
      description: Returns the current server configuration (sanitized)
      responses:
        "200":
          description: Sanitized configuration
          schema:
            $ref: '#/definitions/config.AppConfig'
        "500":
          description: Internal Server Error
          schema:
            type: string
      security:
      - ApiKeyAuth: []
      summary: Get server configuration
      tags:
      - System
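Given the config.AppConfig schema above, a 200 response from the new endpoint would take roughly this shape; every value here is illustrative, and exactly which fields the sanitization redacts is an assumption:

curl -H "Authorization: Bearer <management-key>" http://localhost:8080/api/v1/config
{
  "version": "dev",
  "commit_hash": "514b1b0",
  "build_time": "2025-11-15T00:00:00Z",
  "local_node": "main",
  "server": { "host": "0.0.0.0", "port": 8080, "enable_swagger": true },
  "auth": { "require_management_auth": true, "require_inference_auth": false },
  "instances": { "port_range": [8000, 9000], "max_instances": 10 },
  "nodes": { "worker1": { "address": "http://worker1:8080" } }
}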
  /api/v1/instances:
    get:
      description: Returns a list of all instances managed by the server