Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-06 00:54:23 +00:00)

Merge pull request #70 from lordmathis/refactor/manager

refactor: Split instance manager into single focus structs
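The refactor replaces the single mutex-guarded maps inside instanceManager with a set of small components, each owning one concern and its own synchronization. As a rough orientation only (condensed from the new struct definition shown later in this diff, with explanatory comments added; not a complete listing):

    // Condensed from the new instanceManager definition in this diff.
    type instanceManager struct {
        // Components (each with own synchronization)
        registry    *instanceRegistry  // which instances exist and which are running
        ports       *portAllocator     // port reservation for local instances
        persistence *instancePersister // JSON persistence of instances on disk
        remote      *remoteManager     // operations proxied to other nodes
        lifecycle   *lifecycleManager  // idle-timeout checks and LRU eviction

        // Configuration
        instancesConfig config.InstancesConfig
        backendsConfig  config.BackendConfig
        localNodeName   string

        // Synchronization
        instanceLocks sync.Map // per-instance locks for concurrent operations
        shutdownOnce  sync.Once
    }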
@@ -58,7 +58,7 @@ func main() {
 	}

 	// Initialize the instance manager
-	instanceManager := manager.NewInstanceManager(cfg.Backends, cfg.Instances, cfg.Nodes, cfg.LocalNode)
+	instanceManager := manager.New(cfg.Backends, cfg.Instances, cfg.Nodes, cfg.LocalNode)

 	// Create a new handler with the instance manager
 	handler := server.NewHandler(instanceManager, cfg)
pkg/manager/lifecycle.go (new file, 152 lines)
@@ -0,0 +1,152 @@
+package manager
+
+import (
+	"fmt"
+	"llamactl/pkg/instance"
+	"log"
+	"sync"
+	"time"
+)
+
+// lifecycleManager handles background timeout checking and LRU eviction.
+// It properly coordinates shutdown to prevent races with the timeout checker.
+type lifecycleManager struct {
+	registry *instanceRegistry
+	manager  InstanceManager // For calling Stop/Evict operations
+
+	ticker        *time.Ticker
+	checkInterval time.Duration
+	enableLRU     bool
+
+	shutdownChan chan struct{}
+	shutdownDone chan struct{}
+	shutdownOnce sync.Once
+}
+
+// newLifecycleManager creates a new lifecycle manager.
+func newLifecycleManager(
+	registry *instanceRegistry,
+	manager InstanceManager,
+	checkInterval time.Duration,
+	enableLRU bool,
+) *lifecycleManager {
+	if checkInterval <= 0 {
+		checkInterval = 5 * time.Minute // Default to 5 minutes
+	}
+
+	return &lifecycleManager{
+		registry:      registry,
+		manager:       manager,
+		ticker:        time.NewTicker(checkInterval),
+		checkInterval: checkInterval,
+		enableLRU:     enableLRU,
+		shutdownChan:  make(chan struct{}),
+		shutdownDone:  make(chan struct{}),
+	}
+}
+
+// Start begins the timeout checking loop in a goroutine.
+func (l *lifecycleManager) start() {
+	go l.timeoutCheckLoop()
+}
+
+// Stop gracefully stops the lifecycle manager.
+// This ensures the timeout checker completes before instance cleanup begins.
+func (l *lifecycleManager) stop() {
+	l.shutdownOnce.Do(func() {
+		close(l.shutdownChan)
+		<-l.shutdownDone // Wait for checker to finish (prevents shutdown race)
+		l.ticker.Stop()
+	})
+}
+
+// timeoutCheckLoop is the main loop that periodically checks for timeouts.
+func (l *lifecycleManager) timeoutCheckLoop() {
+	defer close(l.shutdownDone) // Signal completion
+
+	for {
+		select {
+		case <-l.ticker.C:
+			l.checkTimeouts()
+		case <-l.shutdownChan:
+			return // Exit goroutine on shutdown
+		}
+	}
+}
+
+// checkTimeouts checks all instances for timeout and stops those that have timed out.
+func (l *lifecycleManager) checkTimeouts() {
+	// Get all instances from registry
+	instances := l.registry.list()
+
+	var timeoutInstances []string
+
+	// Identify instances that should timeout
+	for _, inst := range instances {
+		// Skip remote instances - they are managed by their respective nodes
+		if inst.IsRemote() {
+			continue
+		}
+
+		// Only check running instances
+		if !l.registry.isRunning(inst.Name) {
+			continue
+		}
+
+		if inst.ShouldTimeout() {
+			timeoutInstances = append(timeoutInstances, inst.Name)
+		}
+	}
+
+	// Stop the timed-out instances
+	for _, name := range timeoutInstances {
+		log.Printf("Instance %s has timed out, stopping it", name)
+		if _, err := l.manager.StopInstance(name); err != nil {
+			log.Printf("Error stopping instance %s: %v", name, err)
+		} else {
+			log.Printf("Instance %s stopped successfully", name)
+		}
+	}
+}
+
+// EvictLRU finds and stops the least recently used running instance.
+// This is called when max running instances limit is reached.
+func (l *lifecycleManager) evictLRU() error {
+	if !l.enableLRU {
+		return fmt.Errorf("LRU eviction is not enabled")
+	}
+
+	// Get all running instances
+	runningInstances := l.registry.listRunning()
+
+	var lruInstance *instance.Instance
+
+	for _, inst := range runningInstances {
+		// Skip remote instances - they are managed by their respective nodes
+		if inst.IsRemote() {
+			continue
+		}
+
+		// Skip instances without idle timeout
+		if inst.GetOptions() != nil && inst.GetOptions().IdleTimeout != nil && *inst.GetOptions().IdleTimeout <= 0 {
+			continue
+		}
+
+		if lruInstance == nil {
+			lruInstance = inst
+		}
+
+		if inst.LastRequestTime() < lruInstance.LastRequestTime() {
+			lruInstance = inst
+		}
+	}
+
+	if lruInstance == nil {
+		return fmt.Errorf("failed to find lru instance")
+	}
+
+	// Evict the LRU instance
+	log.Printf("Evicting LRU instance %s", lruInstance.Name)
+	_, err := l.manager.StopInstance(lruInstance.Name)
+	return err
+}
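The manager constructs and drives this component during startup and shutdown. Condensed from the New() and Shutdown() changes later in this diff (not a complete listing), the wiring looks roughly like:

    // Condensed from New() and Shutdown() in this diff.
    checkInterval := time.Duration(instancesConfig.TimeoutCheckInterval) * time.Minute
    im.lifecycle = newLifecycleManager(registry, im, checkInterval, true)

    // After instances have been loaded from disk:
    im.lifecycle.start() // spawns the periodic timeout-check goroutine

    // During Shutdown(), before stopping instances:
    im.lifecycle.stop() // blocks until the checker goroutine has exited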
pkg/manager/lifecycle_test.go (new file, 220 lines)
@@ -0,0 +1,220 @@
+package manager_test
+
+import (
+	"llamactl/pkg/backends"
+	"llamactl/pkg/instance"
+	"llamactl/pkg/manager"
+	"sync"
+	"testing"
+	"time"
+)
+
+func TestInstanceTimeoutLogic(t *testing.T) {
+	testManager := createTestManager()
+	defer testManager.Shutdown()
+
+	idleTimeout := 1 // 1 minute
+	inst := createInstanceWithTimeout(t, testManager, "timeout-test", "/path/to/model.gguf", &idleTimeout)
+
+	// Test timeout logic with mock time provider
+	mockTime := NewMockTimeProvider(time.Now())
+	inst.SetTimeProvider(mockTime)
+
+	// Set instance to running state so timeout logic can work
+	inst.SetStatus(instance.Running)
+	defer inst.SetStatus(instance.Stopped)
+
+	// Update last request time
+	inst.UpdateLastRequestTime()
+
+	// Initially should not timeout (just updated)
+	if inst.ShouldTimeout() {
+		t.Error("Instance should not timeout immediately after request")
+	}
+
+	// Advance time to trigger timeout
+	mockTime.SetTime(time.Now().Add(2 * time.Minute))
+
+	// Now it should timeout
+	if !inst.ShouldTimeout() {
+		t.Error("Instance should timeout after idle period")
+	}
+}
+
+func TestInstanceWithoutTimeoutNeverExpires(t *testing.T) {
+	testManager := createTestManager()
+	defer testManager.Shutdown()
+
+	noTimeoutInst := createInstanceWithTimeout(t, testManager, "no-timeout-test", "/path/to/model.gguf", nil)
+
+	mockTime := NewMockTimeProvider(time.Now())
+	noTimeoutInst.SetTimeProvider(mockTime)
+	noTimeoutInst.SetStatus(instance.Running)
+	defer noTimeoutInst.SetStatus(instance.Stopped)
+
+	noTimeoutInst.UpdateLastRequestTime()
+
+	// Advance time significantly
+	mockTime.SetTime(mockTime.Now().Add(24 * time.Hour))
+
+	// Even with time advanced, should not timeout
+	if noTimeoutInst.ShouldTimeout() {
+		t.Error("Instance without timeout configuration should never timeout")
+	}
+}
+
+func TestEvictLRUInstance_Success(t *testing.T) {
+	manager := createTestManager()
+	defer manager.Shutdown()
+
+	// Create 3 instances with idle timeout enabled (value doesn't matter for LRU logic)
+	validTimeout := 1
+	inst1 := createInstanceWithTimeout(t, manager, "instance-1", "/path/to/model1.gguf", &validTimeout)
+	inst2 := createInstanceWithTimeout(t, manager, "instance-2", "/path/to/model2.gguf", &validTimeout)
+	inst3 := createInstanceWithTimeout(t, manager, "instance-3", "/path/to/model3.gguf", &validTimeout)
+
+	// Set up mock time and set instances to running
+	mockTime := NewMockTimeProvider(time.Now())
+	inst1.SetTimeProvider(mockTime)
+	inst2.SetTimeProvider(mockTime)
+	inst3.SetTimeProvider(mockTime)
+
+	inst1.SetStatus(instance.Running)
+	inst2.SetStatus(instance.Running)
+	inst3.SetStatus(instance.Running)
+	defer func() {
+		// Clean up - ensure all instances are stopped
+		for _, inst := range []*instance.Instance{inst1, inst2, inst3} {
+			if inst.IsRunning() {
+				inst.SetStatus(instance.Stopped)
+			}
+		}
+	}()
+
+	// Set different last request times (oldest to newest)
+	// inst1: oldest (will be evicted)
+	inst1.UpdateLastRequestTime()
+
+	mockTime.SetTime(mockTime.Now().Add(1 * time.Minute))
+	inst2.UpdateLastRequestTime()
+
+	mockTime.SetTime(mockTime.Now().Add(1 * time.Minute))
+	inst3.UpdateLastRequestTime()
+
+	// Evict LRU instance (should be inst1)
+	if err := manager.EvictLRUInstance(); err != nil {
+		t.Fatalf("EvictLRUInstance failed: %v", err)
+	}
+
+	// Verify inst1 is stopped
+	if inst1.IsRunning() {
+		t.Error("Expected instance-1 to be stopped after eviction")
+	}
+
+	// Verify inst2 and inst3 are still running
+	if !inst2.IsRunning() {
+		t.Error("Expected instance-2 to still be running")
+	}
+	if !inst3.IsRunning() {
+		t.Error("Expected instance-3 to still be running")
+	}
+}
+
+func TestEvictLRUInstance_NoRunningInstances(t *testing.T) {
+	manager := createTestManager()
+	defer manager.Shutdown()
+
+	err := manager.EvictLRUInstance()
+	if err == nil {
+		t.Error("Expected error when no running instances exist")
+	}
+	if err.Error() != "failed to find lru instance" {
+		t.Errorf("Expected 'failed to find lru instance' error, got: %v", err)
+	}
+}
+
+func TestEvictLRUInstance_OnlyEvictsTimeoutEnabledInstances(t *testing.T) {
+	manager := createTestManager()
+	defer manager.Shutdown()
+
+	// Create mix of instances: some with timeout enabled, some disabled
+	// Only timeout-enabled instances should be eligible for eviction
+	validTimeout := 1
+	zeroTimeout := 0
+	instWithTimeout := createInstanceWithTimeout(t, manager, "with-timeout", "/path/to/model-with-timeout.gguf", &validTimeout)
+	instNoTimeout1 := createInstanceWithTimeout(t, manager, "no-timeout-1", "/path/to/model-no-timeout1.gguf", &zeroTimeout)
+	instNoTimeout2 := createInstanceWithTimeout(t, manager, "no-timeout-2", "/path/to/model-no-timeout2.gguf", nil)
+
+	// Set all instances to running
+	instances := []*instance.Instance{instWithTimeout, instNoTimeout1, instNoTimeout2}
+	for _, inst := range instances {
+		inst.SetStatus(instance.Running)
+		inst.UpdateLastRequestTime()
+	}
+	defer func() {
+		// Reset instances to stopped to avoid shutdown panics
+		for _, inst := range instances {
+			if inst.IsRunning() {
+				inst.SetStatus(instance.Stopped)
+			}
+		}
+	}()
+
+	// Evict LRU instance - should only consider the one with timeout
+	err := manager.EvictLRUInstance()
+	if err != nil {
+		t.Fatalf("EvictLRUInstance failed: %v", err)
+	}
+
+	// Verify only the instance with timeout was evicted
+	if instWithTimeout.IsRunning() {
+		t.Error("Expected with-timeout instance to be stopped after eviction")
+	}
+	if !instNoTimeout1.IsRunning() {
+		t.Error("Expected no-timeout-1 instance to still be running")
+	}
+	if !instNoTimeout2.IsRunning() {
+		t.Error("Expected no-timeout-2 instance to still be running")
+	}
+}
+
+// Helper function to create instances with different timeout configurations
+func createInstanceWithTimeout(t *testing.T, manager manager.InstanceManager, name, model string, timeout *int) *instance.Instance {
+	t.Helper()
+	options := &instance.Options{
+		IdleTimeout: timeout,
+		BackendOptions: backends.Options{
+			BackendType: backends.BackendTypeLlamaCpp,
+			LlamaServerOptions: &backends.LlamaServerOptions{
+				Model: model,
+			},
+		},
+	}
+	inst, err := manager.CreateInstance(name, options)
+	if err != nil {
+		t.Fatalf("CreateInstance failed: %v", err)
+	}
+	return inst
+}
+
+// Helper for timeout tests
+type MockTimeProvider struct {
+	currentTime time.Time
+	mu          sync.RWMutex
+}
+
+func NewMockTimeProvider(t time.Time) *MockTimeProvider {
+	return &MockTimeProvider{currentTime: t}
+}
+
+func (m *MockTimeProvider) Now() time.Time {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+	return m.currentTime
+}
+
+func (m *MockTimeProvider) SetTime(t time.Time) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.currentTime = t
+}
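These tests drive the timeout logic by injecting MockTimeProvider through inst.SetTimeProvider, so ShouldTimeout reads the mock clock instead of the wall clock. The interface the instance package expects is not part of this diff; judging from the methods the mock implements, it is presumably along these lines:

    // Hypothetical shape of the time-provider abstraction expected by
    // inst.SetTimeProvider (assumption; the real definition lives in pkg/instance).
    type TimeProvider interface {
        Now() time.Time
    }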
@@ -1,15 +1,11 @@
 package manager

 import (
-	"encoding/json"
+	"context"
 	"fmt"
 	"llamactl/pkg/config"
 	"llamactl/pkg/instance"
 	"log"
-	"net/http"
-	"os"
-	"path/filepath"
-	"strings"
 	"sync"
 	"time"
 )
@@ -30,256 +26,146 @@ type InstanceManager interface {
 	Shutdown()
 }

-type RemoteManager interface {
-	ListRemoteInstances(node *config.NodeConfig) ([]*instance.Instance, error)
-	CreateRemoteInstance(node *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error)
-	GetRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error)
-	UpdateRemoteInstance(node *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error)
-	DeleteRemoteInstance(node *config.NodeConfig, name string) error
-	StartRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error)
-	StopRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error)
-	RestartRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error)
-	GetRemoteInstanceLogs(node *config.NodeConfig, name string, numLines int) (string, error)
-}
-
 type instanceManager struct {
-	mu               sync.RWMutex
-	instances        map[string]*instance.Instance
-	runningInstances map[string]struct{}
-	ports            map[int]bool
+	// Components (each with own synchronization)
+	registry    *instanceRegistry
+	ports       *portAllocator
+	persistence *instancePersister
+	remote      *remoteManager
+	lifecycle   *lifecycleManager

+	// Configuration
 	instancesConfig config.InstancesConfig
 	backendsConfig  config.BackendConfig
 	localNodeName   string // Name of the local node

-	// Timeout checker
-	timeoutChecker *time.Ticker
-	shutdownChan   chan struct{}
-	shutdownDone   chan struct{}
-	isShutdown     bool
-
-	// Remote instance management
-	httpClient      *http.Client
-	instanceNodeMap map[string]*config.NodeConfig // Maps instance name to its node config
-	nodeConfigMap   map[string]*config.NodeConfig // Maps node name to node config for quick lookup
+	// Synchronization
+	instanceLocks sync.Map // map[string]*sync.Mutex - per-instance locks for concurrent operations
+	shutdownOnce  sync.Once
 }

-// NewInstanceManager creates a new instance of InstanceManager.
-func NewInstanceManager(backendsConfig config.BackendConfig, instancesConfig config.InstancesConfig, nodesConfig map[string]config.NodeConfig, localNodeName string) InstanceManager {
+// New creates a new instance of InstanceManager.
+func New(backendsConfig config.BackendConfig, instancesConfig config.InstancesConfig, nodesConfig map[string]config.NodeConfig, localNodeName string) InstanceManager {
 	if instancesConfig.TimeoutCheckInterval <= 0 {
 		instancesConfig.TimeoutCheckInterval = 5 // Default to 5 minutes if not set
 	}

-	// Build node config map for quick lookup
-	nodeConfigMap := make(map[string]*config.NodeConfig)
-	for name := range nodesConfig {
-		nodeCopy := nodesConfig[name]
-		nodeConfigMap[name] = &nodeCopy
+	// Initialize components
+	registry := newInstanceRegistry()
+
+	// Initialize port allocator
+	portRange := instancesConfig.PortRange
+	ports, err := newPortAllocator(portRange[0], portRange[1])
+	if err != nil {
+		log.Fatalf("Failed to create port allocator: %v", err)
 	}

+	// Initialize persistence
+	persistence, err := newInstancePersister(instancesConfig.InstancesDir)
+	if err != nil {
+		log.Fatalf("Failed to create instance persister: %v", err)
+	}
+
+	// Initialize remote manager
+	remote := newRemoteManager(nodesConfig, 30*time.Second)
+
+	// Create manager instance
 	im := &instanceManager{
-		instances:        make(map[string]*instance.Instance),
-		runningInstances: make(map[string]struct{}),
-		ports:            make(map[int]bool),
+		registry:        registry,
+		ports:           ports,
+		persistence:     persistence,
+		remote:          remote,
 		instancesConfig: instancesConfig,
 		backendsConfig:  backendsConfig,
 		localNodeName:   localNodeName,
-
-		timeoutChecker: time.NewTicker(time.Duration(instancesConfig.TimeoutCheckInterval) * time.Minute),
-		shutdownChan:   make(chan struct{}),
-		shutdownDone:   make(chan struct{}),
-
-		httpClient: &http.Client{
-			Timeout: 30 * time.Second,
-		},
-
-		instanceNodeMap: make(map[string]*config.NodeConfig),
-		nodeConfigMap:   nodeConfigMap,
 	}

+	// Initialize lifecycle manager (needs reference to manager for Stop/Evict operations)
+	checkInterval := time.Duration(instancesConfig.TimeoutCheckInterval) * time.Minute
+	im.lifecycle = newLifecycleManager(registry, im, checkInterval, true)
+
 	// Load existing instances from disk
 	if err := im.loadInstances(); err != nil {
 		log.Printf("Error loading instances: %v", err)
 	}

-	// Start the timeout checker goroutine after initialization is complete
-	go func() {
-		defer close(im.shutdownDone)
-
-		for {
-			select {
-			case <-im.timeoutChecker.C:
-				im.checkAllTimeouts()
-			case <-im.shutdownChan:
-				return // Exit goroutine on shutdown
-			}
-		}
-	}()
+	// Start the lifecycle manager
+	im.lifecycle.start()

 	return im
 }

-func (im *instanceManager) getNextAvailablePort() (int, error) {
-	portRange := im.instancesConfig.PortRange
-
-	for port := portRange[0]; port <= portRange[1]; port++ {
-		if !im.ports[port] {
-			im.ports[port] = true
-			return port, nil
-		}
-	}
-
-	return 0, fmt.Errorf("no available ports in the specified range")
-}
-
-// persistInstance saves an instance to its JSON file
-func (im *instanceManager) persistInstance(instance *instance.Instance) error {
-	if im.instancesConfig.InstancesDir == "" {
-		return nil // Persistence disabled
-	}
-
-	instancePath := filepath.Join(im.instancesConfig.InstancesDir, instance.Name+".json")
-	tempPath := instancePath + ".tmp"
-
-	// Serialize instance to JSON
-	jsonData, err := json.MarshalIndent(instance, "", " ")
-	if err != nil {
-		return fmt.Errorf("failed to marshal instance %s: %w", instance.Name, err)
-	}
-
-	// Write to temporary file first
-	if err := os.WriteFile(tempPath, jsonData, 0644); err != nil {
-		return fmt.Errorf("failed to write temp file for instance %s: %w", instance.Name, err)
-	}
-
-	// Atomic rename
-	if err := os.Rename(tempPath, instancePath); err != nil {
-		os.Remove(tempPath) // Clean up temp file
-		return fmt.Errorf("failed to rename temp file for instance %s: %w", instance.Name, err)
-	}
-
-	return nil
+// persistInstance saves an instance using the persistence component
+func (im *instanceManager) persistInstance(inst *instance.Instance) error {
+	return im.persistence.save(inst)
 }

 func (im *instanceManager) Shutdown() {
-	im.mu.Lock()
-
-	// Check if already shutdown
-	if im.isShutdown {
-		im.mu.Unlock()
-		return
-	}
-	im.isShutdown = true
-
-	// Signal the timeout checker to stop
-	close(im.shutdownChan)
-
-	// Create a list of running instances to stop
-	var runningInstances []*instance.Instance
-	var runningNames []string
-	for name, inst := range im.instances {
-		if inst.IsRunning() {
-			runningInstances = append(runningInstances, inst)
-			runningNames = append(runningNames, name)
-		}
-	}
-
-	// Release lock before stopping instances to avoid deadlock
-	im.mu.Unlock()
-
-	// Wait for the timeout checker goroutine to actually stop
-	<-im.shutdownDone
-
-	// Now stop the ticker
-	if im.timeoutChecker != nil {
-		im.timeoutChecker.Stop()
-	}
-
-	// Stop instances without holding the manager lock
-	var wg sync.WaitGroup
-	wg.Add(len(runningInstances))
-
-	for i, inst := range runningInstances {
-		go func(name string, inst *instance.Instance) {
-			defer wg.Done()
-			fmt.Printf("Stopping instance %s...\n", name)
-			// Attempt to stop the instance gracefully
-			if err := inst.Stop(); err != nil {
-				fmt.Printf("Error stopping instance %s: %v\n", name, err)
-			}
-		}(runningNames[i], inst)
-	}
-
-	wg.Wait()
-	fmt.Println("All instances stopped.")
+	im.shutdownOnce.Do(func() {
+		// 1. Stop lifecycle manager (stops timeout checker)
+		im.lifecycle.stop()
+
+		// 2. Get running instances (no lock needed - registry handles it)
+		running := im.registry.listRunning()
+
+		// 3. Stop local instances concurrently
+		var wg sync.WaitGroup
+		for _, inst := range running {
+			if inst.IsRemote() {
+				continue // Skip remote instances
+			}
+			wg.Add(1)
+			go func(inst *instance.Instance) {
+				defer wg.Done()
+				fmt.Printf("Stopping instance %s...\n", inst.Name)
+				if err := inst.Stop(); err != nil {
+					fmt.Printf("Error stopping instance %s: %v\n", inst.Name, err)
+				}
+			}(inst)
+		}
+
+		wg.Wait()
+		fmt.Println("All instances stopped.")
+	})
 }

-// loadInstances restores all instances from disk
+// loadInstances restores all instances from disk using the persistence component
 func (im *instanceManager) loadInstances() error {
-	if im.instancesConfig.InstancesDir == "" {
-		return nil // Persistence disabled
-	}
-
-	// Check if instances directory exists
-	if _, err := os.Stat(im.instancesConfig.InstancesDir); os.IsNotExist(err) {
-		return nil // No instances directory, start fresh
-	}
-
-	// Read all JSON files from instances directory
-	files, err := os.ReadDir(im.instancesConfig.InstancesDir)
+	// Load all instances from persistence
+	instances, err := im.persistence.loadAll()
 	if err != nil {
-		return fmt.Errorf("failed to read instances directory: %w", err)
+		return fmt.Errorf("failed to load instances: %w", err)
 	}

-	loadedCount := 0
-	for _, file := range files {
-		if file.IsDir() || !strings.HasSuffix(file.Name(), ".json") {
-			continue
-		}
-
-		instanceName := strings.TrimSuffix(file.Name(), ".json")
-		instancePath := filepath.Join(im.instancesConfig.InstancesDir, file.Name())
-
-		if err := im.loadInstance(instanceName, instancePath); err != nil {
-			log.Printf("Failed to load instance %s: %v", instanceName, err)
-			continue
-		}
-
-		loadedCount++
-	}
-
-	if loadedCount > 0 {
-		log.Printf("Loaded %d instances from persistence", loadedCount)
-		// Auto-start instances that have auto-restart enabled
-		go im.autoStartInstances()
-	}
+	if len(instances) == 0 {
+		return nil
+	}
+
+	// Process each loaded instance
+	for _, persistedInst := range instances {
+		if err := im.loadInstance(persistedInst); err != nil {
+			log.Printf("Failed to load instance %s: %v", persistedInst.Name, err)
+			continue
+		}
+	}
+
+	log.Printf("Loaded %d instances from persistence", len(instances))
+
+	// Auto-start instances that have auto-restart enabled
+	go im.autoStartInstances()

 	return nil
 }

-// loadInstance loads a single instance from its JSON file
-func (im *instanceManager) loadInstance(name, path string) error {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		return fmt.Errorf("failed to read instance file: %w", err)
-	}
-
-	var persistedInstance instance.Instance
-	if err := json.Unmarshal(data, &persistedInstance); err != nil {
-		return fmt.Errorf("failed to unmarshal instance: %w", err)
-	}
-
-	// Validate the instance name matches the filename
-	if persistedInstance.Name != name {
-		return fmt.Errorf("instance name mismatch: file=%s, instance.Name=%s", name, persistedInstance.Name)
-	}
-
-	options := persistedInstance.GetOptions()
+// loadInstance loads a single persisted instance and adds it to the registry
+func (im *instanceManager) loadInstance(persistedInst *instance.Instance) error {
+	name := persistedInst.Name
+	options := persistedInst.GetOptions()

 	// Check if this is a remote instance (local node not in the Nodes set)
 	var isRemote bool
 	var nodeName string
 	if options != nil {
-		if _, isLocal := options.Nodes[im.localNodeName]; !isLocal {
+		if _, isLocal := options.Nodes[im.localNodeName]; !isLocal && len(options.Nodes) > 0 {
 			// Get the first node from the set
 			for node := range options.Nodes {
 				nodeName = node
@@ -293,7 +179,7 @@ func (im *instanceManager) loadInstance(name, path string) error {
 	if !isRemote {
 		// Only set status callback for local instances
 		statusCallback = func(oldStatus, newStatus instance.Status) {
-			im.onStatusChange(persistedInstance.Name, oldStatus, newStatus)
+			im.onStatusChange(name, oldStatus, newStatus)
 		}
 	}

@@ -301,38 +187,42 @@ func (im *instanceManager) loadInstance(name, path string) error {
 	inst := instance.New(name, &im.backendsConfig, &im.instancesConfig, options, im.localNodeName, statusCallback)

 	// Restore persisted fields that NewInstance doesn't set
-	inst.Created = persistedInstance.Created
-	inst.SetStatus(persistedInstance.GetStatus())
+	inst.Created = persistedInst.Created
+	inst.SetStatus(persistedInst.GetStatus())

 	// Handle remote instance mapping
 	if isRemote {
-		nodeConfig, exists := im.nodeConfigMap[nodeName]
-		if !exists {
-			return fmt.Errorf("node %s not found for remote instance %s", nodeName, name)
+		// Map instance to node in remote manager
+		if err := im.remote.setInstanceNode(name, nodeName); err != nil {
+			return fmt.Errorf("failed to set instance node: %w", err)
 		}
-		im.instanceNodeMap[name] = nodeConfig
 	} else {
-		// Check for port conflicts only for local instances
+		// Allocate port for local instances
 		if inst.GetPort() > 0 {
 			port := inst.GetPort()
-			if im.ports[port] {
-				return fmt.Errorf("port conflict: instance %s wants port %d which is already in use", name, port)
+			if err := im.ports.allocateSpecific(port, name); err != nil {
+				return fmt.Errorf("port conflict: instance %s wants port %d which is already in use: %w", name, port, err)
 			}
-			im.ports[port] = true
 		}
 	}

-	im.instances[name] = inst
+	// Add instance to registry
+	if err := im.registry.add(inst); err != nil {
+		return fmt.Errorf("failed to add instance to registry: %w", err)
+	}
+
 	return nil
 }

 // autoStartInstances starts instances that were running when persisted and have auto-restart enabled
 // For instances with auto-restart disabled, it sets their status to Stopped
 func (im *instanceManager) autoStartInstances() {
-	im.mu.RLock()
+	instances := im.registry.list()
+
 	var instancesToStart []*instance.Instance
 	var instancesToStop []*instance.Instance
-	for _, inst := range im.instances {
+
+	for _, inst := range instances {
 		if inst.IsRunning() && // Was running when persisted
 			inst.GetOptions() != nil &&
 			inst.GetOptions().AutoRestart != nil {
@@ -344,12 +234,12 @@ func (im *instanceManager) autoStartInstances() {
 			}
 		}
 	}
-	im.mu.RUnlock()

 	// Stop instances that have auto-restart disabled
 	for _, inst := range instancesToStop {
 		log.Printf("Instance %s was running but auto-restart is disabled, setting status to stopped", inst.Name)
 		inst.SetStatus(instance.Stopped)
+		im.registry.markStopped(inst.Name)
 	}

 	// Start instances that have auto-restart enabled
@@ -357,11 +247,13 @@ func (im *instanceManager) autoStartInstances() {
 		log.Printf("Auto-starting instance %s", inst.Name)
 		// Reset running state before starting (since Start() expects stopped instance)
 		inst.SetStatus(instance.Stopped)
+		im.registry.markStopped(inst.Name)

 		// Check if this is a remote instance
-		if node := im.getNodeForInstance(inst); node != nil {
-			// Remote instance - use StartRemoteInstance
-			if _, err := im.StartRemoteInstance(node, inst.Name); err != nil {
+		if node, exists := im.remote.getNodeForInstance(inst.Name); exists && node != nil {
+			// Remote instance - use remote manager with context
+			ctx := context.Background()
+			if _, err := im.remote.startInstance(ctx, node, inst.Name); err != nil {
 				log.Printf("Failed to auto-start remote instance %s: %v", inst.Name, err)
 			}
 		} else {
@@ -374,13 +266,10 @@ func (im *instanceManager) autoStartInstances() {
 }

 func (im *instanceManager) onStatusChange(name string, oldStatus, newStatus instance.Status) {
-	im.mu.Lock()
-	defer im.mu.Unlock()
-
 	if newStatus == instance.Running {
-		im.runningInstances[name] = struct{}{}
+		im.registry.markRunning(name)
 	} else {
-		delete(im.runningInstances, name)
+		im.registry.markStopped(name)
 	}
 }

@@ -391,10 +280,27 @@ func (im *instanceManager) getNodeForInstance(inst *instance.Instance) *config.N
 		return nil
 	}

-	// Check if we have a cached mapping
-	if nodeConfig, exists := im.instanceNodeMap[inst.Name]; exists {
+	// Check if we have a node mapping in remote manager
+	if nodeConfig, exists := im.remote.getNodeForInstance(inst.Name); exists {
 		return nodeConfig
 	}

 	return nil
 }
+
+// lockInstance returns the lock for a specific instance, creating one if needed.
+// This allows concurrent operations on different instances while preventing
+// concurrent operations on the same instance.
+func (im *instanceManager) lockInstance(name string) *sync.Mutex {
+	lock, _ := im.instanceLocks.LoadOrStore(name, &sync.Mutex{})
+	return lock.(*sync.Mutex)
+}
+
+// unlockAndCleanup unlocks the instance lock and removes it from the map.
+// This should only be called when deleting an instance to prevent memory leaks.
+func (im *instanceManager) unlockAndCleanup(name string) {
+	if lock, ok := im.instanceLocks.Load(name); ok {
+		lock.(*sync.Mutex).Unlock()
+		im.instanceLocks.Delete(name)
+	}
+}
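The per-instance lock pattern above serializes operations on the same instance while letting operations on different instances proceed in parallel. The call sites are not all included in this excerpt; a typical one would presumably look like the following sketch (the surrounding method body is illustrative only, not taken from the commit):

    // Hypothetical usage of the per-instance lock helpers above; the real
    // operation bodies live elsewhere in pkg/manager and are not part of this hunk.
    func (im *instanceManager) stopInstanceExample(name string) error {
        lock := im.lockInstance(name) // one mutex per instance name
        lock.Lock()
        defer lock.Unlock()

        inst, exists := im.registry.get(name)
        if !exists {
            return fmt.Errorf("instance %s not found", name)
        }
        return inst.Stop()
    }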
@@ -8,67 +8,17 @@ import (
 	"llamactl/pkg/manager"
 	"os"
 	"path/filepath"
-	"strings"
 	"sync"
 	"testing"
 )

-func TestNewInstanceManager(t *testing.T) {
-	backendConfig := config.BackendConfig{
-		LlamaCpp: config.BackendSettings{
-			Command: "llama-server",
-		},
-		MLX: config.BackendSettings{
-			Command: "mlx_lm.server",
-		},
-	}
-
-	cfg := config.InstancesConfig{
-		PortRange:            [2]int{8000, 9000},
-		LogsDir:              "/tmp/test",
-		MaxInstances:         5,
-		DefaultAutoRestart:   true,
-		DefaultMaxRestarts:   3,
-		DefaultRestartDelay:  5,
-		TimeoutCheckInterval: 5,
-	}
-
-	mgr := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
-	if mgr == nil {
-		t.Fatal("NewInstanceManager returned nil")
-	}
-
-	// Test initial state
-	instances, err := mgr.ListInstances()
-	if err != nil {
-		t.Fatalf("ListInstances failed: %v", err)
-	}
-	if len(instances) != 0 {
-		t.Errorf("Expected empty instance list, got %d instances", len(instances))
-	}
-}
-
-func TestPersistence(t *testing.T) {
+func TestManager_PersistsAndLoadsInstances(t *testing.T) {
 	tempDir := t.TempDir()
+	cfg := createPersistenceConfig(tempDir)
+	backendConfig := createBackendConfig()

-	backendConfig := config.BackendConfig{
-		LlamaCpp: config.BackendSettings{
-			Command: "llama-server",
-		},
-		MLX: config.BackendSettings{
-			Command: "mlx_lm.server",
-		},
-	}
-
-	cfg := config.InstancesConfig{
-		PortRange:            [2]int{8000, 9000},
-		InstancesDir:         tempDir,
-		MaxInstances:         10,
-		TimeoutCheckInterval: 5,
-	}
-
-	// Test instance persistence on creation
-	manager1 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
+	// Create instance and check file was created
+	manager1 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
 	options := &instance.Options{
 		BackendOptions: backends.Options{
 			BackendType: backends.BackendTypeLlamaCpp,
@@ -84,14 +34,13 @@ func TestPersistence(t *testing.T) {
 		t.Fatalf("CreateInstance failed: %v", err)
 	}

-	// Check that JSON file was created
 	expectedPath := filepath.Join(tempDir, "test-instance.json")
 	if _, err := os.Stat(expectedPath); os.IsNotExist(err) {
 		t.Errorf("Expected persistence file %s to exist", expectedPath)
 	}

-	// Test loading instances from disk
-	manager2 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
+	// Load instances from disk
+	manager2 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
 	instances, err := manager2.ListInstances()
 	if err != nil {
 		t.Fatalf("ListInstances failed: %v", err)
@@ -102,15 +51,32 @@ func TestPersistence(t *testing.T) {
 	if instances[0].Name != "test-instance" {
 		t.Errorf("Expected loaded instance name 'test-instance', got %q", instances[0].Name)
 	}
+}
+
+func TestDeleteInstance_RemovesPersistenceFile(t *testing.T) {
+	tempDir := t.TempDir()
+	cfg := createPersistenceConfig(tempDir)
+	backendConfig := createBackendConfig()

-	// Test port map populated from loaded instances (port conflict should be detected)
-	_, err = manager2.CreateInstance("new-instance", options) // Same port
-	if err == nil || !strings.Contains(err.Error(), "port") {
-		t.Errorf("Expected port conflict error, got: %v", err)
+	mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
+	options := &instance.Options{
+		BackendOptions: backends.Options{
+			BackendType: backends.BackendTypeLlamaCpp,
+			LlamaServerOptions: &backends.LlamaServerOptions{
+				Model: "/path/to/model.gguf",
+				Port:  8080,
+			},
+		},
 	}

-	// Test file deletion on instance deletion
-	err = manager2.DeleteInstance("test-instance")
+	_, err := mgr.CreateInstance("test-instance", options)
+	if err != nil {
+		t.Fatalf("CreateInstance failed: %v", err)
+	}
+
+	expectedPath := filepath.Join(tempDir, "test-instance.json")
+
+	err = mgr.DeleteInstance("test-instance")
 	if err != nil {
 		t.Fatalf("DeleteInstance failed: %v", err)
 	}
@@ -168,114 +134,40 @@ func TestConcurrentAccess(t *testing.T) {
 	}
 }

-func TestShutdown(t *testing.T) {
-	mgr := createTestManager()
-
-	// Create test instance
-	options := &instance.Options{
-		BackendOptions: backends.Options{
-			BackendType: backends.BackendTypeLlamaCpp,
-			LlamaServerOptions: &backends.LlamaServerOptions{
-				Model: "/path/to/model.gguf",
-			},
-		},
-	}
-	_, err := mgr.CreateInstance("test-instance", options)
-	if err != nil {
-		t.Fatalf("CreateInstance failed: %v", err)
-	}
-
-	// Shutdown should not panic
-	mgr.Shutdown()
-
-	// Multiple shutdowns should not panic
-	mgr.Shutdown()
-}
-
-// Helper function to create a test manager with standard config
-func createTestManager() manager.InstanceManager {
-	backendConfig := config.BackendConfig{
+// Helper functions for test configuration
+func createBackendConfig() config.BackendConfig {
+	// Use 'sleep' as a test command instead of 'llama-server'
+	// This allows tests to run in CI environments without requiring actual LLM binaries
+	// The sleep command will be invoked with model paths and other args, which it ignores
+	return config.BackendConfig{
 		LlamaCpp: config.BackendSettings{
-			Command: "llama-server",
+			Command: "sleep",
 		},
 		MLX: config.BackendSettings{
-			Command: "mlx_lm.server",
+			Command: "sleep",
 		},
 	}
+}
+
+func createPersistenceConfig(dir string) config.InstancesConfig {
+	return config.InstancesConfig{
+		PortRange:            [2]int{8000, 9000},
+		InstancesDir:         dir,
+		MaxInstances:         10,
+		TimeoutCheckInterval: 5,
+	}
+}
+
+func createTestManager() manager.InstanceManager {
 	cfg := config.InstancesConfig{
 		PortRange:            [2]int{8000, 9000},
 		LogsDir:              "/tmp/test",
 		MaxInstances:         10,
+		MaxRunningInstances:  10,
 		DefaultAutoRestart:   true,
 		DefaultMaxRestarts:   3,
 		DefaultRestartDelay:  5,
 		TimeoutCheckInterval: 5,
 	}
-	return manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
-}
-
-func TestAutoRestartDisabledInstanceStatus(t *testing.T) {
-	tempDir := t.TempDir()
-
-	backendConfig := config.BackendConfig{
-		LlamaCpp: config.BackendSettings{
-			Command: "llama-server",
-		},
-	}
-
-	cfg := config.InstancesConfig{
-		PortRange:            [2]int{8000, 9000},
-		InstancesDir:         tempDir,
-		MaxInstances:         10,
-		TimeoutCheckInterval: 5,
-	}
-
-	// Create first manager and instance with auto-restart disabled
-	manager1 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
-
-	autoRestart := false
-	options := &instance.Options{
-		AutoRestart: &autoRestart,
-		BackendOptions: backends.Options{
-			BackendType: backends.BackendTypeLlamaCpp,
-			LlamaServerOptions: &backends.LlamaServerOptions{
-				Model: "/path/to/model.gguf",
-				Port:  8080,
-			},
-		},
-	}
-
-	inst, err := manager1.CreateInstance("test-instance", options)
-	if err != nil {
-		t.Fatalf("CreateInstance failed: %v", err)
-	}
-
-	// Simulate instance being in running state when persisted
-	// (this would happen if the instance was running when llamactl was stopped)
-	inst.SetStatus(instance.Running)
-
-	// Shutdown first manager
-	manager1.Shutdown()
-
-	// Create second manager (simulating restart of llamactl)
-	manager2 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
-
-	// Get the loaded instance
-	loadedInst, err := manager2.GetInstance("test-instance")
-	if err != nil {
-		t.Fatalf("GetInstance failed: %v", err)
-	}
-
-	// The instance should be marked as Stopped, not Running
-	// because auto-restart is disabled
-	if loadedInst.IsRunning() {
-		t.Errorf("Expected instance with auto-restart disabled to be stopped after manager restart, but it was running")
-	}
-
-	if loadedInst.GetStatus() != instance.Stopped {
-		t.Errorf("Expected instance status to be Stopped, got %v", loadedInst.GetStatus())
-	}
-
-	manager2.Shutdown()
+	return manager.New(createBackendConfig(), cfg, map[string]config.NodeConfig{}, "main")
 }
@@ -1,44 +1,27 @@
|
|||||||
package manager
|
package manager
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"llamactl/pkg/instance"
|
"llamactl/pkg/instance"
|
||||||
"llamactl/pkg/validation"
|
"log"
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type MaxRunningInstancesError error
|
type MaxRunningInstancesError error
|
||||||
|
|
||||||
// updateLocalInstanceFromRemote updates the local stub instance with data from the remote instance
|
// updateLocalInstanceFromRemote updates the local stub instance with data from the remote instance
|
||||||
// while preserving the Nodes field to maintain remote instance tracking
|
|
||||||
func (im *instanceManager) updateLocalInstanceFromRemote(localInst *instance.Instance, remoteInst *instance.Instance) {
|
func (im *instanceManager) updateLocalInstanceFromRemote(localInst *instance.Instance, remoteInst *instance.Instance) {
|
||||||
if localInst == nil || remoteInst == nil {
|
if localInst == nil || remoteInst == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the remote instance options
|
|
||||||
remoteOptions := remoteInst.GetOptions()
|
remoteOptions := remoteInst.GetOptions()
|
||||||
if remoteOptions == nil {
|
if remoteOptions == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Preserve the Nodes field from the local instance
|
|
||||||
localOptions := localInst.GetOptions()
|
|
||||||
var preservedNodes map[string]struct{}
|
|
||||||
if localOptions != nil && len(localOptions.Nodes) > 0 {
|
|
||||||
preservedNodes = make(map[string]struct{}, len(localOptions.Nodes))
|
|
||||||
for node := range localOptions.Nodes {
|
|
||||||
preservedNodes[node] = struct{}{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a copy of remote options and restore the Nodes field
|
|
||||||
updatedOptions := *remoteOptions
|
|
||||||
updatedOptions.Nodes = preservedNodes
|
|
||||||
|
|
||||||
// Update the local instance with all remote data
|
// Update the local instance with all remote data
|
||||||
localInst.SetOptions(&updatedOptions)
|
localInst.SetOptions(remoteOptions)
|
||||||
localInst.SetStatus(remoteInst.GetStatus())
|
localInst.SetStatus(remoteInst.GetStatus())
|
||||||
localInst.Created = remoteInst.Created
|
localInst.Created = remoteInst.Created
|
||||||
}
|
}
|
||||||
@@ -46,17 +29,13 @@ func (im *instanceManager) updateLocalInstanceFromRemote(localInst *instance.Ins
|
|||||||
// ListInstances returns a list of all instances managed by the instance manager.
|
// ListInstances returns a list of all instances managed by the instance manager.
|
||||||
// For remote instances, this fetches the live state from remote nodes and updates local stubs.
|
// For remote instances, this fetches the live state from remote nodes and updates local stubs.
|
||||||
func (im *instanceManager) ListInstances() ([]*instance.Instance, error) {
|
func (im *instanceManager) ListInstances() ([]*instance.Instance, error) {
|
||||||
im.mu.RLock()
|
instances := im.registry.list()
|
||||||
localInstances := make([]*instance.Instance, 0, len(im.instances))
|
|
||||||
for _, inst := range im.instances {
|
|
||||||
localInstances = append(localInstances, inst)
|
|
||||||
}
|
|
||||||
im.mu.RUnlock()
|
|
||||||
|
|
||||||
// Update remote instances with live state
|
// Update remote instances with live state
|
||||||
for _, inst := range localInstances {
|
ctx := context.Background()
|
||||||
|
for _, inst := range instances {
|
||||||
if node := im.getNodeForInstance(inst); node != nil {
|
if node := im.getNodeForInstance(inst); node != nil {
|
||||||
remoteInst, err := im.GetRemoteInstance(node, inst.Name)
|
remoteInst, err := im.remote.getInstance(ctx, node, inst.Name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// Log error but continue with stale data
|
// Log error but continue with stale data
|
||||||
// Don't fail the entire list operation due to one remote failure
|
// Don't fail the entire list operation due to one remote failure
|
||||||
@@ -64,13 +43,11 @@ func (im *instanceManager) ListInstances() ([]*instance.Instance, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update the local stub with all remote data (preserving Nodes)
|
// Update the local stub with all remote data (preserving Nodes)
|
||||||
im.mu.Lock()
|
|
||||||
im.updateLocalInstanceFromRemote(inst, remoteInst)
|
im.updateLocalInstanceFromRemote(inst, remoteInst)
|
||||||
im.mu.Unlock()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return localInstances, nil
|
return instances, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// CreateInstance creates a new instance with the given options and returns it.
|
// CreateInstance creates a new instance with the given options and returns it.
|
||||||
@@ -80,21 +57,13 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options
|
|||||||
return nil, fmt.Errorf("instance options cannot be nil")
|
return nil, fmt.Errorf("instance options cannot be nil")
|
||||||
}
|
}
|
||||||
|
|
||||||
name, err := validation.ValidateInstanceName(name)
|
err := options.BackendOptions.ValidateInstanceOptions()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = options.BackendOptions.ValidateInstanceOptions()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
im.mu.Lock()
|
|
||||||
defer im.mu.Unlock()
|
|
||||||
|
|
||||||
// Check if instance with this name already exists (must be globally unique)
|
// Check if instance with this name already exists (must be globally unique)
|
||||||
if im.instances[name] != nil {
|
if _, exists := im.registry.get(name); exists {
|
||||||
return nil, fmt.Errorf("instance with name %s already exists", name)
|
return nil, fmt.Errorf("instance with name %s already exists", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,14 +76,18 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate that the node exists
|
// Create the remote instance on the remote node
|
||||||
nodeConfig, exists := im.nodeConfigMap[nodeName]
|
ctx := context.Background()
|
||||||
|
nodeConfig, exists := im.remote.getNodeForInstance(nodeName)
|
||||||
if !exists {
|
if !exists {
|
||||||
|
// Try to set the node if it doesn't exist yet
|
||||||
|
if err := im.remote.setInstanceNode(name, nodeName); err != nil {
|
||||||
return nil, fmt.Errorf("node %s not found", nodeName)
|
return nil, fmt.Errorf("node %s not found", nodeName)
|
||||||
}
|
}
|
||||||
|
nodeConfig, _ = im.remote.getNodeForInstance(name)
|
||||||
|
}
|
||||||
|
|
||||||
// Create the remote instance on the remote node
|
remoteInst, err := im.remote.createInstance(ctx, nodeConfig, name, options)
|
||||||
remoteInst, err := im.CreateRemoteInstance(nodeConfig, name, options)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -126,12 +99,20 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options
 // Update the local stub with all remote data (preserving Nodes)
 im.updateLocalInstanceFromRemote(inst, remoteInst)
 
-// Add to local tracking maps (but don't count towards limits)
-im.instances[name] = inst
-im.instanceNodeMap[name] = nodeConfig
+// Map instance to node
+if err := im.remote.setInstanceNode(name, nodeName); err != nil {
+return nil, fmt.Errorf("failed to map instance to node: %w", err)
+}
 
+// Add to registry (doesn't count towards local limits)
+if err := im.registry.add(inst); err != nil {
+return nil, fmt.Errorf("failed to add instance to registry: %w", err)
+}
 
 // Persist the remote instance locally for tracking across restarts
 if err := im.persistInstance(inst); err != nil {
+// Rollback: remove from registry
+im.registry.remove(name)
 return nil, fmt.Errorf("failed to persist remote instance %s: %w", name, err)
 }
 
@@ -140,14 +121,34 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options
 
 // Local instance creation
 // Check max instances limit for local instances only
-localInstanceCount := len(im.instances) - len(im.instanceNodeMap)
+totalInstances := im.registry.count()
+remoteCount := 0
+for _, inst := range im.registry.list() {
+if inst.IsRemote() {
+remoteCount++
+}
+}
+localInstanceCount := totalInstances - remoteCount
 if localInstanceCount >= im.instancesConfig.MaxInstances && im.instancesConfig.MaxInstances != -1 {
 return nil, fmt.Errorf("maximum number of instances (%d) reached", im.instancesConfig.MaxInstances)
 }
 
 // Assign and validate port for backend-specific options
-if err := im.assignAndValidatePort(options); err != nil {
-return nil, err
+currentPort := im.getPortFromOptions(options)
+var allocatedPort int
+if currentPort == 0 {
+// Allocate a port if not specified
+allocatedPort, err = im.ports.allocate(name)
+if err != nil {
+return nil, fmt.Errorf("failed to allocate port: %w", err)
+}
+im.setPortInOptions(options, allocatedPort)
+} else {
+// Use the specified port
+if err := im.ports.allocateSpecific(currentPort, name); err != nil {
+return nil, fmt.Errorf("port %d is already in use: %w", currentPort, err)
+}
+allocatedPort = currentPort
 }
 
 statusCallback := func(oldStatus, newStatus instance.Status) {
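For orientation, a rough usage sketch of the new behaviour: when the backend options omit a port, CreateInstance draws one from the configured PortRange via the bitmap allocator added in pkg/manager/ports.go; when a port is given, it is reserved explicitly and conflicts are rejected. The config values below are illustrative, and other InstancesConfig fields may be needed in practice.

package main

import (
	"fmt"
	"log"

	"llamactl/pkg/backends"
	"llamactl/pkg/config"
	"llamactl/pkg/instance"
	"llamactl/pkg/manager"
)

func main() {
	backendCfg := config.BackendConfig{
		LlamaCpp: config.BackendSettings{Command: "llama-server"},
	}
	instancesCfg := config.InstancesConfig{
		PortRange:    [2]int{8000, 9000}, // ports are auto-allocated from this range
		MaxInstances: -1,                 // -1 = unlimited
	}
	mgr := manager.New(backendCfg, instancesCfg, map[string]config.NodeConfig{}, "main")

	// No Port set: the manager allocates the first free port in the range.
	opts := &instance.Options{
		BackendOptions: backends.Options{
			BackendType:        backends.BackendTypeLlamaCpp,
			LlamaServerOptions: &backends.LlamaServerOptions{Model: "/path/to/model.gguf"},
		},
	}
	inst, err := mgr.CreateInstance("auto-port", opts)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("allocated port:", inst.GetPort())
}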
@@ -155,10 +156,17 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options
 }
 
 inst := instance.New(name, &im.backendsConfig, &im.instancesConfig, options, im.localNodeName, statusCallback)
-im.instances[inst.Name] = inst
 
+// Add to registry
+if err := im.registry.add(inst); err != nil {
+// Rollback: release port
+im.ports.release(allocatedPort)
+return nil, fmt.Errorf("failed to add instance to registry: %w", err)
+}
 
+// Persist instance (best-effort, don't fail if persistence fails)
 if err := im.persistInstance(inst); err != nil {
-return nil, fmt.Errorf("failed to persist instance %s: %w", name, err)
+log.Printf("Warning: failed to persist instance %s: %v", name, err)
 }
 
 return inst, nil
@@ -167,25 +175,21 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options
 // GetInstance retrieves an instance by its name.
 // For remote instances, this fetches the live state from the remote node and updates the local stub.
 func (im *instanceManager) GetInstance(name string) (*instance.Instance, error) {
-im.mu.RLock()
-inst, exists := im.instances[name]
-im.mu.RUnlock()
+inst, exists := im.registry.get(name)
 
 if !exists {
 return nil, fmt.Errorf("instance with name %s not found", name)
 }
 
 // Check if instance is remote and fetch live state
 if node := im.getNodeForInstance(inst); node != nil {
-remoteInst, err := im.GetRemoteInstance(node, name)
+ctx := context.Background()
+remoteInst, err := im.remote.getInstance(ctx, node, name)
 if err != nil {
 return nil, err
 }
 
 // Update the local stub with all remote data (preserving Nodes)
-im.mu.Lock()
 im.updateLocalInstanceFromRemote(inst, remoteInst)
-im.mu.Unlock()
 
 // Return the local stub (preserving Nodes field)
 return inst, nil
@@ -197,29 +201,23 @@ func (im *instanceManager) GetInstance(name string) (*instance.Instance, error)
 // UpdateInstance updates the options of an existing instance and returns it.
 // If the instance is running, it will be restarted to apply the new options.
 func (im *instanceManager) UpdateInstance(name string, options *instance.Options) (*instance.Instance, error) {
-im.mu.RLock()
-inst, exists := im.instances[name]
-im.mu.RUnlock()
+inst, exists := im.registry.get(name)
 
 if !exists {
 return nil, fmt.Errorf("instance with name %s not found", name)
 }
 
 // Check if instance is remote and delegate to remote operation
 if node := im.getNodeForInstance(inst); node != nil {
-remoteInst, err := im.UpdateRemoteInstance(node, name, options)
+ctx := context.Background()
+remoteInst, err := im.remote.updateInstance(ctx, node, name, options)
 if err != nil {
 return nil, err
 }
 
 // Update the local stub with all remote data (preserving Nodes)
-im.mu.Lock()
 im.updateLocalInstanceFromRemote(inst, remoteInst)
-im.mu.Unlock()
 
 // Persist the updated remote instance locally
-im.mu.Lock()
-defer im.mu.Unlock()
 if err := im.persistInstance(inst); err != nil {
 return nil, fmt.Errorf("failed to persist updated remote instance %s: %w", name, err)
 }
@@ -236,6 +234,43 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options
 return nil, err
 }
 
+// Lock this specific instance only
+lock := im.lockInstance(name)
+lock.Lock()
+defer lock.Unlock()
+
+// Handle port changes
+oldPort := inst.GetPort()
+newPort := im.getPortFromOptions(options)
+var allocatedPort int
+
+if newPort != oldPort {
+// Port is changing - need to release old and allocate new
+if newPort == 0 {
+// Auto-allocate new port
+allocatedPort, err = im.ports.allocate(name)
+if err != nil {
+return nil, fmt.Errorf("failed to allocate new port: %w", err)
+}
+im.setPortInOptions(options, allocatedPort)
+} else {
+// Use specified port
+if err := im.ports.allocateSpecific(newPort, name); err != nil {
+return nil, fmt.Errorf("failed to allocate port %d: %w", newPort, err)
+}
+allocatedPort = newPort
+}
+
+// Release old port
+if oldPort > 0 {
+if err := im.ports.release(oldPort); err != nil {
+// Rollback new port allocation
+im.ports.release(allocatedPort)
+return nil, fmt.Errorf("failed to release old port %d: %w", oldPort, err)
+}
+}
+}
+
 // Check if instance is running before updating options
 wasRunning := inst.IsRunning()
 
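The port-change branch above follows an allocate-new-first, release-old-second pattern with rollback, so a failed release never leaves the instance without a reserved port. Distilled into a hypothetical helper (not part of this commit) purely for readability:

package manager

import "fmt"

// swapPort reserves newPort for name (auto-allocating when newPort == 0)
// before releasing oldPort, and rolls the reservation back if the release fails.
// It mirrors the inline logic in UpdateInstance above.
func swapPort(ports *portAllocator, name string, oldPort, newPort int) (int, error) {
	allocated := newPort
	if newPort == 0 {
		p, err := ports.allocate(name)
		if err != nil {
			return 0, fmt.Errorf("failed to allocate new port: %w", err)
		}
		allocated = p
	} else if err := ports.allocateSpecific(newPort, name); err != nil {
		return 0, fmt.Errorf("failed to allocate port %d: %w", newPort, err)
	}

	if oldPort > 0 {
		if err := ports.release(oldPort); err != nil {
			ports.release(allocated) // roll back the new reservation
			return 0, fmt.Errorf("failed to release old port %d: %w", oldPort, err)
		}
	}
	return allocated, nil
}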
@@ -256,8 +291,6 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options
 }
 }
 
-im.mu.Lock()
-defer im.mu.Unlock()
 if err := im.persistInstance(inst); err != nil {
 return nil, fmt.Errorf("failed to persist updated instance %s: %w", name, err)
 }
@@ -267,60 +300,51 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options
 
 // DeleteInstance removes stopped instance by its name.
 func (im *instanceManager) DeleteInstance(name string) error {
-im.mu.Lock()
-inst, exists := im.instances[name]
-im.mu.Unlock()
+inst, exists := im.registry.get(name)
 
 if !exists {
 return fmt.Errorf("instance with name %s not found", name)
 }
 
 // Check if instance is remote and delegate to remote operation
 if node := im.getNodeForInstance(inst); node != nil {
-err := im.DeleteRemoteInstance(node, name)
+ctx := context.Background()
+err := im.remote.deleteInstance(ctx, node, name)
 if err != nil {
 return err
 }
 
 // Clean up local tracking
-im.mu.Lock()
-defer im.mu.Unlock()
-delete(im.instances, name)
-delete(im.instanceNodeMap, name)
+im.remote.removeInstance(name)
+im.registry.remove(name)
 
-// Delete the instance's config file if persistence is enabled
-// Re-validate instance name for security (defense in depth)
-validatedName, err := validation.ValidateInstanceName(name)
-if err != nil {
-return fmt.Errorf("invalid instance name for file deletion: %w", err)
-}
-instancePath := filepath.Join(im.instancesConfig.InstancesDir, validatedName+".json")
-if err := os.Remove(instancePath); err != nil && !os.IsNotExist(err) {
-return fmt.Errorf("failed to delete config file for remote instance %s: %w", validatedName, err)
+// Delete the instance's persistence file
+if err := im.persistence.delete(name); err != nil {
+return fmt.Errorf("failed to delete config file for remote instance %s: %w", name, err)
 }
 
 return nil
 }
 
+// Lock this specific instance and clean up the lock on completion
+lock := im.lockInstance(name)
+lock.Lock()
+defer im.unlockAndCleanup(name)
+
 if inst.IsRunning() {
 return fmt.Errorf("instance with name %s is still running, stop it before deleting", name)
 }
 
-im.mu.Lock()
-defer im.mu.Unlock()
-
-delete(im.ports, inst.GetPort())
-delete(im.instances, name)
+// Release port (use ReleaseByInstance for proper cleanup)
+im.ports.releaseByInstance(name)
 
-// Delete the instance's config file if persistence is enabled
-// Re-validate instance name for security (defense in depth)
-validatedName, err := validation.ValidateInstanceName(inst.Name)
-if err != nil {
-return fmt.Errorf("invalid instance name for file deletion: %w", err)
+// Remove from registry
+if err := im.registry.remove(name); err != nil {
+return fmt.Errorf("failed to remove instance from registry: %w", err)
 }
-instancePath := filepath.Join(im.instancesConfig.InstancesDir, validatedName+".json")
-if err := os.Remove(instancePath); err != nil && !os.IsNotExist(err) {
-return fmt.Errorf("failed to delete config file for instance %s: %w", validatedName, err)
+
+// Delete persistence file
+if err := im.persistence.delete(name); err != nil {
+return fmt.Errorf("failed to delete config file for instance %s: %w", name, err)
 }
 
 return nil
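DeleteInstance (and the operations below) rely on lockInstance/unlockAndCleanup, which are not shown in this hunk. Purely as a sketch of what such a per-instance lock map could look like — the actual implementation in this commit may differ:

package manager

import "sync"

// instanceLockSketch is a hypothetical per-instance lock map.
type instanceLockSketch struct {
	locks sync.Map // map[string]*sync.Mutex
}

// lock returns the mutex for name, creating it on first use.
func (l *instanceLockSketch) lock(name string) *sync.Mutex {
	mu, _ := l.locks.LoadOrStore(name, &sync.Mutex{})
	return mu.(*sync.Mutex)
}

// unlockAndCleanup unlocks name's mutex and drops it from the map,
// which is useful once the instance has been deleted.
func (l *instanceLockSketch) unlockAndCleanup(name string) {
	if mu, ok := l.locks.Load(name); ok {
		l.locks.Delete(name)
		mu.(*sync.Mutex).Unlock()
	}
}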
@@ -329,45 +353,37 @@ func (im *instanceManager) DeleteInstance(name string) error {
 // StartInstance starts a stopped instance and returns it.
 // If the instance is already running, it returns an error.
 func (im *instanceManager) StartInstance(name string) (*instance.Instance, error) {
-im.mu.RLock()
-inst, exists := im.instances[name]
-im.mu.RUnlock()
+inst, exists := im.registry.get(name)
 
 if !exists {
 return nil, fmt.Errorf("instance with name %s not found", name)
 }
 
 // Check if instance is remote and delegate to remote operation
 if node := im.getNodeForInstance(inst); node != nil {
-remoteInst, err := im.StartRemoteInstance(node, name)
+ctx := context.Background()
+remoteInst, err := im.remote.startInstance(ctx, node, name)
 if err != nil {
 return nil, err
 }
 
 // Update the local stub with all remote data (preserving Nodes)
-im.mu.Lock()
 im.updateLocalInstanceFromRemote(inst, remoteInst)
-im.mu.Unlock()
 
 return inst, nil
 }
 
+// Lock this specific instance only
+lock := im.lockInstance(name)
+lock.Lock()
+defer lock.Unlock()
+
+// Idempotent: if already running, just return success
 if inst.IsRunning() {
-return inst, fmt.Errorf("instance with name %s is already running", name)
+return inst, nil
 }
 
 // Check max running instances limit for local instances only
-im.mu.RLock()
-localRunningCount := 0
-for instName := range im.runningInstances {
-if _, isRemote := im.instanceNodeMap[instName]; !isRemote {
-localRunningCount++
-}
-}
-maxRunningExceeded := localRunningCount >= im.instancesConfig.MaxRunningInstances && im.instancesConfig.MaxRunningInstances != -1
-im.mu.RUnlock()
-
-if maxRunningExceeded {
+if im.IsMaxRunningInstancesReached() {
 return nil, MaxRunningInstancesError(fmt.Errorf("maximum number of running instances (%d) reached", im.instancesConfig.MaxRunningInstances))
 }
 
@@ -375,65 +391,68 @@ func (im *instanceManager) StartInstance(name string) (*instance.Instance, error
 return nil, fmt.Errorf("failed to start instance %s: %w", name, err)
 }
 
-im.mu.Lock()
-defer im.mu.Unlock()
-err := im.persistInstance(inst)
-if err != nil {
-return nil, fmt.Errorf("failed to persist instance %s: %w", name, err)
+// Persist instance (best-effort, don't fail if persistence fails)
+if err := im.persistInstance(inst); err != nil {
+log.Printf("Warning: failed to persist instance %s: %v", name, err)
 }
 
 return inst, nil
 }
 
 func (im *instanceManager) IsMaxRunningInstancesReached() bool {
-im.mu.RLock()
-defer im.mu.RUnlock()
-
-if im.instancesConfig.MaxRunningInstances != -1 && len(im.runningInstances) >= im.instancesConfig.MaxRunningInstances {
-return true
+if im.instancesConfig.MaxRunningInstances == -1 {
+return false
 }
 
-return false
+// Count only local running instances (each node has its own limits)
+localRunningCount := 0
+for _, inst := range im.registry.listRunning() {
+if !inst.IsRemote() {
+localRunningCount++
+}
+}
+
+return localRunningCount >= im.instancesConfig.MaxRunningInstances
 }
 
 // StopInstance stops a running instance and returns it.
 func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) {
-im.mu.RLock()
-inst, exists := im.instances[name]
-im.mu.RUnlock()
+inst, exists := im.registry.get(name)
 
 if !exists {
 return nil, fmt.Errorf("instance with name %s not found", name)
 }
 
 // Check if instance is remote and delegate to remote operation
 if node := im.getNodeForInstance(inst); node != nil {
-remoteInst, err := im.StopRemoteInstance(node, name)
+ctx := context.Background()
+remoteInst, err := im.remote.stopInstance(ctx, node, name)
 if err != nil {
 return nil, err
 }
 
 // Update the local stub with all remote data (preserving Nodes)
-im.mu.Lock()
 im.updateLocalInstanceFromRemote(inst, remoteInst)
-im.mu.Unlock()
 
 return inst, nil
 }
 
+// Lock this specific instance only
+lock := im.lockInstance(name)
+lock.Lock()
+defer lock.Unlock()
+
+// Idempotent: if already stopped, just return success
 if !inst.IsRunning() {
-return inst, fmt.Errorf("instance with name %s is already stopped", name)
+return inst, nil
 }
 
 if err := inst.Stop(); err != nil {
 return nil, fmt.Errorf("failed to stop instance %s: %w", name, err)
 }
 
-im.mu.Lock()
-defer im.mu.Unlock()
-err := im.persistInstance(inst)
-if err != nil {
-return nil, fmt.Errorf("failed to persist instance %s: %w", name, err)
+// Persist instance (best-effort, don't fail if persistence fails)
+if err := im.persistInstance(inst); err != nil {
+log.Printf("Warning: failed to persist instance %s: %v", name, err)
 }
 
 return inst, nil
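Note the behavioural change above: StartInstance and StopInstance are now idempotent and return the instance with a nil error when it is already in the requested state. A possible additional test (not part of this commit) that pins that down, using the same helpers as the tests below:

func TestStartInstance_IsIdempotent(t *testing.T) {
	mgr := createTestManager()
	defer mgr.Shutdown()

	options := &instance.Options{
		BackendOptions: backends.Options{
			BackendType:        backends.BackendTypeLlamaCpp,
			LlamaServerOptions: &backends.LlamaServerOptions{Model: "/path/to/model.gguf"},
		},
	}
	if _, err := mgr.CreateInstance("idempotent", options); err != nil {
		t.Fatalf("CreateInstance failed: %v", err)
	}
	if _, err := mgr.StartInstance("idempotent"); err != nil {
		t.Fatalf("StartInstance failed: %v", err)
	}
	// A second start is now a no-op instead of an "already running" error.
	if _, err := mgr.StartInstance("idempotent"); err != nil {
		t.Errorf("expected idempotent StartInstance, got: %v", err)
	}
}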
@@ -441,49 +460,61 @@ func (im *instanceManager) StopInstance(name string) (*instance.Instance, error)
 
 // RestartInstance stops and then starts an instance, returning the updated instance.
 func (im *instanceManager) RestartInstance(name string) (*instance.Instance, error) {
-im.mu.RLock()
-inst, exists := im.instances[name]
-im.mu.RUnlock()
+inst, exists := im.registry.get(name)
 
 if !exists {
 return nil, fmt.Errorf("instance with name %s not found", name)
 }
 
 // Check if instance is remote and delegate to remote operation
 if node := im.getNodeForInstance(inst); node != nil {
-remoteInst, err := im.RestartRemoteInstance(node, name)
+ctx := context.Background()
+remoteInst, err := im.remote.restartInstance(ctx, node, name)
 if err != nil {
 return nil, err
 }
 
 // Update the local stub with all remote data (preserving Nodes)
-im.mu.Lock()
 im.updateLocalInstanceFromRemote(inst, remoteInst)
-im.mu.Unlock()
 
 return inst, nil
 }
 
-inst, err := im.StopInstance(name)
-if err != nil {
-return nil, err
+// Lock this specific instance for the entire restart operation to ensure atomicity
+lock := im.lockInstance(name)
+lock.Lock()
+defer lock.Unlock()
+
+// Stop the instance
+if inst.IsRunning() {
+if err := inst.Stop(); err != nil {
+return nil, fmt.Errorf("failed to stop instance %s: %w", name, err)
 }
-return im.StartInstance(inst.Name)
+}
+
+// Start the instance
+if err := inst.Start(); err != nil {
+return nil, fmt.Errorf("failed to start instance %s: %w", name, err)
+}
+
+// Persist the restarted instance
+if err := im.persistInstance(inst); err != nil {
+log.Printf("Warning: failed to persist instance %s: %v", name, err)
+}
+
+return inst, nil
 }
 
 // GetInstanceLogs retrieves the logs for a specific instance by its name.
 func (im *instanceManager) GetInstanceLogs(name string, numLines int) (string, error) {
-im.mu.RLock()
-inst, exists := im.instances[name]
-im.mu.RUnlock()
+inst, exists := im.registry.get(name)
 
 if !exists {
 return "", fmt.Errorf("instance with name %s not found", name)
 }
 
 // Check if instance is remote and delegate to remote operation
 if node := im.getNodeForInstance(inst); node != nil {
-return im.GetRemoteInstanceLogs(node, name, numLines)
+ctx := context.Background()
+return im.remote.getInstanceLogs(ctx, node, name, numLines)
 }
 
 // Get logs from the local instance
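On the restart rewrite above: with per-instance locking, RestartInstance can no longer be composed from StopInstance and StartInstance, because those now acquire the same non-reentrant lock; holding the lock once and calling inst.Stop/inst.Start directly keeps the whole restart atomic. The hazard in miniature (illustrative only, not code from this repository):

package main

import "sync"

var mu sync.Mutex

func start() {
	mu.Lock()
	defer mu.Unlock()
	// ... start work ...
}

func restart() {
	mu.Lock()
	defer mu.Unlock()
	start() // would block forever: sync.Mutex is not reentrant
}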
@@ -500,27 +531,7 @@ func (im *instanceManager) setPortInOptions(options *instance.Options, port int)
 options.BackendOptions.SetPort(port)
 }
 
-// assignAndValidatePort assigns a port if not specified and validates it's not in use
-func (im *instanceManager) assignAndValidatePort(options *instance.Options) error {
-currentPort := im.getPortFromOptions(options)
-
-if currentPort == 0 {
-// Assign a port if not specified
-port, err := im.getNextAvailablePort()
-if err != nil {
-return fmt.Errorf("failed to get next available port: %w", err)
-}
-im.setPortInOptions(options, port)
-// Mark the port as used
-im.ports[port] = true
-} else {
-// Validate the specified port
-if _, exists := im.ports[currentPort]; exists {
-return fmt.Errorf("port %d is already in use", currentPort)
-}
-// Mark the port as used
-im.ports[currentPort] = true
-}
-
-return nil
+// EvictLRUInstance finds and stops the least recently used running instance.
+func (im *instanceManager) EvictLRUInstance() error {
+return im.lifecycle.evictLRU()
 }
@@ -9,37 +9,7 @@ import (
 "testing"
 )
 
-func TestCreateInstance_Success(t *testing.T) {
-manager := createTestManager()
-
-options := &instance.Options{
-BackendOptions: backends.Options{
-BackendType: backends.BackendTypeLlamaCpp,
-LlamaServerOptions: &backends.LlamaServerOptions{
-Model: "/path/to/model.gguf",
-Port: 8080,
-},
-},
-}
-
-inst, err := manager.CreateInstance("test-instance", options)
-if err != nil {
-t.Fatalf("CreateInstance failed: %v", err)
-}
-
-if inst.Name != "test-instance" {
-t.Errorf("Expected instance name 'test-instance', got %q", inst.Name)
-}
-if inst.GetStatus() != instance.Stopped {
-t.Error("New instance should not be running")
-}
-if inst.GetPort() != 8080 {
-t.Errorf("Expected port 8080, got %d", inst.GetPort())
-}
-}
-
-func TestCreateInstance_ValidationAndLimits(t *testing.T) {
-// Test duplicate names
+func TestCreateInstance_FailsWithDuplicateName(t *testing.T) {
 mngr := createTestManager()
 options := &instance.Options{
 BackendOptions: backends.Options{
@@ -63,24 +33,31 @@ func TestCreateInstance_ValidationAndLimits(t *testing.T) {
 if !strings.Contains(err.Error(), "already exists") {
 t.Errorf("Expected duplicate name error, got: %v", err)
 }
+}
 
-// Test max instances limit
+func TestCreateInstance_FailsWhenMaxInstancesReached(t *testing.T) {
 backendConfig := config.BackendConfig{
 LlamaCpp: config.BackendSettings{
 Command: "llama-server",
 },
-MLX: config.BackendSettings{
-Command: "mlx_lm.server",
-},
 }
 cfg := config.InstancesConfig{
 PortRange: [2]int{8000, 9000},
 MaxInstances: 1, // Very low limit for testing
 TimeoutCheckInterval: 5,
 }
-limitedManager := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
+limitedManager := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
 
-_, err = limitedManager.CreateInstance("instance1", options)
+options := &instance.Options{
+BackendOptions: backends.Options{
+BackendType: backends.BackendTypeLlamaCpp,
+LlamaServerOptions: &backends.LlamaServerOptions{
+Model: "/path/to/model.gguf",
+},
+},
+}
+
+_, err := limitedManager.CreateInstance("instance1", options)
 if err != nil {
 t.Fatalf("CreateInstance 1 failed: %v", err)
 }
@@ -95,36 +72,31 @@ func TestCreateInstance_ValidationAndLimits(t *testing.T) {
 }
 }
 
-func TestPortManagement(t *testing.T) {
+func TestCreateInstance_FailsWithPortConflict(t *testing.T) {
 manager := createTestManager()
 
-// Test auto port assignment
 options1 := &instance.Options{
 BackendOptions: backends.Options{
 BackendType: backends.BackendTypeLlamaCpp,
 LlamaServerOptions: &backends.LlamaServerOptions{
 Model: "/path/to/model.gguf",
+Port: 8080,
 },
 },
 }
 
-inst1, err := manager.CreateInstance("instance1", options1)
+_, err := manager.CreateInstance("instance1", options1)
 if err != nil {
 t.Fatalf("CreateInstance failed: %v", err)
 }
 
-port1 := inst1.GetPort()
-if port1 < 8000 || port1 > 9000 {
-t.Errorf("Expected port in range 8000-9000, got %d", port1)
-}
-
-// Test port conflict detection
+// Try to create instance with same port
 options2 := &instance.Options{
 BackendOptions: backends.Options{
 BackendType: backends.BackendTypeLlamaCpp,
 LlamaServerOptions: &backends.LlamaServerOptions{
 Model: "/path/to/model2.gguf",
-Port: port1, // Same port - should conflict
+Port: 8080, // Same port - should conflict
 },
 },
 }
@@ -136,37 +108,9 @@ func TestPortManagement(t *testing.T) {
 if !strings.Contains(err.Error(), "port") && !strings.Contains(err.Error(), "in use") {
 t.Errorf("Expected port conflict error, got: %v", err)
 }
-
-// Test port release on deletion
-specificPort := 8080
-options3 := &instance.Options{
-BackendOptions: backends.Options{
-BackendType: backends.BackendTypeLlamaCpp,
-LlamaServerOptions: &backends.LlamaServerOptions{
-Model: "/path/to/model.gguf",
-Port: specificPort,
-},
-},
-}
-
-_, err = manager.CreateInstance("port-test", options3)
-if err != nil {
-t.Fatalf("CreateInstance failed: %v", err)
-}
-
-err = manager.DeleteInstance("port-test")
-if err != nil {
-t.Fatalf("DeleteInstance failed: %v", err)
-}
-
-// Should be able to create new instance with same port
-_, err = manager.CreateInstance("new-port-test", options3)
-if err != nil {
-t.Errorf("Expected to reuse port after deletion, got error: %v", err)
-}
 }
 
-func TestInstanceOperations(t *testing.T) {
+func TestInstanceOperations_FailWithNonExistentInstance(t *testing.T) {
 manager := createTestManager()
 
 options := &instance.Options{
@@ -178,62 +122,7 @@ func TestInstanceOperations(t *testing.T) {
 },
 }
 
-// Create instance
-created, err := manager.CreateInstance("test-instance", options)
-if err != nil {
-t.Fatalf("CreateInstance failed: %v", err)
-}
-
-// Get instance
-retrieved, err := manager.GetInstance("test-instance")
-if err != nil {
-t.Fatalf("GetInstance failed: %v", err)
-}
-if retrieved.Name != created.Name {
-t.Errorf("Expected name %q, got %q", created.Name, retrieved.Name)
-}
-
-// Update instance
-newOptions := &instance.Options{
-BackendOptions: backends.Options{
-BackendType: backends.BackendTypeLlamaCpp,
-LlamaServerOptions: &backends.LlamaServerOptions{
-Model: "/path/to/new-model.gguf",
-Port: 8081,
-},
-},
-}
-
-updated, err := manager.UpdateInstance("test-instance", newOptions)
-if err != nil {
-t.Fatalf("UpdateInstance failed: %v", err)
-}
-if updated.GetOptions().BackendOptions.LlamaServerOptions.Model != "/path/to/new-model.gguf" {
-t.Errorf("Expected model '/path/to/new-model.gguf', got %q", updated.GetOptions().BackendOptions.LlamaServerOptions.Model)
-}
-
-// List instances
-instances, err := manager.ListInstances()
-if err != nil {
-t.Fatalf("ListInstances failed: %v", err)
-}
-if len(instances) != 1 {
-t.Errorf("Expected 1 instance, got %d", len(instances))
-}
-
-// Delete instance
-err = manager.DeleteInstance("test-instance")
-if err != nil {
-t.Fatalf("DeleteInstance failed: %v", err)
-}
-
-_, err = manager.GetInstance("test-instance")
-if err == nil {
-t.Error("Instance should not exist after deletion")
-}
-
-// Test operations on non-existent instances
-_, err = manager.GetInstance("nonexistent")
+_, err := manager.GetInstance("nonexistent")
 if err == nil || !strings.Contains(err.Error(), "not found") {
 t.Errorf("Expected 'not found' error, got: %v", err)
 }
@@ -248,3 +137,143 @@ func TestInstanceOperations(t *testing.T) {
 t.Errorf("Expected 'not found' error, got: %v", err)
 }
 }
 
+func TestDeleteInstance_RunningInstanceFails(t *testing.T) {
+mgr := createTestManager()
+defer mgr.Shutdown()
+
+options := &instance.Options{
+BackendOptions: backends.Options{
+BackendType: backends.BackendTypeLlamaCpp,
+LlamaServerOptions: &backends.LlamaServerOptions{
+Model: "/path/to/model.gguf",
+},
+},
+}
+
+_, err := mgr.CreateInstance("test-instance", options)
+if err != nil {
+t.Fatalf("CreateInstance failed: %v", err)
+}
+
+_, err = mgr.StartInstance("test-instance")
+if err != nil {
+t.Fatalf("StartInstance failed: %v", err)
+}
+
+// Should fail to delete running instance
+err = mgr.DeleteInstance("test-instance")
+if err == nil {
+t.Error("Expected error when deleting running instance")
+}
+}
+
+func TestUpdateInstance(t *testing.T) {
+mgr := createTestManager()
+defer mgr.Shutdown()
+
+options := &instance.Options{
+BackendOptions: backends.Options{
+BackendType: backends.BackendTypeLlamaCpp,
+LlamaServerOptions: &backends.LlamaServerOptions{
+Model: "/path/to/model.gguf",
+Port: 8080,
+},
+},
+}
+
+_, err := mgr.CreateInstance("test-instance", options)
+if err != nil {
+t.Fatalf("CreateInstance failed: %v", err)
+}
+
+_, err = mgr.StartInstance("test-instance")
+if err != nil {
+t.Fatalf("StartInstance failed: %v", err)
+}
+
+// Update running instance with new model
+newOptions := &instance.Options{
+BackendOptions: backends.Options{
+BackendType: backends.BackendTypeLlamaCpp,
+LlamaServerOptions: &backends.LlamaServerOptions{
+Model: "/path/to/new-model.gguf",
+Port: 8080,
+},
+},
+}
+
+updated, err := mgr.UpdateInstance("test-instance", newOptions)
+if err != nil {
+t.Fatalf("UpdateInstance failed: %v", err)
+}
+
+// Should still be running after update
+if !updated.IsRunning() {
+t.Error("Instance should be running after update")
+}
+
+if updated.GetOptions().BackendOptions.LlamaServerOptions.Model != "/path/to/new-model.gguf" {
+t.Errorf("Expected model to be updated")
+}
+}
+
+func TestUpdateInstance_ReleasesOldPort(t *testing.T) {
+mgr := createTestManager()
+defer mgr.Shutdown()
+
+options := &instance.Options{
+BackendOptions: backends.Options{
+BackendType: backends.BackendTypeLlamaCpp,
+LlamaServerOptions: &backends.LlamaServerOptions{
+Model: "/path/to/model.gguf",
+Port: 8080,
+},
+},
+}
+
+inst, err := mgr.CreateInstance("test-instance", options)
+if err != nil {
+t.Fatalf("CreateInstance failed: %v", err)
+}
+
+if inst.GetPort() != 8080 {
+t.Errorf("Expected port 8080, got %d", inst.GetPort())
+}
+
+// Update with new port
+newOptions := &instance.Options{
+BackendOptions: backends.Options{
+BackendType: backends.BackendTypeLlamaCpp,
+LlamaServerOptions: &backends.LlamaServerOptions{
+Model: "/path/to/model.gguf",
+Port: 8081,
+},
+},
+}
+
+updated, err := mgr.UpdateInstance("test-instance", newOptions)
+if err != nil {
+t.Fatalf("UpdateInstance failed: %v", err)
+}
+
+if updated.GetPort() != 8081 {
+t.Errorf("Expected port 8081, got %d", updated.GetPort())
+}
+
+// Old port should be released - try to create new instance with old port
+options2 := &instance.Options{
+BackendOptions: backends.Options{
+BackendType: backends.BackendTypeLlamaCpp,
+LlamaServerOptions: &backends.LlamaServerOptions{
+Model: "/path/to/model2.gguf",
+Port: 8080,
+},
+},
+}
+
+_, err = mgr.CreateInstance("test-instance-2", options2)
+if err != nil {
+t.Errorf("Should be able to use old port 8080: %v", err)
+}
+}
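The tests above assume a createTestManager() helper defined elsewhere in the package. A plausible shape for it, inferred from the limited-manager setup in TestCreateInstance_FailsWhenMaxInstancesReached — the exact config values and the return type are assumptions, not part of this diff:

func createTestManager() manager.InstanceManager {
	backendConfig := config.BackendConfig{
		LlamaCpp: config.BackendSettings{Command: "llama-server"},
	}
	cfg := config.InstancesConfig{
		PortRange:            [2]int{8000, 9000},
		MaxInstances:         -1, // unlimited for most tests
		TimeoutCheckInterval: 5,
	}
	return manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
}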
pkg/manager/persistence.go (new file, 223 lines)
@@ -0,0 +1,223 @@
package manager

import (
	"encoding/json"
	"fmt"
	"llamactl/pkg/instance"
	"log"
	"os"
	"path/filepath"
	"strings"
	"sync"
)

// instancePersister provides atomic file-based persistence with durability guarantees.
type instancePersister struct {
	mu           sync.Mutex
	instancesDir string
	enabled      bool
}

// newInstancePersister creates a new instance persister.
// If instancesDir is empty, persistence is disabled.
func newInstancePersister(instancesDir string) (*instancePersister, error) {
	if instancesDir == "" {
		return &instancePersister{
			enabled: false,
		}, nil
	}

	// Ensure the instances directory exists
	if err := os.MkdirAll(instancesDir, 0755); err != nil {
		return nil, fmt.Errorf("failed to create instances directory: %w", err)
	}

	return &instancePersister{
		instancesDir: instancesDir,
		enabled:      true,
	}, nil
}

// Save persists an instance to disk with atomic write
func (p *instancePersister) save(inst *instance.Instance) error {
	if !p.enabled {
		return nil
	}

	if inst == nil {
		return fmt.Errorf("cannot save nil instance")
	}

	// Validate instance name to prevent path traversal
	validatedName, err := p.validateInstanceName(inst.Name)
	if err != nil {
		return err
	}

	p.mu.Lock()
	defer p.mu.Unlock()

	instancePath := filepath.Join(p.instancesDir, validatedName+".json")
	tempPath := instancePath + ".tmp"

	// Serialize instance to JSON
	jsonData, err := json.MarshalIndent(inst, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal instance %s: %w", inst.Name, err)
	}

	// Create temporary file
	tempFile, err := os.OpenFile(tempPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("failed to create temp file for instance %s: %w", inst.Name, err)
	}

	// Write data to temporary file
	if _, err := tempFile.Write(jsonData); err != nil {
		tempFile.Close()
		os.Remove(tempPath)
		return fmt.Errorf("failed to write temp file for instance %s: %w", inst.Name, err)
	}

	// Sync to disk before rename to ensure durability
	if err := tempFile.Sync(); err != nil {
		tempFile.Close()
		os.Remove(tempPath)
		return fmt.Errorf("failed to sync temp file for instance %s: %w", inst.Name, err)
	}

	// Close the file
	if err := tempFile.Close(); err != nil {
		os.Remove(tempPath)
		return fmt.Errorf("failed to close temp file for instance %s: %w", inst.Name, err)
	}

	// Atomic rename (this is atomic on POSIX systems)
	if err := os.Rename(tempPath, instancePath); err != nil {
		os.Remove(tempPath)
		return fmt.Errorf("failed to rename temp file for instance %s: %w", inst.Name, err)
	}

	return nil
}

// Delete removes an instance's persistence file from disk.
func (p *instancePersister) delete(name string) error {
	if !p.enabled {
		return nil
	}

	validatedName, err := p.validateInstanceName(name)
	if err != nil {
		return err
	}

	p.mu.Lock()
	defer p.mu.Unlock()

	instancePath := filepath.Join(p.instancesDir, validatedName+".json")

	if err := os.Remove(instancePath); err != nil {
		if os.IsNotExist(err) {
			// Not an error if file doesn't exist
			return nil
		}
		return fmt.Errorf("failed to delete instance file for %s: %w", name, err)
	}

	return nil
}

// LoadAll loads all persisted instances from disk.
// Returns a slice of instances and any errors encountered during loading.
func (p *instancePersister) loadAll() ([]*instance.Instance, error) {
	if !p.enabled {
		return nil, nil
	}

	p.mu.Lock()
	defer p.mu.Unlock()

	// Check if instances directory exists
	if _, err := os.Stat(p.instancesDir); os.IsNotExist(err) {
		return nil, nil // No instances directory, return empty list
	}

	// Read all JSON files from instances directory
	files, err := os.ReadDir(p.instancesDir)
	if err != nil {
		return nil, fmt.Errorf("failed to read instances directory: %w", err)
	}

	instances := make([]*instance.Instance, 0)
	var loadErrors []string

	for _, file := range files {
		if file.IsDir() || !strings.HasSuffix(file.Name(), ".json") {
			continue
		}

		instanceName := strings.TrimSuffix(file.Name(), ".json")
		instancePath := filepath.Join(p.instancesDir, file.Name())

		inst, err := p.loadInstanceFile(instanceName, instancePath)
		if err != nil {
			log.Printf("Failed to load instance %s: %v", instanceName, err)
			loadErrors = append(loadErrors, fmt.Sprintf("%s: %v", instanceName, err))
			continue
		}

		instances = append(instances, inst)
	}

	if len(loadErrors) > 0 {
		log.Printf("Loaded %d instances with %d errors", len(instances), len(loadErrors))
	} else if len(instances) > 0 {
		log.Printf("Loaded %d instances from persistence", len(instances))
	}

	return instances, nil
}

// loadInstanceFile is an internal helper that loads a single instance file.
// Note: This assumes the mutex is already held by the caller.
func (p *instancePersister) loadInstanceFile(name, path string) (*instance.Instance, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read instance file: %w", err)
	}

	var inst instance.Instance
	if err := json.Unmarshal(data, &inst); err != nil {
		return nil, fmt.Errorf("failed to unmarshal instance: %w", err)
	}

	// Validate the instance name matches the filename
	if inst.Name != name {
		return nil, fmt.Errorf("instance name mismatch: file=%s, instance.Name=%s", name, inst.Name)
	}

	return &inst, nil
}

// validateInstanceName ensures the instance name is safe for filesystem operations.
// Returns the validated name if valid, or an error if invalid.
func (p *instancePersister) validateInstanceName(name string) (string, error) {
	if name == "" {
		return "", fmt.Errorf("instance name cannot be empty")
	}

	// Check for path separators and parent directory references
	// This prevents path traversal attacks
	if strings.Contains(name, "/") || strings.Contains(name, "\\") || strings.Contains(name, "..") {
		return "", fmt.Errorf("invalid instance name: %s (cannot contain path separators or '..')", name)
	}

	// Additional check: ensure the name doesn't start with a dot (hidden files)
	// or contain any other suspicious characters
	if strings.HasPrefix(name, ".") {
		return "", fmt.Errorf("invalid instance name: %s (cannot start with '.')", name)
	}

	return name, nil
}
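A small usage sketch for the persister added above, as it would typically be driven at manager startup — the directory path is an assumption, not taken from this commit:

package manager

import "log"

// restoreInstancesSketch is illustrative only; the real wiring lives in the manager.
func restoreInstancesSketch() {
	persister, err := newInstancePersister("/var/lib/llamactl/instances")
	if err != nil {
		log.Fatalf("init persister: %v", err)
	}

	// loadAll skips files it cannot parse and logs each failure,
	// so one corrupt JSON file does not block startup.
	instances, err := persister.loadAll()
	if err != nil {
		log.Fatalf("read instances dir: %v", err)
	}
	for _, inst := range instances {
		log.Printf("restored instance %q from disk", inst.Name)
	}
}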
184
pkg/manager/ports.go
Normal file
184
pkg/manager/ports.go
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
package manager
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math/bits"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// portAllocator provides efficient port allocation using a bitmap for O(1) operations.
|
||||||
|
// The bitmap approach prevents unbounded memory growth and simplifies port management.
|
||||||
|
type portAllocator struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
|
||||||
|
// Bitmap for O(1) allocation/release
|
||||||
|
// Each bit represents a port (1 = allocated, 0 = free)
|
||||||
|
bitmap []uint64 // Each uint64 covers 64 ports
|
||||||
|
|
||||||
|
// Map port to instance name for cleanup operations
|
||||||
|
allocated map[int]string
|
||||||
|
|
||||||
|
minPort int
|
||||||
|
maxPort int
|
||||||
|
rangeSize int
|
||||||
|
}
|
||||||
|
|
||||||
|
// newPortAllocator creates a new port allocator for the given port range.
|
||||||
|
// Returns an error if the port range is invalid.
|
||||||
|
func newPortAllocator(minPort, maxPort int) (*portAllocator, error) {
|
||||||
|
if minPort <= 0 || maxPort <= 0 {
|
||||||
|
return nil, fmt.Errorf("invalid port range: min=%d, max=%d (must be > 0)", minPort, maxPort)
|
||||||
|
}
|
||||||
|
if minPort > maxPort {
|
||||||
|
return nil, fmt.Errorf("invalid port range: min=%d > max=%d", minPort, maxPort)
|
||||||
|
}
|
||||||
|
|
||||||
|
rangeSize := maxPort - minPort + 1
|
||||||
|
bitmapSize := (rangeSize + 63) / 64 // Round up to nearest uint64
|
||||||
|
|
||||||
|
return &portAllocator{
|
||||||
|
bitmap: make([]uint64, bitmapSize),
|
||||||
|
allocated: make(map[int]string),
|
||||||
|
minPort: minPort,
|
||||||
|
maxPort: maxPort,
|
||||||
|
rangeSize: rangeSize,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate finds and allocates the first available port for the given instance.
|
||||||
|
// Returns the allocated port or an error if no ports are available.
|
||||||
|
func (p *portAllocator) allocate(instanceName string) (int, error) {
|
||||||
|
if instanceName == "" {
|
||||||
|
return 0, fmt.Errorf("instance name cannot be empty")
|
||||||
|
}
|
||||||
|
|
||||||
|
p.mu.Lock()
|
||||||
|
defer p.mu.Unlock()
|
||||||
|
|
||||||
|
port, err := p.findFirstFreeBit()
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
p.setBit(port)
|
||||||
|
p.allocated[port] = instanceName
|
||||||
|
|
||||||
|
return port, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocateSpecific allocates a specific port for the given instance.
// Returns an error if the port is already allocated or out of range.
func (p *portAllocator) allocateSpecific(port int, instanceName string) error {
    if instanceName == "" {
        return fmt.Errorf("instance name cannot be empty")
    }
    if port < p.minPort || port > p.maxPort {
        return fmt.Errorf("port %d is out of range [%d-%d]", port, p.minPort, p.maxPort)
    }

    p.mu.Lock()
    defer p.mu.Unlock()

    if p.isBitSet(port) {
        return fmt.Errorf("port %d is already allocated", port)
    }

    p.setBit(port)
    p.allocated[port] = instanceName

    return nil
}

// release releases a specific port, making it available for reuse.
// Returns an error if the port is not allocated.
func (p *portAllocator) release(port int) error {
    if port < p.minPort || port > p.maxPort {
        return fmt.Errorf("port %d is out of range [%d-%d]", port, p.minPort, p.maxPort)
    }

    p.mu.Lock()
    defer p.mu.Unlock()

    if !p.isBitSet(port) {
        return fmt.Errorf("port %d is not allocated", port)
    }

    p.clearBit(port)
    delete(p.allocated, port)

    return nil
}

// releaseByInstance releases all ports allocated to the given instance.
// This is useful for cleanup when deleting or updating an instance.
// Returns the number of ports released.
func (p *portAllocator) releaseByInstance(instanceName string) int {
    if instanceName == "" {
        return 0
    }

    p.mu.Lock()
    defer p.mu.Unlock()

    portsToRelease := make([]int, 0)
    for port, name := range p.allocated {
        if name == instanceName {
            portsToRelease = append(portsToRelease, port)
        }
    }

    for _, port := range portsToRelease {
        p.clearBit(port)
        delete(p.allocated, port)
    }

    return len(portsToRelease)
}

// --- Internal bitmap operations ---

// portToBitPos converts a port number to bitmap array index and bit position.
func (p *portAllocator) portToBitPos(port int) (index int, bit uint) {
    offset := port - p.minPort
    index = offset / 64
    bit = uint(offset % 64)
    return
}

// setBit marks a port as allocated in the bitmap.
func (p *portAllocator) setBit(port int) {
    index, bit := p.portToBitPos(port)
    p.bitmap[index] |= (1 << bit)
}

// clearBit marks a port as free in the bitmap.
func (p *portAllocator) clearBit(port int) {
    index, bit := p.portToBitPos(port)
    p.bitmap[index] &^= (1 << bit)
}

// isBitSet checks if a port is allocated in the bitmap.
func (p *portAllocator) isBitSet(port int) bool {
    index, bit := p.portToBitPos(port)
    return (p.bitmap[index] & (1 << bit)) != 0
}

// findFirstFreeBit scans the bitmap to find the first unallocated port.
// Returns the port number or an error if no ports are available.
func (p *portAllocator) findFirstFreeBit() (int, error) {
    for i, word := range p.bitmap {
        if word != ^uint64(0) { // Not all bits are set (some ports are free)
            // Find the first 0 bit in this word
            // XOR with all 1s to flip bits, then find first 1 (which was 0)
            bit := bits.TrailingZeros64(^word)
            port := p.minPort + (i * 64) + bit

            // Ensure we don't go beyond maxPort due to bitmap rounding
            if port <= p.maxPort {
                return port, nil
            }
        }
    }

    return 0, fmt.Errorf("no available ports in range [%d-%d]", p.minPort, p.maxPort)
}
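A minimal standalone sketch (not part of the PR) of the word/bit arithmetic that portToBitPos and findFirstFreeBit rely on. The minPort value of 8000 and the sample bitmap word are made up for illustration; only fmt and math/bits from the standard library are used.

package main

import (
    "fmt"
    "math/bits"
)

func main() {
    const minPort = 8000

    // One 64-bit word covers 64 consecutive ports; port 8130 lands in word 2, bit 2.
    offset := 8130 - minPort
    fmt.Println(offset/64, offset%64) // 2 2

    // Finding the first free port in a word: complement the word, then take the
    // position of the lowest set bit (a 0 in the original word means "free").
    var word uint64 = 0b1011 // ports minPort, minPort+1, and minPort+3 allocated
    free := bits.TrailingZeros64(^word)
    fmt.Println(minPort + free) // 8002
}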
pkg/manager/registry.go (new file, 121 lines)
@@ -0,0 +1,121 @@
package manager

import (
    "fmt"
    "llamactl/pkg/instance"
    "sync"
)

// instanceRegistry provides thread-safe storage and lookup of instances
// with running state tracking using lock-free sync.Map for status checks.
type instanceRegistry struct {
    mu        sync.RWMutex
    instances map[string]*instance.Instance
    running   sync.Map // map[string]struct{} - lock-free for status checks
}

// newInstanceRegistry creates a new instance registry.
func newInstanceRegistry() *instanceRegistry {
    return &instanceRegistry{
        instances: make(map[string]*instance.Instance),
    }
}

// Get retrieves an instance by name.
// Returns the instance and true if found, nil and false otherwise.
func (r *instanceRegistry) get(name string) (*instance.Instance, bool) {
    r.mu.RLock()
    defer r.mu.RUnlock()

    inst, exists := r.instances[name]
    return inst, exists
}

// List returns a snapshot copy of all instances to prevent external mutation.
func (r *instanceRegistry) list() []*instance.Instance {
    r.mu.RLock()
    defer r.mu.RUnlock()

    result := make([]*instance.Instance, 0, len(r.instances))
    for _, inst := range r.instances {
        result = append(result, inst)
    }
    return result
}

// ListRunning returns a snapshot of all currently running instances.
func (r *instanceRegistry) listRunning() []*instance.Instance {
    r.mu.RLock()
    defer r.mu.RUnlock()

    result := make([]*instance.Instance, 0)
    for name, inst := range r.instances {
        if _, isRunning := r.running.Load(name); isRunning {
            result = append(result, inst)
        }
    }
    return result
}

// Add adds a new instance to the registry.
// Returns an error if an instance with the same name already exists.
func (r *instanceRegistry) add(inst *instance.Instance) error {
    if inst == nil {
        return fmt.Errorf("cannot add nil instance")
    }

    r.mu.Lock()
    defer r.mu.Unlock()

    if _, exists := r.instances[inst.Name]; exists {
        return fmt.Errorf("instance %s already exists", inst.Name)
    }

    r.instances[inst.Name] = inst

    // Initialize running state if the instance is running
    if inst.IsRunning() {
        r.running.Store(inst.Name, struct{}{})
    }

    return nil
}

// Remove removes an instance from the registry.
// Returns an error if the instance doesn't exist.
func (r *instanceRegistry) remove(name string) error {
    r.mu.Lock()
    defer r.mu.Unlock()

    if _, exists := r.instances[name]; !exists {
        return fmt.Errorf("instance %s not found", name)
    }

    delete(r.instances, name)
    r.running.Delete(name)

    return nil
}

// MarkRunning marks an instance as running using lock-free sync.Map.
func (r *instanceRegistry) markRunning(name string) {
    r.running.Store(name, struct{}{})
}

// MarkStopped marks an instance as stopped using lock-free sync.Map.
func (r *instanceRegistry) markStopped(name string) {
    r.running.Delete(name)
}

// IsRunning checks if an instance is running using lock-free sync.Map.
func (r *instanceRegistry) isRunning(name string) bool {
    _, isRunning := r.running.Load(name)
    return isRunning
}

// Count returns the total number of instances in the registry.
func (r *instanceRegistry) count() int {
    r.mu.RLock()
    defer r.mu.RUnlock()
    return len(r.instances)
}
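An illustrative sketch (not part of the PR) of the design choice behind instanceRegistry: full records live behind an RWMutex, while the hot-path "is it running?" flag lives in a lock-free sync.Map, so status checks on the request path never contend with Add/Remove. The type and field names below are made up.

package main

import (
    "fmt"
    "sync"
)

type registry struct {
    mu      sync.RWMutex
    items   map[string]int // full records: guarded by mu
    running sync.Map       // hot-path flag: lock-free reads
}

func main() {
    r := &registry{items: make(map[string]int)}

    // Registration takes the write lock once.
    r.mu.Lock()
    r.items["llama-7b"] = 1
    r.mu.Unlock()
    r.running.Store("llama-7b", struct{}{})

    // Hot path: no mutex involved.
    _, ok := r.running.Load("llama-7b")
    fmt.Println(ok) // true
}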
pkg/manager/remote.go (new file, 293 lines)
@@ -0,0 +1,293 @@
package manager

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "io"
    "llamactl/pkg/config"
    "llamactl/pkg/instance"
    "net/http"
    "net/url"
    "sync"
    "time"
)

const apiBasePath = "/api/v1/instances/"

// remoteManager handles HTTP operations for remote instances.
type remoteManager struct {
    mu             sync.RWMutex
    client         *http.Client
    nodeMap        map[string]*config.NodeConfig // node name -> node config
    instanceToNode map[string]*config.NodeConfig // instance name -> node config
}

// newRemoteManager creates a new remote manager.
func newRemoteManager(nodes map[string]config.NodeConfig, timeout time.Duration) *remoteManager {
    if timeout <= 0 {
        timeout = 30 * time.Second
    }

    // Build node config map
    nodeMap := make(map[string]*config.NodeConfig)
    for name := range nodes {
        nodeCopy := nodes[name]
        nodeMap[name] = &nodeCopy
    }

    return &remoteManager{
        client: &http.Client{
            Timeout: timeout,
        },
        nodeMap:        nodeMap,
        instanceToNode: make(map[string]*config.NodeConfig),
    }
}

// GetNodeForInstance returns the node configuration for a given instance.
// Returns nil if the instance is not mapped to any node.
func (rm *remoteManager) getNodeForInstance(instanceName string) (*config.NodeConfig, bool) {
    rm.mu.RLock()
    defer rm.mu.RUnlock()

    node, exists := rm.instanceToNode[instanceName]
    return node, exists
}

// SetInstanceNode maps an instance to a specific node.
// Returns an error if the node doesn't exist.
func (rm *remoteManager) setInstanceNode(instanceName, nodeName string) error {
    rm.mu.Lock()
    defer rm.mu.Unlock()

    node, exists := rm.nodeMap[nodeName]
    if !exists {
        return fmt.Errorf("node %s not found", nodeName)
    }

    rm.instanceToNode[instanceName] = node
    return nil
}

// RemoveInstance removes the instance-to-node mapping.
func (rm *remoteManager) removeInstance(instanceName string) {
    rm.mu.Lock()
    defer rm.mu.Unlock()

    delete(rm.instanceToNode, instanceName)
}

// --- HTTP request helpers ---

// makeRemoteRequest creates and executes an HTTP request to a remote node with context support.
func (rm *remoteManager) makeRemoteRequest(ctx context.Context, nodeConfig *config.NodeConfig, method, path string, body any) (*http.Response, error) {
    var reqBody io.Reader
    if body != nil {
        jsonData, err := json.Marshal(body)
        if err != nil {
            return nil, fmt.Errorf("failed to marshal request body: %w", err)
        }
        reqBody = bytes.NewBuffer(jsonData)
    }

    url := fmt.Sprintf("%s%s", nodeConfig.Address, path)
    req, err := http.NewRequestWithContext(ctx, method, url, reqBody)
    if err != nil {
        return nil, fmt.Errorf("failed to create request: %w", err)
    }

    if body != nil {
        req.Header.Set("Content-Type", "application/json")
    }

    if nodeConfig.APIKey != "" {
        req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", nodeConfig.APIKey))
    }

    resp, err := rm.client.Do(req)
    if err != nil {
        return nil, fmt.Errorf("failed to execute request: %w", err)
    }

    return resp, nil
}

// parseRemoteResponse parses an HTTP response and unmarshals the result.
func parseRemoteResponse(resp *http.Response, result any) error {
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return fmt.Errorf("failed to read response body: %w", err)
    }

    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
    }

    if result != nil {
        if err := json.Unmarshal(body, result); err != nil {
            return fmt.Errorf("failed to unmarshal response: %w", err)
        }
    }

    return nil
}

// --- Remote CRUD operations ---

// createInstance creates a new instance on a remote node.
func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/", apiBasePath, escapedName)

    resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, opts)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// getInstance retrieves an instance by name from a remote node.
func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/", apiBasePath, escapedName)

    resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// updateInstance updates an existing instance on a remote node.
func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/", apiBasePath, escapedName)

    resp, err := rm.makeRemoteRequest(ctx, node, "PUT", path, opts)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// deleteInstance deletes an instance from a remote node.
func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeConfig, name string) error {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/", apiBasePath, escapedName)

    resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil)
    if err != nil {
        return err
    }

    return parseRemoteResponse(resp, nil)
}

// startInstance starts an instance on a remote node.
func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/start", apiBasePath, escapedName)

    resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// stopInstance stops an instance on a remote node.
func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/stop", apiBasePath, escapedName)

    resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// restartInstance restarts an instance on a remote node.
func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/restart", apiBasePath, escapedName)

    resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// getInstanceLogs retrieves logs for an instance from a remote node.
func (rm *remoteManager) getInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) {
    escapedName := url.PathEscape(name)
    path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, escapedName, numLines)

    resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil)
    if err != nil {
        return "", err
    }

    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", fmt.Errorf("failed to read response body: %w", err)
    }

    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
    }

    // Logs endpoint returns plain text (Content-Type: text/plain)
    return string(body), nil
}
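An illustrative sketch (not part of the PR) of the pattern remoteManager builds on: http.NewRequestWithContext lets a caller bound or cancel an individual remote call independently of the client-wide Timeout. The node address and bearer token below are made up.

package main

import (
    "context"
    "fmt"
    "net/http"
    "time"
)

func main() {
    client := &http.Client{Timeout: 30 * time.Second}

    // Per-call deadline, shorter than the client-wide timeout.
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()

    req, err := http.NewRequestWithContext(ctx, "GET", "http://worker-1:8080/api/v1/instances/", nil)
    if err != nil {
        panic(err)
    }
    req.Header.Set("Authorization", "Bearer example-key")

    resp, err := client.Do(req) // returns a context deadline error if worker-1 does not answer in 5s
    if err != nil {
        fmt.Println("request error:", err)
        return
    }
    defer resp.Body.Close()
    fmt.Println(resp.Status)
}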
@@ -1,222 +0,0 @@
package manager

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io"
    "llamactl/pkg/config"
    "llamactl/pkg/instance"
    "net/http"
)

// makeRemoteRequest is a helper function to make HTTP requests to a remote node
func (im *instanceManager) makeRemoteRequest(nodeConfig *config.NodeConfig, method, path string, body any) (*http.Response, error) {
    var reqBody io.Reader
    if body != nil {
        jsonData, err := json.Marshal(body)
        if err != nil {
            return nil, fmt.Errorf("failed to marshal request body: %w", err)
        }
        reqBody = bytes.NewBuffer(jsonData)
    }

    url := fmt.Sprintf("%s%s", nodeConfig.Address, path)
    req, err := http.NewRequest(method, url, reqBody)
    if err != nil {
        return nil, fmt.Errorf("failed to create request: %w", err)
    }

    if body != nil {
        req.Header.Set("Content-Type", "application/json")
    }

    if nodeConfig.APIKey != "" {
        req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", nodeConfig.APIKey))
    }

    resp, err := im.httpClient.Do(req)
    if err != nil {
        return nil, fmt.Errorf("failed to execute request: %w", err)
    }

    return resp, nil
}

// parseRemoteResponse is a helper function to parse API responses
func parseRemoteResponse(resp *http.Response, result any) error {
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return fmt.Errorf("failed to read response body: %w", err)
    }

    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
    }

    if result != nil {
        if err := json.Unmarshal(body, result); err != nil {
            return fmt.Errorf("failed to unmarshal response: %w", err)
        }
    }

    return nil
}

// ListRemoteInstances lists all instances on the remote node
func (im *instanceManager) ListRemoteInstances(nodeConfig *config.NodeConfig) ([]*instance.Instance, error) {
    resp, err := im.makeRemoteRequest(nodeConfig, "GET", "/api/v1/instances/", nil)
    if err != nil {
        return nil, err
    }

    var instances []*instance.Instance
    if err := parseRemoteResponse(resp, &instances); err != nil {
        return nil, err
    }

    return instances, nil
}

// CreateRemoteInstance creates a new instance on the remote node
func (im *instanceManager) CreateRemoteInstance(nodeConfig *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error) {
    path := fmt.Sprintf("/api/v1/instances/%s/", name)

    resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, options)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// GetRemoteInstance retrieves an instance by name from the remote node
func (im *instanceManager) GetRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) {
    path := fmt.Sprintf("/api/v1/instances/%s/", name)
    resp, err := im.makeRemoteRequest(nodeConfig, "GET", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// UpdateRemoteInstance updates an existing instance on the remote node
func (im *instanceManager) UpdateRemoteInstance(nodeConfig *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error) {
    path := fmt.Sprintf("/api/v1/instances/%s/", name)

    resp, err := im.makeRemoteRequest(nodeConfig, "PUT", path, options)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// DeleteRemoteInstance deletes an instance from the remote node
func (im *instanceManager) DeleteRemoteInstance(nodeConfig *config.NodeConfig, name string) error {
    path := fmt.Sprintf("/api/v1/instances/%s/", name)
    resp, err := im.makeRemoteRequest(nodeConfig, "DELETE", path, nil)
    if err != nil {
        return err
    }

    return parseRemoteResponse(resp, nil)
}

// StartRemoteInstance starts an instance on the remote node
func (im *instanceManager) StartRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) {
    path := fmt.Sprintf("/api/v1/instances/%s/start", name)
    resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// StopRemoteInstance stops an instance on the remote node
func (im *instanceManager) StopRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) {
    path := fmt.Sprintf("/api/v1/instances/%s/stop", name)
    resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// RestartRemoteInstance restarts an instance on the remote node
func (im *instanceManager) RestartRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) {
    path := fmt.Sprintf("/api/v1/instances/%s/restart", name)
    resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, nil)
    if err != nil {
        return nil, err
    }

    var inst instance.Instance
    if err := parseRemoteResponse(resp, &inst); err != nil {
        return nil, err
    }

    return &inst, nil
}

// GetRemoteInstanceLogs retrieves logs for an instance from the remote node
func (im *instanceManager) GetRemoteInstanceLogs(nodeConfig *config.NodeConfig, name string, numLines int) (string, error) {
    path := fmt.Sprintf("/api/v1/instances/%s/logs?lines=%d", name, numLines)
    resp, err := im.makeRemoteRequest(nodeConfig, "GET", path, nil)
    if err != nil {
        return "", err
    }

    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", fmt.Errorf("failed to read response body: %w", err)
    }

    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
    }

    // Logs endpoint might return plain text or JSON
    // Try to parse as JSON first (in case it's wrapped in a response object)
    var logResponse struct {
        Logs string `json:"logs"`
    }
    if err := json.Unmarshal(body, &logResponse); err == nil && logResponse.Logs != "" {
        return logResponse.Logs, nil
    }

    // Otherwise, return as plain text
    return string(body), nil
}
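An illustrative sketch (not part of the PR) of one behavioral difference between the removed helpers above and the new remoteManager: instance names are now passed through url.PathEscape before being placed in the request path, so names containing spaces or slashes no longer distort the URL. The sample name is made up.

package main

import (
    "fmt"
    "net/url"
)

func main() {
    name := "model v2/beta"
    fmt.Println("/api/v1/instances/" + name + "/")                 // old: /api/v1/instances/model v2/beta/
    fmt.Println("/api/v1/instances/" + url.PathEscape(name) + "/") // new: /api/v1/instances/model%20v2%2Fbeta/
}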
@@ -1,74 +0,0 @@
package manager

import (
    "fmt"
    "llamactl/pkg/instance"
    "log"
)

func (im *instanceManager) checkAllTimeouts() {
    im.mu.RLock()
    var timeoutInstances []string

    // Identify instances that should timeout
    for _, inst := range im.instances {
        // Skip remote instances - they are managed by their respective nodes
        if inst.IsRemote() {
            continue
        }

        if inst.ShouldTimeout() {
            timeoutInstances = append(timeoutInstances, inst.Name)
        }
    }
    im.mu.RUnlock() // Release read lock before calling StopInstance

    // Stop the timed-out instances
    for _, name := range timeoutInstances {
        log.Printf("Instance %s has timed out, stopping it", name)
        if _, err := im.StopInstance(name); err != nil {
            log.Printf("Error stopping instance %s: %v", name, err)
        } else {
            log.Printf("Instance %s stopped successfully", name)
        }
    }
}

// EvictLRUInstance finds and stops the least recently used running instance.
func (im *instanceManager) EvictLRUInstance() error {
    im.mu.RLock()
    var lruInstance *instance.Instance

    for name := range im.runningInstances {
        inst := im.instances[name]
        if inst == nil {
            continue
        }

        // Skip remote instances - they are managed by their respective nodes
        if inst.IsRemote() {
            continue
        }

        if inst.GetOptions() != nil && inst.GetOptions().IdleTimeout != nil && *inst.GetOptions().IdleTimeout <= 0 {
            continue // Skip instances without idle timeout
        }

        if lruInstance == nil {
            lruInstance = inst
        }

        if inst.LastRequestTime() < lruInstance.LastRequestTime() {
            lruInstance = inst
        }
    }
    im.mu.RUnlock()

    if lruInstance == nil {
        return fmt.Errorf("failed to find lru instance")
    }

    // Evict Instance
    _, err := im.StopInstance(lruInstance.Name)
    return err
}
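An illustrative sketch (not part of the PR) of the least-recently-used selection used in EvictLRUInstance above, reduced to plain data: scan the candidates and keep whichever has the smallest last-request timestamp. Timestamps here are Unix seconds; the struct and field names are made up.

package main

import "fmt"

type entry struct {
    name        string
    lastRequest int64
}

func pickLRU(entries []entry) *entry {
    var lru *entry
    for i := range entries {
        if lru == nil || entries[i].lastRequest < lru.lastRequest {
            lru = &entries[i]
        }
    }
    return lru
}

func main() {
    e := []entry{{"a", 300}, {"b", 100}, {"c", 200}}
    fmt.Println(pickLRU(e).name) // b (oldest request, evicted first)
}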
@@ -1,343 +0,0 @@
package manager_test

import (
    "llamactl/pkg/backends"
    "llamactl/pkg/config"
    "llamactl/pkg/instance"
    "llamactl/pkg/manager"
    "sync"
    "testing"
    "time"
)

func TestTimeoutFunctionality(t *testing.T) {
    // Test timeout checker initialization
    backendConfig := config.BackendConfig{
        LlamaCpp: config.BackendSettings{Command: "llama-server"},
        MLX:      config.BackendSettings{Command: "mlx_lm.server"},
    }
    cfg := config.InstancesConfig{
        PortRange:            [2]int{8000, 9000},
        TimeoutCheckInterval: 10,
        MaxInstances:         5,
    }

    manager := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main")
    if manager == nil {
        t.Fatal("Manager should be initialized with timeout checker")
    }
    manager.Shutdown() // Clean up

    // Test timeout configuration and logic without starting the actual process
    testManager := createTestManager()
    defer testManager.Shutdown()

    idleTimeout := 1 // 1 minute
    options := &instance.Options{
        IdleTimeout: &idleTimeout,
        BackendOptions: backends.Options{
            BackendType: backends.BackendTypeLlamaCpp,
            LlamaServerOptions: &backends.LlamaServerOptions{
                Model: "/path/to/model.gguf",
            },
        },
    }

    inst, err := testManager.CreateInstance("timeout-test", options)
    if err != nil {
        t.Fatalf("CreateInstance failed: %v", err)
    }

    // Test timeout configuration is properly set
    if inst.GetOptions().IdleTimeout == nil {
        t.Fatal("Instance should have idle timeout configured")
    }
    if *inst.GetOptions().IdleTimeout != 1 {
        t.Errorf("Expected idle timeout 1 minute, got %d", *inst.GetOptions().IdleTimeout)
    }

    // Test timeout logic without actually starting the process
    // Create a mock time provider to simulate timeout
    mockTime := NewMockTimeProvider(time.Now())
    inst.SetTimeProvider(mockTime)

    // Set instance to running state so timeout logic can work
    inst.SetStatus(instance.Running)

    // Simulate instance being "running" for timeout check (without actual process)
    // We'll test the ShouldTimeout logic directly
    inst.UpdateLastRequestTime()

    // Initially should not timeout (just updated)
    if inst.ShouldTimeout() {
        t.Error("Instance should not timeout immediately after request")
    }

    // Advance time to trigger timeout
    mockTime.SetTime(time.Now().Add(2 * time.Minute))

    // Now it should timeout
    if !inst.ShouldTimeout() {
        t.Error("Instance should timeout after idle period")
    }

    // Reset running state to avoid shutdown issues
    inst.SetStatus(instance.Stopped)

    // Test that instance without timeout doesn't timeout
    noTimeoutOptions := &instance.Options{
        BackendOptions: backends.Options{
            BackendType: backends.BackendTypeLlamaCpp,
            LlamaServerOptions: &backends.LlamaServerOptions{
                Model: "/path/to/model.gguf",
            },
        },
        // No IdleTimeout set
    }

    noTimeoutInst, err := testManager.CreateInstance("no-timeout-test", noTimeoutOptions)
    if err != nil {
        t.Fatalf("CreateInstance failed: %v", err)
    }

    noTimeoutInst.SetTimeProvider(mockTime)
    noTimeoutInst.SetStatus(instance.Running) // Set to running for timeout check
    noTimeoutInst.UpdateLastRequestTime()

    // Even with time advanced, should not timeout
    if noTimeoutInst.ShouldTimeout() {
        t.Error("Instance without timeout configuration should never timeout")
    }

    // Reset running state to avoid shutdown issues
    noTimeoutInst.SetStatus(instance.Stopped)
}

func TestEvictLRUInstance_Success(t *testing.T) {
    manager := createTestManager()
    // Don't defer manager.Shutdown() - we'll handle cleanup manually

    // Create 3 instances with idle timeout enabled (value doesn't matter for LRU logic)
    options1 := &instance.Options{
        IdleTimeout: func() *int { timeout := 1; return &timeout }(), // Any value > 0
        BackendOptions: backends.Options{
            BackendType: backends.BackendTypeLlamaCpp,
            LlamaServerOptions: &backends.LlamaServerOptions{
                Model: "/path/to/model1.gguf",
            },
        },
    }
    options2 := &instance.Options{
        IdleTimeout: func() *int { timeout := 1; return &timeout }(), // Any value > 0
        BackendOptions: backends.Options{
            BackendType: backends.BackendTypeLlamaCpp,
            LlamaServerOptions: &backends.LlamaServerOptions{
                Model: "/path/to/model2.gguf",
            },
        },
    }
    options3 := &instance.Options{
        IdleTimeout: func() *int { timeout := 1; return &timeout }(), // Any value > 0
        BackendOptions: backends.Options{
            BackendType: backends.BackendTypeLlamaCpp,
            LlamaServerOptions: &backends.LlamaServerOptions{
                Model: "/path/to/model3.gguf",
            },
        },
    }

    inst1, err := manager.CreateInstance("instance-1", options1)
    if err != nil {
        t.Fatalf("CreateInstance failed: %v", err)
    }
    inst2, err := manager.CreateInstance("instance-2", options2)
    if err != nil {
        t.Fatalf("CreateInstance failed: %v", err)
    }
    inst3, err := manager.CreateInstance("instance-3", options3)
    if err != nil {
        t.Fatalf("CreateInstance failed: %v", err)
    }

    // Set up mock time and set instances to running
    mockTime := NewMockTimeProvider(time.Now())
    inst1.SetTimeProvider(mockTime)
    inst2.SetTimeProvider(mockTime)
    inst3.SetTimeProvider(mockTime)

    inst1.SetStatus(instance.Running)
    inst2.SetStatus(instance.Running)
    inst3.SetStatus(instance.Running)

    // Set different last request times (oldest to newest)
    // inst1: oldest (will be evicted)
    inst1.UpdateLastRequestTime()

    mockTime.SetTime(mockTime.Now().Add(1 * time.Minute))
    inst2.UpdateLastRequestTime()

    mockTime.SetTime(mockTime.Now().Add(1 * time.Minute))
    inst3.UpdateLastRequestTime()

    // Evict LRU instance (should be inst1)
    err = manager.EvictLRUInstance()
    if err != nil {
        t.Fatalf("EvictLRUInstance failed: %v", err)
    }

    // Verify inst1 is stopped
    if inst1.IsRunning() {
        t.Error("Expected instance-1 to be stopped after eviction")
    }

    // Verify inst2 and inst3 are still running
    if !inst2.IsRunning() {
        t.Error("Expected instance-2 to still be running")
    }
    if !inst3.IsRunning() {
        t.Error("Expected instance-3 to still be running")
    }

    // Clean up manually - set all to stopped and then shutdown
    inst2.SetStatus(instance.Stopped)
    inst3.SetStatus(instance.Stopped)
}

func TestEvictLRUInstance_NoEligibleInstances(t *testing.T) {
    // Helper function to create instances with different timeout configurations
    createInstanceWithTimeout := func(manager manager.InstanceManager, name, model string, timeout *int) *instance.Instance {
        options := &instance.Options{
            IdleTimeout: timeout,
            BackendOptions: backends.Options{
                BackendType: backends.BackendTypeLlamaCpp,
                LlamaServerOptions: &backends.LlamaServerOptions{
                    Model: model,
                },
            },
        }
        inst, err := manager.CreateInstance(name, options)
        if err != nil {
            t.Fatalf("CreateInstance failed: %v", err)
        }
        return inst
    }

    t.Run("no running instances", func(t *testing.T) {
        manager := createTestManager()
        defer manager.Shutdown()

        err := manager.EvictLRUInstance()
        if err == nil {
            t.Error("Expected error when no running instances exist")
        }
        if err.Error() != "failed to find lru instance" {
            t.Errorf("Expected 'failed to find lru instance' error, got: %v", err)
        }
    })

    t.Run("only instances without timeout", func(t *testing.T) {
        manager := createTestManager()
        defer manager.Shutdown()

        // Create instances with various non-eligible timeout configurations
        zeroTimeout := 0
        negativeTimeout := -1
        inst1 := createInstanceWithTimeout(manager, "no-timeout-1", "/path/to/model1.gguf", &zeroTimeout)
        inst2 := createInstanceWithTimeout(manager, "no-timeout-2", "/path/to/model2.gguf", &negativeTimeout)
        inst3 := createInstanceWithTimeout(manager, "no-timeout-3", "/path/to/model3.gguf", nil)

        // Set instances to running
        instances := []*instance.Instance{inst1, inst2, inst3}
        for _, inst := range instances {
            inst.SetStatus(instance.Running)
        }
        defer func() {
            // Reset instances to stopped to avoid shutdown panics
            for _, inst := range instances {
                inst.SetStatus(instance.Stopped)
            }
        }()

        // Try to evict - should fail because no eligible instances
        err := manager.EvictLRUInstance()
        if err == nil {
            t.Error("Expected error when no eligible instances exist")
        }
        if err.Error() != "failed to find lru instance" {
            t.Errorf("Expected 'failed to find lru instance' error, got: %v", err)
        }

        // Verify all instances are still running
        for i, inst := range instances {
            if !inst.IsRunning() {
                t.Errorf("Expected instance %d to still be running", i+1)
            }
        }
    })

    t.Run("mixed instances - evicts only eligible ones", func(t *testing.T) {
        manager := createTestManager()
        defer manager.Shutdown()

        // Create mix of instances: some with timeout enabled, some disabled
        validTimeout := 1
        zeroTimeout := 0
        instWithTimeout := createInstanceWithTimeout(manager, "with-timeout", "/path/to/model-with-timeout.gguf", &validTimeout)
        instNoTimeout1 := createInstanceWithTimeout(manager, "no-timeout-1", "/path/to/model-no-timeout1.gguf", &zeroTimeout)
        instNoTimeout2 := createInstanceWithTimeout(manager, "no-timeout-2", "/path/to/model-no-timeout2.gguf", nil)

        // Set all instances to running
        instances := []*instance.Instance{instWithTimeout, instNoTimeout1, instNoTimeout2}
        for _, inst := range instances {
            inst.SetStatus(instance.Running)
            inst.UpdateLastRequestTime()
        }
        defer func() {
            // Reset instances to stopped to avoid shutdown panics
            for _, inst := range instances {
                if inst.IsRunning() {
                    inst.SetStatus(instance.Stopped)
                }
            }
        }()

        // Evict LRU instance - should only consider the one with timeout
        err := manager.EvictLRUInstance()
        if err != nil {
            t.Fatalf("EvictLRUInstance failed: %v", err)
        }

        // Verify only the instance with timeout was evicted
        if instWithTimeout.IsRunning() {
            t.Error("Expected with-timeout instance to be stopped after eviction")
        }
        if !instNoTimeout1.IsRunning() {
            t.Error("Expected no-timeout-1 instance to still be running")
        }
        if !instNoTimeout2.IsRunning() {
            t.Error("Expected no-timeout-2 instance to still be running")
        }
    })
}

// Helper for timeout tests
type MockTimeProvider struct {
    currentTime time.Time
    mu          sync.RWMutex
}

func NewMockTimeProvider(t time.Time) *MockTimeProvider {
    return &MockTimeProvider{currentTime: t}
}

func (m *MockTimeProvider) Now() time.Time {
    m.mu.RLock()
    defer m.mu.RUnlock()
    return m.currentTime
}

func (m *MockTimeProvider) SetTime(t time.Time) {
    m.mu.Lock()
    defer m.mu.Unlock()
    m.currentTime = t
}
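An illustrative sketch (not part of the PR) of the clock-injection seam the removed tests above rely on: code under test asks a TimeProvider for the current time, and the test substitutes a fixed clock instead of time.Now. The interface and function names below are stand-ins for llamactl's own types, not its real API.

package main

import (
    "fmt"
    "time"
)

type TimeProvider interface {
    Now() time.Time
}

type fixedClock struct{ t time.Time }

func (f fixedClock) Now() time.Time { return f.t }

// expired reports whether the idle window has elapsed according to the injected clock.
func expired(clock TimeProvider, lastRequest time.Time, idle time.Duration) bool {
    return clock.Now().Sub(lastRequest) > idle
}

func main() {
    last := time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC)
    clock := fixedClock{t: last.Add(2 * time.Minute)} // "advance" time without sleeping
    fmt.Println(expired(clock, last, time.Minute))    // true
}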
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"llamactl/pkg/backends"
 	"llamactl/pkg/instance"
+	"llamactl/pkg/validation"
 	"net/http"
 	"os/exec"
 	"strings"
@@ -22,13 +23,16 @@ func (h *Handler) LlamaCppProxy(onDemandStart bool) http.HandlerFunc {
 
 		// Get the instance name from the URL parameter
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		// Validate instance name at the entry point
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
 		// Route to the appropriate inst based on instance name
-		inst, err := h.InstanceManager.GetInstance(name)
+		inst, err := h.InstanceManager.GetInstance(validatedName)
 		if err != nil {
 			http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest)
 			return
@@ -66,7 +70,7 @@ func (h *Handler) LlamaCppProxy(onDemandStart bool) http.HandlerFunc {
 		}
 
 		// If on-demand start is enabled, start the instance
-		if _, err := h.InstanceManager.StartInstance(name); err != nil {
+		if _, err := h.InstanceManager.StartInstance(validatedName); err != nil {
 			http.Error(w, "Failed to start instance: "+err.Error(), http.StatusInternalServerError)
 			return
 		}
@@ -85,7 +89,7 @@ func (h *Handler) LlamaCppProxy(onDemandStart bool) http.HandlerFunc {
 		}
 
 		// Strip the "/llama-cpp/<name>" prefix from the request URL
-		prefix := fmt.Sprintf("/llama-cpp/%s", name)
+		prefix := fmt.Sprintf("/llama-cpp/%s", validatedName)
 		r.URL.Path = strings.TrimPrefix(r.URL.Path, prefix)
 
 		// Update the last request time for the instance
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"llamactl/pkg/instance"
 	"llamactl/pkg/manager"
+	"llamactl/pkg/validation"
 	"net/http"
 	"net/http/httputil"
 	"net/url"
@@ -55,8 +56,10 @@ func (h *Handler) ListInstances() http.HandlerFunc {
 func (h *Handler) CreateInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
@@ -66,7 +69,7 @@ func (h *Handler) CreateInstance() http.HandlerFunc {
 			return
 		}
 
-		inst, err := h.InstanceManager.CreateInstance(name, &options)
+		inst, err := h.InstanceManager.CreateInstance(validatedName, &options)
 		if err != nil {
 			http.Error(w, "Failed to create instance: "+err.Error(), http.StatusInternalServerError)
 			return
@@ -95,12 +98,14 @@ func (h *Handler) CreateInstance() http.HandlerFunc {
 func (h *Handler) GetInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
-		inst, err := h.InstanceManager.GetInstance(name)
+		inst, err := h.InstanceManager.GetInstance(validatedName)
 		if err != nil {
 			http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest)
 			return
@@ -130,8 +135,10 @@ func (h *Handler) GetInstance() http.HandlerFunc {
 func (h *Handler) UpdateInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
@@ -141,7 +148,7 @@ func (h *Handler) UpdateInstance() http.HandlerFunc {
 			return
 		}
 
-		inst, err := h.InstanceManager.UpdateInstance(name, &options)
+		inst, err := h.InstanceManager.UpdateInstance(validatedName, &options)
 		if err != nil {
 			http.Error(w, "Failed to update instance: "+err.Error(), http.StatusInternalServerError)
 			return
@@ -169,12 +176,14 @@ func (h *Handler) UpdateInstance() http.HandlerFunc {
 func (h *Handler) StartInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
-		inst, err := h.InstanceManager.StartInstance(name)
+		inst, err := h.InstanceManager.StartInstance(validatedName)
 		if err != nil {
 			// Check if error is due to maximum running instances limit
 			if _, ok := err.(manager.MaxRunningInstancesError); ok {
@@ -208,12 +217,14 @@ func (h *Handler) StartInstance() http.HandlerFunc {
 func (h *Handler) StopInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
-		inst, err := h.InstanceManager.StopInstance(name)
+		inst, err := h.InstanceManager.StopInstance(validatedName)
 		if err != nil {
 			http.Error(w, "Failed to stop instance: "+err.Error(), http.StatusInternalServerError)
 			return
@@ -241,12 +252,14 @@ func (h *Handler) StopInstance() http.HandlerFunc {
 func (h *Handler) RestartInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
-		inst, err := h.InstanceManager.RestartInstance(name)
+		inst, err := h.InstanceManager.RestartInstance(validatedName)
 		if err != nil {
 			http.Error(w, "Failed to restart instance: "+err.Error(), http.StatusInternalServerError)
 			return
@@ -273,12 +286,14 @@ func (h *Handler) RestartInstance() http.HandlerFunc {
 func (h *Handler) DeleteInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
-		if err := h.InstanceManager.DeleteInstance(name); err != nil {
+		if err := h.InstanceManager.DeleteInstance(validatedName); err != nil {
 			http.Error(w, "Failed to delete instance: "+err.Error(), http.StatusInternalServerError)
 			return
 		}
@@ -302,8 +317,10 @@ func (h *Handler) DeleteInstance() http.HandlerFunc {
 func (h *Handler) GetInstanceLogs() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
@@ -319,7 +336,7 @@ func (h *Handler) GetInstanceLogs() http.HandlerFunc {
 		}
 
 		// Use the instance manager which handles both local and remote instances
-		logs, err := h.InstanceManager.GetInstanceLogs(name, numLines)
+		logs, err := h.InstanceManager.GetInstanceLogs(validatedName, numLines)
 		if err != nil {
 			http.Error(w, "Failed to get logs: "+err.Error(), http.StatusInternalServerError)
 			return
@@ -345,12 +362,14 @@ func (h *Handler) GetInstanceLogs() http.HandlerFunc {
 func (h *Handler) ProxyToInstance() http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		name := chi.URLParam(r, "name")
-		if name == "" {
-			http.Error(w, "Instance name cannot be empty", http.StatusBadRequest)
+		validatedName, err := validation.ValidateInstanceName(name)
+		if err != nil {
+			http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
 			return
 		}
 
-		inst, err := h.InstanceManager.GetInstance(name)
+		inst, err := h.InstanceManager.GetInstance(validatedName)
 		if err != nil {
 			http.Error(w, "Failed to get instance: "+err.Error(), http.StatusInternalServerError)
 			return
@@ -358,7 +377,7 @@ func (h *Handler) ProxyToInstance() http.HandlerFunc {
 
 		// Check if this is a remote instance
 		if inst.IsRemote() {
-			h.RemoteInstanceProxy(w, r, name, inst)
+			h.RemoteInstanceProxy(w, r, validatedName, inst)
 			return
 		}
 
@@ -375,7 +394,7 @@ func (h *Handler) ProxyToInstance() http.HandlerFunc {
 		}
 
 		// Strip the "/api/v1/instances/<name>/proxy" prefix from the request URL
-		prefix := fmt.Sprintf("/api/v1/instances/%s/proxy", name)
+		prefix := fmt.Sprintf("/api/v1/instances/%s/proxy", validatedName)
 		r.URL.Path = strings.TrimPrefix(r.URL.Path, prefix)
 
 		// Update the last request time for the instance
@@ -6,6 +6,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"llamactl/pkg/instance"
|
"llamactl/pkg/instance"
|
||||||
|
"llamactl/pkg/validation"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httputil"
|
"net/http/httputil"
|
||||||
"net/url"
|
"net/url"
|
||||||
@@ -85,8 +86,15 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Validate instance name at the entry point
|
||||||
|
validatedName, err := validation.ValidateInstanceName(modelName)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// Route to the appropriate inst based on instance name
|
// Route to the appropriate inst based on instance name
|
||||||
inst, err := h.InstanceManager.GetInstance(modelName)
|
inst, err := h.InstanceManager.GetInstance(validatedName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest)
|
http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest)
|
||||||
return
|
return
|
||||||
@@ -96,7 +104,7 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc {
|
|||||||
if inst.IsRemote() {
|
if inst.IsRemote() {
|
||||||
// Restore the body for the remote proxy
|
// Restore the body for the remote proxy
|
||||||
r.Body = io.NopCloser(bytes.NewReader(bodyBytes))
|
r.Body = io.NopCloser(bytes.NewReader(bodyBytes))
|
||||||
h.RemoteOpenAIProxy(w, r, modelName, inst)
|
h.RemoteOpenAIProxy(w, r, validatedName, inst)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -122,7 +130,7 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// If on-demand start is enabled, start the instance
|
// If on-demand start is enabled, start the instance
|
||||||
if _, err := h.InstanceManager.StartInstance(modelName); err != nil {
|
if _, err := h.InstanceManager.StartInstance(validatedName); err != nil {
|
||||||
http.Error(w, "Failed to start instance: "+err.Error(), http.StatusInternalServerError)
|
http.Error(w, "Failed to start instance: "+err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
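In the OpenAI-compatible route the instance name arrives as the model name taken from the request body (hence `modelName` and the body restore above) rather than as a path parameter, which is why the same validation is now applied there too. A standalone sketch of that entry point; the request struct and hard-coded body are hypothetical stand-ins for the real handler's body parsing:

package main

import (
	"encoding/json"
	"fmt"
	"log"

	"llamactl/pkg/validation"
)

// openAIRequest captures only the field the proxy needs for routing; the
// real handler reads the full body and forwards it downstream.
type openAIRequest struct {
	Model string `json:"model"`
}

func main() {
	body := []byte(`{"model": "llama2-7b", "messages": []}`)

	var req openAIRequest
	if err := json.Unmarshal(body, &req); err != nil {
		log.Fatal(err)
	}

	// The model name doubles as the instance name, so it is validated before
	// it is used for lookups, proxying, or on-demand start.
	validatedName, err := validation.ValidateInstanceName(req.Model)
	if err != nil {
		log.Fatalf("rejecting request: %v", err)
	}
	fmt.Println("routing to instance:", validatedName)
}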
@@ -116,7 +116,7 @@ export const nodesApi = {
   list: () => apiCall<NodesMap>("/nodes"),
 
   // GET /nodes/{name}
-  get: (name: string) => apiCall<NodeResponse>(`/nodes/${name}`),
+  get: (name: string) => apiCall<NodeResponse>(`/nodes/${encodeURIComponent(name)}`),
 };
 
 // Instance API functions
@@ -125,52 +125,52 @@ export const instancesApi = {
   list: () => apiCall<Instance[]>("/instances"),
 
   // GET /instances/{name}
-  get: (name: string) => apiCall<Instance>(`/instances/${name}`),
+  get: (name: string) => apiCall<Instance>(`/instances/${encodeURIComponent(name)}`),
 
   // POST /instances/{name}
   create: (name: string, options: CreateInstanceOptions) =>
-    apiCall<Instance>(`/instances/${name}`, {
+    apiCall<Instance>(`/instances/${encodeURIComponent(name)}`, {
       method: "POST",
       body: JSON.stringify(options),
     }),
 
   // PUT /instances/{name}
   update: (name: string, options: CreateInstanceOptions) =>
-    apiCall<Instance>(`/instances/${name}`, {
+    apiCall<Instance>(`/instances/${encodeURIComponent(name)}`, {
       method: "PUT",
       body: JSON.stringify(options),
     }),
 
   // DELETE /instances/{name}
   delete: (name: string) =>
-    apiCall<void>(`/instances/${name}`, {
+    apiCall<void>(`/instances/${encodeURIComponent(name)}`, {
      method: "DELETE",
    }),
 
   // POST /instances/{name}/start
   start: (name: string) =>
-    apiCall<Instance>(`/instances/${name}/start`, {
+    apiCall<Instance>(`/instances/${encodeURIComponent(name)}/start`, {
      method: "POST",
    }),
 
   // POST /instances/{name}/stop
   stop: (name: string) =>
-    apiCall<Instance>(`/instances/${name}/stop`, {
+    apiCall<Instance>(`/instances/${encodeURIComponent(name)}/stop`, {
      method: "POST",
    }),
 
   // POST /instances/{name}/restart
   restart: (name: string) =>
-    apiCall<Instance>(`/instances/${name}/restart`, {
+    apiCall<Instance>(`/instances/${encodeURIComponent(name)}/restart`, {
      method: "POST",
    }),
 
   // GET /instances/{name}/logs
   getLogs: (name: string, lines?: number) => {
     const params = lines ? `?lines=${lines}` : "";
-    return apiCall<string>(`/instances/${name}/logs${params}`, {}, "text");
+    return apiCall<string>(`/instances/${encodeURIComponent(name)}/logs${params}`, {}, "text");
   },
 
   // GET /instances/{name}/proxy/health
-  getHealth: (name: string) => apiCall<Record<string, unknown>>(`/instances/${name}/proxy/health`),
+  getHealth: (name: string) => apiCall<Record<string, unknown>>(`/instances/${encodeURIComponent(name)}/proxy/health`),
 };
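On the web UI side the fix is the mirror image: instance and node names are percent-encoded with encodeURIComponent before being interpolated into request paths, so a name containing URL-significant characters cannot change which route is hit. Any non-browser client of the same API has the same obligation. A small Go sketch of the idea, using url.PathEscape as the Go counterpart of encodeURIComponent; the name, host, and port are assumptions for illustration only:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// url.PathEscape keeps the name inside a single path segment, the same
	// job encodeURIComponent does in the web UI client.
	name := "model/v1 test" // hypothetical name; the slash and space must not split the path
	endpoint := fmt.Sprintf("http://localhost:8080/api/v1/instances/%s/logs", url.PathEscape(name))
	fmt.Println(endpoint)
	// Prints: http://localhost:8080/api/v1/instances/model%2Fv1%20test/logs
}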