From ffb4b49c9479ad87ba73c5e88796f385929fcbd5 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Mon, 20 Oct 2025 21:55:50 +0200 Subject: [PATCH 01/22] Split manager into multiple structs --- cmd/server/main.go | 2 +- pkg/manager/lifecycle.go | 152 ++++++++++++++ pkg/manager/manager.go | 341 +++++++++++-------------------- pkg/manager/manager_test.go | 12 +- pkg/manager/operations.go | 362 +++++++++++++++++---------------- pkg/manager/operations_test.go | 2 +- pkg/manager/persistence.go | 241 ++++++++++++++++++++++ pkg/manager/ports.go | 184 +++++++++++++++++ pkg/manager/registry.go | 131 ++++++++++++ pkg/manager/remote.go | 283 ++++++++++++++++++++++++++ pkg/manager/remote_ops.go | 222 -------------------- pkg/manager/timeout.go | 74 ------- pkg/manager/timeout_test.go | 2 +- 13 files changed, 1307 insertions(+), 701 deletions(-) create mode 100644 pkg/manager/lifecycle.go create mode 100644 pkg/manager/persistence.go create mode 100644 pkg/manager/ports.go create mode 100644 pkg/manager/registry.go create mode 100644 pkg/manager/remote.go delete mode 100644 pkg/manager/remote_ops.go delete mode 100644 pkg/manager/timeout.go diff --git a/cmd/server/main.go b/cmd/server/main.go index 754b1d7..843a594 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -58,7 +58,7 @@ func main() { } // Initialize the instance manager - instanceManager := manager.NewInstanceManager(cfg.Backends, cfg.Instances, cfg.Nodes, cfg.LocalNode) + instanceManager := manager.New(cfg.Backends, cfg.Instances, cfg.Nodes, cfg.LocalNode) // Create a new handler with the instance manager handler := server.NewHandler(instanceManager, cfg) diff --git a/pkg/manager/lifecycle.go b/pkg/manager/lifecycle.go new file mode 100644 index 0000000..4045022 --- /dev/null +++ b/pkg/manager/lifecycle.go @@ -0,0 +1,152 @@ +package manager + +import ( + "fmt" + "llamactl/pkg/instance" + "log" + "sync" + "time" +) + +// lifecycleManager handles background timeout checking and LRU eviction. +// It properly coordinates shutdown to prevent races with the timeout checker. +type lifecycleManager struct { + registry *instanceRegistry + manager InstanceManager // For calling Stop/Evict operations + + ticker *time.Ticker + checkInterval time.Duration + enableLRU bool + + shutdownChan chan struct{} + shutdownDone chan struct{} + shutdownOnce sync.Once +} + +// NewLifecycleManager creates a new lifecycle manager. +func NewLifecycleManager( + registry *instanceRegistry, + manager InstanceManager, + checkInterval time.Duration, + enableLRU bool, +) *lifecycleManager { + if checkInterval <= 0 { + checkInterval = 5 * time.Minute // Default to 5 minutes + } + + return &lifecycleManager{ + registry: registry, + manager: manager, + ticker: time.NewTicker(checkInterval), + checkInterval: checkInterval, + enableLRU: enableLRU, + shutdownChan: make(chan struct{}), + shutdownDone: make(chan struct{}), + } +} + +// Start begins the timeout checking loop in a goroutine. +func (l *lifecycleManager) Start() { + go l.timeoutCheckLoop() +} + +// Stop gracefully stops the lifecycle manager. +// This ensures the timeout checker completes before instance cleanup begins. +func (l *lifecycleManager) Stop() { + l.shutdownOnce.Do(func() { + close(l.shutdownChan) + <-l.shutdownDone // Wait for checker to finish (prevents shutdown race) + l.ticker.Stop() + }) +} + +// timeoutCheckLoop is the main loop that periodically checks for timeouts. 
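The Start/Stop pair above is the standard Go recipe for tearing down a ticker goroutine without racing it: close a shutdown channel, wait for the worker to signal completion, then stop the ticker. A minimal standalone sketch of the same handshake, with illustrative names only (the real loop follows below):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type worker struct {
	ticker       *time.Ticker
	shutdownChan chan struct{}
	shutdownDone chan struct{}
	stopOnce     sync.Once
}

func newWorker(interval time.Duration) *worker {
	w := &worker{
		ticker:       time.NewTicker(interval),
		shutdownChan: make(chan struct{}),
		shutdownDone: make(chan struct{}),
	}
	go w.loop()
	return w
}

func (w *worker) loop() {
	defer close(w.shutdownDone) // Signal completion to Stop
	for {
		select {
		case <-w.ticker.C:
			fmt.Println("tick") // Stand-in for checkTimeouts
		case <-w.shutdownChan:
			return
		}
	}
}

// Stop is idempotent and returns only after loop has exited, so callers
// can safely tear down anything the loop touches.
func (w *worker) Stop() {
	w.stopOnce.Do(func() {
		close(w.shutdownChan)
		<-w.shutdownDone
		w.ticker.Stop()
	})
}

func main() {
	w := newWorker(10 * time.Millisecond)
	time.Sleep(35 * time.Millisecond)
	w.Stop()
	fmt.Println("stopped cleanly")
}
```

The `sync.Once` keeps Stop idempotent, mirroring the `shutdownOnce` field the manager itself gains later in this patch.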
+func (l *lifecycleManager) timeoutCheckLoop() { + defer close(l.shutdownDone) // Signal completion + + for { + select { + case <-l.ticker.C: + l.checkTimeouts() + case <-l.shutdownChan: + return // Exit goroutine on shutdown + } + } +} + +// checkTimeouts checks all instances for timeout and stops those that have timed out. +func (l *lifecycleManager) checkTimeouts() { + // Get all instances from registry + instances := l.registry.List() + + var timeoutInstances []string + + // Identify instances that should timeout + for _, inst := range instances { + // Skip remote instances - they are managed by their respective nodes + if inst.IsRemote() { + continue + } + + // Only check running instances + if !l.registry.IsRunning(inst.Name) { + continue + } + + if inst.ShouldTimeout() { + timeoutInstances = append(timeoutInstances, inst.Name) + } + } + + // Stop the timed-out instances + for _, name := range timeoutInstances { + log.Printf("Instance %s has timed out, stopping it", name) + if _, err := l.manager.StopInstance(name); err != nil { + log.Printf("Error stopping instance %s: %v", name, err) + } else { + log.Printf("Instance %s stopped successfully", name) + } + } +} + +// EvictLRU finds and stops the least recently used running instance. +// This is called when max running instances limit is reached. +func (l *lifecycleManager) EvictLRU() error { + if !l.enableLRU { + return fmt.Errorf("LRU eviction is not enabled") + } + + // Get all running instances + runningInstances := l.registry.ListRunning() + + var lruInstance *instance.Instance + + for _, inst := range runningInstances { + // Skip remote instances - they are managed by their respective nodes + if inst.IsRemote() { + continue + } + + // Skip instances without idle timeout + if inst.GetOptions() != nil && inst.GetOptions().IdleTimeout != nil && *inst.GetOptions().IdleTimeout <= 0 { + continue + } + + if lruInstance == nil { + lruInstance = inst + } + + if inst.LastRequestTime() < lruInstance.LastRequestTime() { + lruInstance = inst + } + } + + if lruInstance == nil { + return fmt.Errorf("failed to find lru instance") + } + + // Evict the LRU instance + log.Printf("Evicting LRU instance %s", lruInstance.Name) + _, err := l.manager.StopInstance(lruInstance.Name) + return err +} diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 894b8df..5863f46 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -1,15 +1,11 @@ package manager import ( - "encoding/json" + "context" "fmt" "llamactl/pkg/config" "llamactl/pkg/instance" "log" - "net/http" - "os" - "path/filepath" - "strings" "sync" "time" ) @@ -43,243 +39,145 @@ type RemoteManager interface { } type instanceManager struct { - mu sync.RWMutex - instances map[string]*instance.Instance - runningInstances map[string]struct{} - ports map[int]bool - instancesConfig config.InstancesConfig - backendsConfig config.BackendConfig - localNodeName string // Name of the local node + // Components (each with own synchronization) + registry *instanceRegistry + ports *portAllocator + persistence *instancePersister + remote *remoteManager + lifecycle *lifecycleManager - // Timeout checker - timeoutChecker *time.Ticker - shutdownChan chan struct{} - shutdownDone chan struct{} - isShutdown bool + // Configuration + instancesConfig config.InstancesConfig + backendsConfig config.BackendConfig + localNodeName string // Name of the local node - // Remote instance management - httpClient *http.Client - instanceNodeMap map[string]*config.NodeConfig // Maps instance name to its node 
config - nodeConfigMap map[string]*config.NodeConfig // Maps node name to node config for quick lookup + // Synchronization + operationMu sync.Mutex // Protects start/stop/update/delete/restart operations + shutdownOnce sync.Once } -// NewInstanceManager creates a new instance of InstanceManager. -func NewInstanceManager(backendsConfig config.BackendConfig, instancesConfig config.InstancesConfig, nodesConfig map[string]config.NodeConfig, localNodeName string) InstanceManager { +// New creates a new instance of InstanceManager. +func New(backendsConfig config.BackendConfig, instancesConfig config.InstancesConfig, nodesConfig map[string]config.NodeConfig, localNodeName string) InstanceManager { if instancesConfig.TimeoutCheckInterval <= 0 { instancesConfig.TimeoutCheckInterval = 5 // Default to 5 minutes if not set } - // Build node config map for quick lookup - nodeConfigMap := make(map[string]*config.NodeConfig) - for name := range nodesConfig { - nodeCopy := nodesConfig[name] - nodeConfigMap[name] = &nodeCopy + // Initialize components + registry := NewInstanceRegistry() + + // Initialize port allocator + portRange := instancesConfig.PortRange + ports, err := NewPortAllocator(portRange[0], portRange[1]) + if err != nil { + log.Fatalf("Failed to create port allocator: %v", err) } + // Initialize persistence + persistence, err := NewInstancePersister(instancesConfig.InstancesDir) + if err != nil { + log.Fatalf("Failed to create instance persister: %v", err) + } + + // Initialize remote manager + remote := NewRemoteManager(nodesConfig, 30*time.Second) + + // Create manager instance im := &instanceManager{ - instances: make(map[string]*instance.Instance), - runningInstances: make(map[string]struct{}), - ports: make(map[int]bool), - instancesConfig: instancesConfig, - backendsConfig: backendsConfig, - localNodeName: localNodeName, - - timeoutChecker: time.NewTicker(time.Duration(instancesConfig.TimeoutCheckInterval) * time.Minute), - shutdownChan: make(chan struct{}), - shutdownDone: make(chan struct{}), - - httpClient: &http.Client{ - Timeout: 30 * time.Second, - }, - - instanceNodeMap: make(map[string]*config.NodeConfig), - nodeConfigMap: nodeConfigMap, + registry: registry, + ports: ports, + persistence: persistence, + remote: remote, + instancesConfig: instancesConfig, + backendsConfig: backendsConfig, + localNodeName: localNodeName, } + // Initialize lifecycle manager (needs reference to manager for Stop/Evict operations) + checkInterval := time.Duration(instancesConfig.TimeoutCheckInterval) * time.Minute + im.lifecycle = NewLifecycleManager(registry, im, checkInterval, true) + // Load existing instances from disk if err := im.loadInstances(); err != nil { log.Printf("Error loading instances: %v", err) } - // Start the timeout checker goroutine after initialization is complete - go func() { - defer close(im.shutdownDone) - - for { - select { - case <-im.timeoutChecker.C: - im.checkAllTimeouts() - case <-im.shutdownChan: - return // Exit goroutine on shutdown - } - } - }() + // Start the lifecycle manager + im.lifecycle.Start() return im } -func (im *instanceManager) getNextAvailablePort() (int, error) { - portRange := im.instancesConfig.PortRange - - for port := portRange[0]; port <= portRange[1]; port++ { - if !im.ports[port] { - im.ports[port] = true - return port, nil - } - } - - return 0, fmt.Errorf("no available ports in the specified range") -} - -// persistInstance saves an instance to its JSON file -func (im *instanceManager) persistInstance(instance *instance.Instance) error { - 
if im.instancesConfig.InstancesDir == "" { - return nil // Persistence disabled - } - - instancePath := filepath.Join(im.instancesConfig.InstancesDir, instance.Name+".json") - tempPath := instancePath + ".tmp" - - // Serialize instance to JSON - jsonData, err := json.MarshalIndent(instance, "", " ") - if err != nil { - return fmt.Errorf("failed to marshal instance %s: %w", instance.Name, err) - } - - // Write to temporary file first - if err := os.WriteFile(tempPath, jsonData, 0644); err != nil { - return fmt.Errorf("failed to write temp file for instance %s: %w", instance.Name, err) - } - - // Atomic rename - if err := os.Rename(tempPath, instancePath); err != nil { - os.Remove(tempPath) // Clean up temp file - return fmt.Errorf("failed to rename temp file for instance %s: %w", instance.Name, err) - } - - return nil +// persistInstance saves an instance using the persistence component +func (im *instanceManager) persistInstance(inst *instance.Instance) error { + return im.persistence.Save(inst) } func (im *instanceManager) Shutdown() { - im.mu.Lock() + im.shutdownOnce.Do(func() { + // 1. Stop lifecycle manager (stops timeout checker) + im.lifecycle.Stop() - // Check if already shutdown - if im.isShutdown { - im.mu.Unlock() - return - } - im.isShutdown = true + // 2. Get running instances (no lock needed - registry handles it) + running := im.registry.ListRunning() - // Signal the timeout checker to stop - close(im.shutdownChan) - - // Create a list of running instances to stop - var runningInstances []*instance.Instance - var runningNames []string - for name, inst := range im.instances { - if inst.IsRunning() { - runningInstances = append(runningInstances, inst) - runningNames = append(runningNames, name) - } - } - - // Release lock before stopping instances to avoid deadlock - im.mu.Unlock() - - // Wait for the timeout checker goroutine to actually stop - <-im.shutdownDone - - // Now stop the ticker - if im.timeoutChecker != nil { - im.timeoutChecker.Stop() - } - - // Stop instances without holding the manager lock - var wg sync.WaitGroup - wg.Add(len(runningInstances)) - - for i, inst := range runningInstances { - go func(name string, inst *instance.Instance) { - defer wg.Done() - fmt.Printf("Stopping instance %s...\n", name) - // Attempt to stop the instance gracefully - if err := inst.Stop(); err != nil { - fmt.Printf("Error stopping instance %s: %v\n", name, err) + // 3. 
Stop local instances concurrently + var wg sync.WaitGroup + for _, inst := range running { + if inst.IsRemote() { + continue // Skip remote instances } - }(runningNames[i], inst) - } - - wg.Wait() - fmt.Println("All instances stopped.") + wg.Add(1) + go func(inst *instance.Instance) { + defer wg.Done() + fmt.Printf("Stopping instance %s...\n", inst.Name) + if err := inst.Stop(); err != nil { + fmt.Printf("Error stopping instance %s: %v\n", inst.Name, err) + } + }(inst) + } + wg.Wait() + fmt.Println("All instances stopped.") + }) } -// loadInstances restores all instances from disk +// loadInstances restores all instances from disk using the persistence component func (im *instanceManager) loadInstances() error { - if im.instancesConfig.InstancesDir == "" { - return nil // Persistence disabled - } - - // Check if instances directory exists - if _, err := os.Stat(im.instancesConfig.InstancesDir); os.IsNotExist(err) { - return nil // No instances directory, start fresh - } - - // Read all JSON files from instances directory - files, err := os.ReadDir(im.instancesConfig.InstancesDir) + // Load all instances from persistence + instances, err := im.persistence.LoadAll() if err != nil { - return fmt.Errorf("failed to read instances directory: %w", err) + return fmt.Errorf("failed to load instances: %w", err) } - loadedCount := 0 - for _, file := range files { - if file.IsDir() || !strings.HasSuffix(file.Name(), ".json") { + if len(instances) == 0 { + return nil + } + + // Process each loaded instance + for _, persistedInst := range instances { + if err := im.loadInstance(persistedInst); err != nil { + log.Printf("Failed to load instance %s: %v", persistedInst.Name, err) continue } - - instanceName := strings.TrimSuffix(file.Name(), ".json") - instancePath := filepath.Join(im.instancesConfig.InstancesDir, file.Name()) - - if err := im.loadInstance(instanceName, instancePath); err != nil { - log.Printf("Failed to load instance %s: %v", instanceName, err) - continue - } - - loadedCount++ } - if loadedCount > 0 { - log.Printf("Loaded %d instances from persistence", loadedCount) - // Auto-start instances that have auto-restart enabled - go im.autoStartInstances() - } + log.Printf("Loaded %d instances from persistence", len(instances)) + + // Auto-start instances that have auto-restart enabled + go im.autoStartInstances() return nil } -// loadInstance loads a single instance from its JSON file -func (im *instanceManager) loadInstance(name, path string) error { - data, err := os.ReadFile(path) - if err != nil { - return fmt.Errorf("failed to read instance file: %w", err) - } - - var persistedInstance instance.Instance - if err := json.Unmarshal(data, &persistedInstance); err != nil { - return fmt.Errorf("failed to unmarshal instance: %w", err) - } - - // Validate the instance name matches the filename - if persistedInstance.Name != name { - return fmt.Errorf("instance name mismatch: file=%s, instance.Name=%s", name, persistedInstance.Name) - } - - options := persistedInstance.GetOptions() +// loadInstance loads a single persisted instance and adds it to the registry +func (im *instanceManager) loadInstance(persistedInst *instance.Instance) error { + name := persistedInst.Name + options := persistedInst.GetOptions() // Check if this is a remote instance (local node not in the Nodes set) var isRemote bool var nodeName string if options != nil { - if _, isLocal := options.Nodes[im.localNodeName]; !isLocal { + if _, isLocal := options.Nodes[im.localNodeName]; !isLocal && len(options.Nodes) > 0 { // Get the 
first node from the set for node := range options.Nodes { nodeName = node @@ -293,7 +191,7 @@ func (im *instanceManager) loadInstance(name, path string) error { if !isRemote { // Only set status callback for local instances statusCallback = func(oldStatus, newStatus instance.Status) { - im.onStatusChange(persistedInstance.Name, oldStatus, newStatus) + im.onStatusChange(name, oldStatus, newStatus) } } @@ -301,38 +199,42 @@ func (im *instanceManager) loadInstance(name, path string) error { inst := instance.New(name, &im.backendsConfig, &im.instancesConfig, options, im.localNodeName, statusCallback) // Restore persisted fields that NewInstance doesn't set - inst.Created = persistedInstance.Created - inst.SetStatus(persistedInstance.GetStatus()) + inst.Created = persistedInst.Created + inst.SetStatus(persistedInst.GetStatus()) // Handle remote instance mapping if isRemote { - nodeConfig, exists := im.nodeConfigMap[nodeName] - if !exists { - return fmt.Errorf("node %s not found for remote instance %s", nodeName, name) + // Map instance to node in remote manager + if err := im.remote.SetInstanceNode(name, nodeName); err != nil { + return fmt.Errorf("failed to set instance node: %w", err) } - im.instanceNodeMap[name] = nodeConfig } else { - // Check for port conflicts only for local instances + // Allocate port for local instances if inst.GetPort() > 0 { port := inst.GetPort() - if im.ports[port] { - return fmt.Errorf("port conflict: instance %s wants port %d which is already in use", name, port) + if err := im.ports.AllocateSpecific(port, name); err != nil { + return fmt.Errorf("port conflict: instance %s wants port %d which is already in use: %w", name, port, err) } - im.ports[port] = true } } - im.instances[name] = inst + // Add instance to registry + if err := im.registry.Add(inst); err != nil { + return fmt.Errorf("failed to add instance to registry: %w", err) + } + return nil } // autoStartInstances starts instances that were running when persisted and have auto-restart enabled // For instances with auto-restart disabled, it sets their status to Stopped func (im *instanceManager) autoStartInstances() { - im.mu.RLock() + instances := im.registry.List() + var instancesToStart []*instance.Instance var instancesToStop []*instance.Instance - for _, inst := range im.instances { + + for _, inst := range instances { if inst.IsRunning() && // Was running when persisted inst.GetOptions() != nil && inst.GetOptions().AutoRestart != nil { @@ -344,12 +246,12 @@ func (im *instanceManager) autoStartInstances() { } } } - im.mu.RUnlock() // Stop instances that have auto-restart disabled for _, inst := range instancesToStop { log.Printf("Instance %s was running but auto-restart is disabled, setting status to stopped", inst.Name) inst.SetStatus(instance.Stopped) + im.registry.MarkStopped(inst.Name) } // Start instances that have auto-restart enabled @@ -357,11 +259,13 @@ func (im *instanceManager) autoStartInstances() { log.Printf("Auto-starting instance %s", inst.Name) // Reset running state before starting (since Start() expects stopped instance) inst.SetStatus(instance.Stopped) + im.registry.MarkStopped(inst.Name) // Check if this is a remote instance - if node := im.getNodeForInstance(inst); node != nil { - // Remote instance - use StartRemoteInstance - if _, err := im.StartRemoteInstance(node, inst.Name); err != nil { + if node, exists := im.remote.GetNodeForInstance(inst.Name); exists && node != nil { + // Remote instance - use remote manager with context + ctx := context.Background() + if _, err := 
im.remote.StartInstance(ctx, node, inst.Name); err != nil { log.Printf("Failed to auto-start remote instance %s: %v", inst.Name, err) } } else { @@ -374,13 +278,10 @@ func (im *instanceManager) autoStartInstances() { } func (im *instanceManager) onStatusChange(name string, oldStatus, newStatus instance.Status) { - im.mu.Lock() - defer im.mu.Unlock() - if newStatus == instance.Running { - im.runningInstances[name] = struct{}{} + im.registry.MarkRunning(name) } else { - delete(im.runningInstances, name) + im.registry.MarkStopped(name) } } @@ -391,8 +292,8 @@ func (im *instanceManager) getNodeForInstance(inst *instance.Instance) *config.N return nil } - // Check if we have a cached mapping - if nodeConfig, exists := im.instanceNodeMap[inst.Name]; exists { + // Check if we have a node mapping in remote manager + if nodeConfig, exists := im.remote.GetNodeForInstance(inst.Name); exists { return nodeConfig } diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go index 8c1be1c..997e0f6 100644 --- a/pkg/manager/manager_test.go +++ b/pkg/manager/manager_test.go @@ -33,7 +33,7 @@ func TestNewInstanceManager(t *testing.T) { TimeoutCheckInterval: 5, } - mgr := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") if mgr == nil { t.Fatal("NewInstanceManager returned nil") } @@ -68,7 +68,7 @@ func TestPersistence(t *testing.T) { } // Test instance persistence on creation - manager1 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + manager1 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") options := &instance.Options{ BackendOptions: backends.Options{ BackendType: backends.BackendTypeLlamaCpp, @@ -91,7 +91,7 @@ func TestPersistence(t *testing.T) { } // Test loading instances from disk - manager2 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + manager2 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") instances, err := manager2.ListInstances() if err != nil { t.Fatalf("ListInstances failed: %v", err) @@ -212,7 +212,7 @@ func createTestManager() manager.InstanceManager { DefaultRestartDelay: 5, TimeoutCheckInterval: 5, } - return manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + return manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") } func TestAutoRestartDisabledInstanceStatus(t *testing.T) { @@ -232,7 +232,7 @@ func TestAutoRestartDisabledInstanceStatus(t *testing.T) { } // Create first manager and instance with auto-restart disabled - manager1 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + manager1 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") autoRestart := false options := &instance.Options{ @@ -259,7 +259,7 @@ func TestAutoRestartDisabledInstanceStatus(t *testing.T) { manager1.Shutdown() // Create second manager (simulating restart of llamactl) - manager2 := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + manager2 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") // Get the loaded instance loadedInst, err := manager2.GetInstance("test-instance") diff --git a/pkg/manager/operations.go b/pkg/manager/operations.go index fd150e8..51099ad 100644 --- a/pkg/manager/operations.go +++ b/pkg/manager/operations.go @@ -1,44 +1,28 @@ package manager 
import ( + "context" "fmt" "llamactl/pkg/instance" "llamactl/pkg/validation" - "os" - "path/filepath" + "log" ) type MaxRunningInstancesError error // updateLocalInstanceFromRemote updates the local stub instance with data from the remote instance -// while preserving the Nodes field to maintain remote instance tracking func (im *instanceManager) updateLocalInstanceFromRemote(localInst *instance.Instance, remoteInst *instance.Instance) { if localInst == nil || remoteInst == nil { return } - // Get the remote instance options remoteOptions := remoteInst.GetOptions() if remoteOptions == nil { return } - // Preserve the Nodes field from the local instance - localOptions := localInst.GetOptions() - var preservedNodes map[string]struct{} - if localOptions != nil && len(localOptions.Nodes) > 0 { - preservedNodes = make(map[string]struct{}, len(localOptions.Nodes)) - for node := range localOptions.Nodes { - preservedNodes[node] = struct{}{} - } - } - - // Create a copy of remote options and restore the Nodes field - updatedOptions := *remoteOptions - updatedOptions.Nodes = preservedNodes - // Update the local instance with all remote data - localInst.SetOptions(&updatedOptions) + localInst.SetOptions(remoteOptions) localInst.SetStatus(remoteInst.GetStatus()) localInst.Created = remoteInst.Created } @@ -46,17 +30,13 @@ func (im *instanceManager) updateLocalInstanceFromRemote(localInst *instance.Ins // ListInstances returns a list of all instances managed by the instance manager. // For remote instances, this fetches the live state from remote nodes and updates local stubs. func (im *instanceManager) ListInstances() ([]*instance.Instance, error) { - im.mu.RLock() - localInstances := make([]*instance.Instance, 0, len(im.instances)) - for _, inst := range im.instances { - localInstances = append(localInstances, inst) - } - im.mu.RUnlock() + instances := im.registry.List() // Update remote instances with live state - for _, inst := range localInstances { + ctx := context.Background() + for _, inst := range instances { if node := im.getNodeForInstance(inst); node != nil { - remoteInst, err := im.GetRemoteInstance(node, inst.Name) + remoteInst, err := im.remote.GetInstance(ctx, node, inst.Name) if err != nil { // Log error but continue with stale data // Don't fail the entire list operation due to one remote failure @@ -64,13 +44,11 @@ func (im *instanceManager) ListInstances() ([]*instance.Instance, error) { } // Update the local stub with all remote data (preserving Nodes) - im.mu.Lock() im.updateLocalInstanceFromRemote(inst, remoteInst) - im.mu.Unlock() } } - return localInstances, nil + return instances, nil } // CreateInstance creates a new instance with the given options and returns it. 
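Between the read paths above and the create path below, a usage sketch in the style of this package's own tests; the config literals are placeholders, and the `[2]int` shape of `PortRange` is an assumption inferred from its indexing in `New`:

```go
package manager_test // sketch in the style of pkg/manager/manager_test.go

import (
	"testing"

	"llamactl/pkg/config"
	"llamactl/pkg/manager"
)

func TestListSketch(t *testing.T) {
	cfg := config.InstancesConfig{
		PortRange:            [2]int{8000, 9000}, // field shape assumed
		TimeoutCheckInterval: 5,
	}
	mgr := manager.New(config.BackendConfig{}, cfg, map[string]config.NodeConfig{}, "main")
	defer mgr.Shutdown()

	// With no nodes configured, everything listed is local; remote stubs
	// would be refreshed from their nodes here, tolerating node outages.
	instances, err := mgr.ListInstances()
	if err != nil {
		t.Fatalf("ListInstances failed: %v", err)
	}
	if len(instances) != 0 {
		t.Fatalf("expected empty manager, got %d instances", len(instances))
	}
}
```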
@@ -90,11 +68,8 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options return nil, err } - im.mu.Lock() - defer im.mu.Unlock() - // Check if instance with this name already exists (must be globally unique) - if im.instances[name] != nil { + if _, exists := im.registry.Get(name); exists { return nil, fmt.Errorf("instance with name %s already exists", name) } @@ -107,14 +82,18 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options break } - // Validate that the node exists - nodeConfig, exists := im.nodeConfigMap[nodeName] + // Create the remote instance on the remote node + ctx := context.Background() + nodeConfig, exists := im.remote.GetNodeForInstance(nodeName) if !exists { - return nil, fmt.Errorf("node %s not found", nodeName) + // Try to set the node if it doesn't exist yet + if err := im.remote.SetInstanceNode(name, nodeName); err != nil { + return nil, fmt.Errorf("node %s not found", nodeName) + } + nodeConfig, _ = im.remote.GetNodeForInstance(name) } - // Create the remote instance on the remote node - remoteInst, err := im.CreateRemoteInstance(nodeConfig, name, options) + remoteInst, err := im.remote.CreateInstance(ctx, nodeConfig, name, options) if err != nil { return nil, err } @@ -126,12 +105,20 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options // Update the local stub with all remote data (preserving Nodes) im.updateLocalInstanceFromRemote(inst, remoteInst) - // Add to local tracking maps (but don't count towards limits) - im.instances[name] = inst - im.instanceNodeMap[name] = nodeConfig + // Map instance to node + if err := im.remote.SetInstanceNode(name, nodeName); err != nil { + return nil, fmt.Errorf("failed to map instance to node: %w", err) + } + + // Add to registry (doesn't count towards local limits) + if err := im.registry.Add(inst); err != nil { + return nil, fmt.Errorf("failed to add instance to registry: %w", err) + } // Persist the remote instance locally for tracking across restarts if err := im.persistInstance(inst); err != nil { + // Rollback: remove from registry + im.registry.Remove(name) return nil, fmt.Errorf("failed to persist remote instance %s: %w", name, err) } @@ -140,14 +127,34 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options // Local instance creation // Check max instances limit for local instances only - localInstanceCount := len(im.instances) - len(im.instanceNodeMap) + totalInstances := im.registry.Count() + remoteCount := 0 + for _, inst := range im.registry.List() { + if inst.IsRemote() { + remoteCount++ + } + } + localInstanceCount := totalInstances - remoteCount if localInstanceCount >= im.instancesConfig.MaxInstances && im.instancesConfig.MaxInstances != -1 { return nil, fmt.Errorf("maximum number of instances (%d) reached", im.instancesConfig.MaxInstances) } // Assign and validate port for backend-specific options - if err := im.assignAndValidatePort(options); err != nil { - return nil, err + currentPort := im.getPortFromOptions(options) + var allocatedPort int + if currentPort == 0 { + // Allocate a port if not specified + allocatedPort, err = im.ports.Allocate(name) + if err != nil { + return nil, fmt.Errorf("failed to allocate port: %w", err) + } + im.setPortInOptions(options, allocatedPort) + } else { + // Use the specified port + if err := im.ports.AllocateSpecific(currentPort, name); err != nil { + return nil, fmt.Errorf("port %d is already in use: %w", currentPort, err) + } + allocatedPort = currentPort } 
statusCallback := func(oldStatus, newStatus instance.Status) { @@ -155,10 +162,17 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options } inst := instance.New(name, &im.backendsConfig, &im.instancesConfig, options, im.localNodeName, statusCallback) - im.instances[inst.Name] = inst + // Add to registry + if err := im.registry.Add(inst); err != nil { + // Rollback: release port + im.ports.Release(allocatedPort) + return nil, fmt.Errorf("failed to add instance to registry: %w", err) + } + + // Persist instance (best-effort, don't fail if persistence fails) if err := im.persistInstance(inst); err != nil { - return nil, fmt.Errorf("failed to persist instance %s: %w", name, err) + log.Printf("Warning: failed to persist instance %s: %v", name, err) } return inst, nil @@ -167,25 +181,21 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options // GetInstance retrieves an instance by its name. // For remote instances, this fetches the live state from the remote node and updates the local stub. func (im *instanceManager) GetInstance(name string) (*instance.Instance, error) { - im.mu.RLock() - inst, exists := im.instances[name] - im.mu.RUnlock() - + inst, exists := im.registry.Get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } // Check if instance is remote and fetch live state if node := im.getNodeForInstance(inst); node != nil { - remoteInst, err := im.GetRemoteInstance(node, name) + ctx := context.Background() + remoteInst, err := im.remote.GetInstance(ctx, node, name) if err != nil { return nil, err } // Update the local stub with all remote data (preserving Nodes) - im.mu.Lock() im.updateLocalInstanceFromRemote(inst, remoteInst) - im.mu.Unlock() // Return the local stub (preserving Nodes field) return inst, nil @@ -197,29 +207,23 @@ func (im *instanceManager) GetInstance(name string) (*instance.Instance, error) // UpdateInstance updates the options of an existing instance and returns it. // If the instance is running, it will be restarted to apply the new options. 
func (im *instanceManager) UpdateInstance(name string, options *instance.Options) (*instance.Instance, error) { - im.mu.RLock() - inst, exists := im.instances[name] - im.mu.RUnlock() - + inst, exists := im.registry.Get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { - remoteInst, err := im.UpdateRemoteInstance(node, name, options) + ctx := context.Background() + remoteInst, err := im.remote.UpdateInstance(ctx, node, name, options) if err != nil { return nil, err } // Update the local stub with all remote data (preserving Nodes) - im.mu.Lock() im.updateLocalInstanceFromRemote(inst, remoteInst) - im.mu.Unlock() // Persist the updated remote instance locally - im.mu.Lock() - defer im.mu.Unlock() if err := im.persistInstance(inst); err != nil { return nil, fmt.Errorf("failed to persist updated remote instance %s: %w", name, err) } @@ -236,6 +240,42 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options return nil, err } + // Lock for local instance operations to prevent races + im.operationMu.Lock() + defer im.operationMu.Unlock() + + // Handle port changes + oldPort := inst.GetPort() + newPort := im.getPortFromOptions(options) + var allocatedPort int + + if newPort != oldPort { + // Port is changing - need to release old and allocate new + if newPort == 0 { + // Auto-allocate new port + allocatedPort, err = im.ports.Allocate(name) + if err != nil { + return nil, fmt.Errorf("failed to allocate new port: %w", err) + } + im.setPortInOptions(options, allocatedPort) + } else { + // Use specified port + if err := im.ports.AllocateSpecific(newPort, name); err != nil { + return nil, fmt.Errorf("failed to allocate port %d: %w", newPort, err) + } + allocatedPort = newPort + } + + // Release old port + if oldPort > 0 { + if err := im.ports.Release(oldPort); err != nil { + // Rollback new port allocation + im.ports.Release(allocatedPort) + return nil, fmt.Errorf("failed to release old port %d: %w", oldPort, err) + } + } + } + // Check if instance is running before updating options wasRunning := inst.IsRunning() @@ -256,8 +296,6 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options } } - im.mu.Lock() - defer im.mu.Unlock() if err := im.persistInstance(inst); err != nil { return nil, fmt.Errorf("failed to persist updated instance %s: %w", name, err) } @@ -267,60 +305,50 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options // DeleteInstance removes stopped instance by its name. 
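Since DeleteInstance (below) refuses to remove a running instance, a caller that wants force-delete semantics composes the two operations itself. A hypothetical helper, not part of this patch:

```go
package server // hypothetical caller package, not part of this patch

import (
	"log"

	"llamactl/pkg/manager"
)

// forceDelete composes stop-then-delete, since DeleteInstance refuses to
// remove a running instance. Helper and package names are illustrative.
func forceDelete(mgr manager.InstanceManager, name string) error {
	if _, err := mgr.StopInstance(name); err != nil {
		// Already-stopped instances return an error here; log and move on.
		log.Printf("stop %s: %v", name, err)
	}
	return mgr.DeleteInstance(name)
}
```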
func (im *instanceManager) DeleteInstance(name string) error { - im.mu.Lock() - inst, exists := im.instances[name] - im.mu.Unlock() - + inst, exists := im.registry.Get(name) if !exists { return fmt.Errorf("instance with name %s not found", name) } // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { - err := im.DeleteRemoteInstance(node, name) + ctx := context.Background() + err := im.remote.DeleteInstance(ctx, node, name) if err != nil { return err } // Clean up local tracking - im.mu.Lock() - defer im.mu.Unlock() - delete(im.instances, name) - delete(im.instanceNodeMap, name) + im.remote.RemoveInstance(name) + im.registry.Remove(name) - // Delete the instance's config file if persistence is enabled - // Re-validate instance name for security (defense in depth) - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return fmt.Errorf("invalid instance name for file deletion: %w", err) - } - instancePath := filepath.Join(im.instancesConfig.InstancesDir, validatedName+".json") - if err := os.Remove(instancePath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to delete config file for remote instance %s: %w", validatedName, err) + // Delete the instance's persistence file + if err := im.persistence.Delete(name); err != nil { + return fmt.Errorf("failed to delete config file for remote instance %s: %w", name, err) } return nil } + // Lock for local instance operations to prevent races + im.operationMu.Lock() + defer im.operationMu.Unlock() + if inst.IsRunning() { return fmt.Errorf("instance with name %s is still running, stop it before deleting", name) } - im.mu.Lock() - defer im.mu.Unlock() + // Release port (use ReleaseByInstance for proper cleanup) + im.ports.ReleaseByInstance(name) - delete(im.ports, inst.GetPort()) - delete(im.instances, name) - - // Delete the instance's config file if persistence is enabled - // Re-validate instance name for security (defense in depth) - validatedName, err := validation.ValidateInstanceName(inst.Name) - if err != nil { - return fmt.Errorf("invalid instance name for file deletion: %w", err) + // Remove from registry + if err := im.registry.Remove(name); err != nil { + return fmt.Errorf("failed to remove instance from registry: %w", err) } - instancePath := filepath.Join(im.instancesConfig.InstancesDir, validatedName+".json") - if err := os.Remove(instancePath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to delete config file for instance %s: %w", validatedName, err) + + // Delete persistence file + if err := im.persistence.Delete(name); err != nil { + return fmt.Errorf("failed to delete config file for instance %s: %w", name, err) } return nil @@ -329,45 +357,35 @@ func (im *instanceManager) DeleteInstance(name string) error { // StartInstance starts a stopped instance and returns it. // If the instance is already running, it returns an error. 
func (im *instanceManager) StartInstance(name string) (*instance.Instance, error) { - im.mu.RLock() - inst, exists := im.instances[name] - im.mu.RUnlock() - + inst, exists := im.registry.Get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { - remoteInst, err := im.StartRemoteInstance(node, name) + ctx := context.Background() + remoteInst, err := im.remote.StartInstance(ctx, node, name) if err != nil { return nil, err } // Update the local stub with all remote data (preserving Nodes) - im.mu.Lock() im.updateLocalInstanceFromRemote(inst, remoteInst) - im.mu.Unlock() return inst, nil } + // Lock for local instance operations to prevent races + im.operationMu.Lock() + defer im.operationMu.Unlock() + if inst.IsRunning() { return inst, fmt.Errorf("instance with name %s is already running", name) } // Check max running instances limit for local instances only - im.mu.RLock() - localRunningCount := 0 - for instName := range im.runningInstances { - if _, isRemote := im.instanceNodeMap[instName]; !isRemote { - localRunningCount++ - } - } - maxRunningExceeded := localRunningCount >= im.instancesConfig.MaxRunningInstances && im.instancesConfig.MaxRunningInstances != -1 - im.mu.RUnlock() - - if maxRunningExceeded { + if im.IsMaxRunningInstancesReached() { return nil, MaxRunningInstancesError(fmt.Errorf("maximum number of running instances (%d) reached", im.instancesConfig.MaxRunningInstances)) } @@ -375,52 +393,55 @@ func (im *instanceManager) StartInstance(name string) (*instance.Instance, error return nil, fmt.Errorf("failed to start instance %s: %w", name, err) } - im.mu.Lock() - defer im.mu.Unlock() - err := im.persistInstance(inst) - if err != nil { - return nil, fmt.Errorf("failed to persist instance %s: %w", name, err) + // Persist instance (best-effort, don't fail if persistence fails) + if err := im.persistInstance(inst); err != nil { + log.Printf("Warning: failed to persist instance %s: %v", name, err) } return inst, nil } func (im *instanceManager) IsMaxRunningInstancesReached() bool { - im.mu.RLock() - defer im.mu.RUnlock() - - if im.instancesConfig.MaxRunningInstances != -1 && len(im.runningInstances) >= im.instancesConfig.MaxRunningInstances { - return true + if im.instancesConfig.MaxRunningInstances == -1 { + return false } - return false + // Count only local running instances (each node has its own limits) + localRunningCount := 0 + for _, inst := range im.registry.ListRunning() { + if !inst.IsRemote() { + localRunningCount++ + } + } + + return localRunningCount >= im.instancesConfig.MaxRunningInstances } // StopInstance stops a running instance and returns it. 
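The running-instance limit above is enforced per node, and EvictLRU (in lifecycle.go) frees capacity by stopping the least recently used instance. One plausible caller-side pattern, assuming both helpers are exposed on the InstanceManager interface:

```go
package server // hypothetical caller package, not part of this patch

import (
	"fmt"

	"llamactl/pkg/instance"
	"llamactl/pkg/manager"
)

// startWithEviction makes room before starting when the local running
// limit has been reached. It assumes IsMaxRunningInstancesReached and
// EvictLRUInstance are part of the InstanceManager interface.
func startWithEviction(mgr manager.InstanceManager, name string) (*instance.Instance, error) {
	if mgr.IsMaxRunningInstancesReached() {
		// Stops the least recently used local instance with an idle timeout.
		if err := mgr.EvictLRUInstance(); err != nil {
			return nil, fmt.Errorf("no capacity and eviction failed: %w", err)
		}
	}
	// Check-then-start is racy across concurrent callers, but StartInstance
	// re-checks the limit under operationMu, so the worst case is an error.
	return mgr.StartInstance(name)
}
```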
func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) { - im.mu.RLock() - inst, exists := im.instances[name] - im.mu.RUnlock() - + inst, exists := im.registry.Get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { - remoteInst, err := im.StopRemoteInstance(node, name) + ctx := context.Background() + remoteInst, err := im.remote.StopInstance(ctx, node, name) if err != nil { return nil, err } // Update the local stub with all remote data (preserving Nodes) - im.mu.Lock() im.updateLocalInstanceFromRemote(inst, remoteInst) - im.mu.Unlock() return inst, nil } + // Lock for local instance operations to prevent races + im.operationMu.Lock() + defer im.operationMu.Unlock() + if !inst.IsRunning() { return inst, fmt.Errorf("instance with name %s is already stopped", name) } @@ -429,11 +450,9 @@ func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) return nil, fmt.Errorf("failed to stop instance %s: %w", name, err) } - im.mu.Lock() - defer im.mu.Unlock() - err := im.persistInstance(inst) - if err != nil { - return nil, fmt.Errorf("failed to persist instance %s: %w", name, err) + // Persist instance (best-effort, don't fail if persistence fails) + if err := im.persistInstance(inst); err != nil { + log.Printf("Warning: failed to persist instance %s: %v", name, err) } return inst, nil @@ -441,49 +460,60 @@ func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) // RestartInstance stops and then starts an instance, returning the updated instance. func (im *instanceManager) RestartInstance(name string) (*instance.Instance, error) { - im.mu.RLock() - inst, exists := im.instances[name] - im.mu.RUnlock() - + inst, exists := im.registry.Get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { - remoteInst, err := im.RestartRemoteInstance(node, name) + ctx := context.Background() + remoteInst, err := im.remote.RestartInstance(ctx, node, name) if err != nil { return nil, err } // Update the local stub with all remote data (preserving Nodes) - im.mu.Lock() im.updateLocalInstanceFromRemote(inst, remoteInst) - im.mu.Unlock() return inst, nil } - inst, err := im.StopInstance(name) - if err != nil { - return nil, err + // Lock for the entire restart operation to ensure atomicity + im.operationMu.Lock() + defer im.operationMu.Unlock() + + // Stop the instance + if inst.IsRunning() { + if err := inst.Stop(); err != nil { + return nil, fmt.Errorf("failed to stop instance %s: %w", name, err) + } } - return im.StartInstance(inst.Name) + + // Start the instance + if err := inst.Start(); err != nil { + return nil, fmt.Errorf("failed to start instance %s: %w", name, err) + } + + // Persist the restarted instance + if err := im.persistInstance(inst); err != nil { + log.Printf("Warning: failed to persist instance %s: %v", name, err) + } + + return inst, nil } // GetInstanceLogs retrieves the logs for a specific instance by its name. 
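A note on the restart path above: RestartInstance calls inst.Stop and inst.Start directly rather than delegating to StopInstance and StartInstance, because those methods also acquire operationMu and Go's sync.Mutex is not reentrant, so delegating would self-deadlock. A standalone illustration of the resulting lock discipline, with toy types:

```go
package main

import (
	"fmt"
	"sync"
)

// Illustrative only: lock in the exported method, do the work in
// unexported helpers that assume the lock is already held. Calling
// Stop from inside Restart would deadlock on the non-reentrant mutex.
type proc struct {
	mu      sync.Mutex
	running bool
}

func (p *proc) Stop() {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.stop()
}

func (p *proc) Restart() {
	p.mu.Lock()
	defer p.mu.Unlock() // One critical section covers stop and start
	p.stop()
	p.start()
}

func (p *proc) stop()  { p.running = false }
func (p *proc) start() { p.running = true }

func main() {
	p := &proc{running: true}
	p.Restart()
	fmt.Println("running:", p.running)
}
```

Holding one lock across both phases also makes the restart atomic: no concurrent operation can observe the instance in its transiently stopped state.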
func (im *instanceManager) GetInstanceLogs(name string, numLines int) (string, error) { - im.mu.RLock() - inst, exists := im.instances[name] - im.mu.RUnlock() - + inst, exists := im.registry.Get(name) if !exists { return "", fmt.Errorf("instance with name %s not found", name) } // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { - return im.GetRemoteInstanceLogs(node, name, numLines) + ctx := context.Background() + return im.remote.GetInstanceLogs(ctx, node, name, numLines) } // Get logs from the local instance @@ -500,27 +530,7 @@ func (im *instanceManager) setPortInOptions(options *instance.Options, port int) options.BackendOptions.SetPort(port) } -// assignAndValidatePort assigns a port if not specified and validates it's not in use -func (im *instanceManager) assignAndValidatePort(options *instance.Options) error { - currentPort := im.getPortFromOptions(options) - - if currentPort == 0 { - // Assign a port if not specified - port, err := im.getNextAvailablePort() - if err != nil { - return fmt.Errorf("failed to get next available port: %w", err) - } - im.setPortInOptions(options, port) - // Mark the port as used - im.ports[port] = true - } else { - // Validate the specified port - if _, exists := im.ports[currentPort]; exists { - return fmt.Errorf("port %d is already in use", currentPort) - } - // Mark the port as used - im.ports[currentPort] = true - } - - return nil +// EvictLRUInstance finds and stops the least recently used running instance. +func (im *instanceManager) EvictLRUInstance() error { + return im.lifecycle.EvictLRU() } diff --git a/pkg/manager/operations_test.go b/pkg/manager/operations_test.go index 3a0651d..7523c5b 100644 --- a/pkg/manager/operations_test.go +++ b/pkg/manager/operations_test.go @@ -78,7 +78,7 @@ func TestCreateInstance_ValidationAndLimits(t *testing.T) { MaxInstances: 1, // Very low limit for testing TimeoutCheckInterval: 5, } - limitedManager := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + limitedManager := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") _, err = limitedManager.CreateInstance("instance1", options) if err != nil { diff --git a/pkg/manager/persistence.go b/pkg/manager/persistence.go new file mode 100644 index 0000000..a2853cb --- /dev/null +++ b/pkg/manager/persistence.go @@ -0,0 +1,241 @@ +package manager + +import ( + "encoding/json" + "fmt" + "llamactl/pkg/instance" + "log" + "os" + "path/filepath" + "strings" + "sync" +) + +// instancePersister provides atomic file-based persistence with durability guarantees. +type instancePersister struct { + mu sync.Mutex + instancesDir string + enabled bool +} + +// NewInstancePersister creates a new instance persister. +// If instancesDir is empty, persistence is disabled. 
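As the constructor comment above says, an empty directory disables persistence, and every operation then degrades to a silent no-op, so call sites need no feature checks. A hypothetical in-package test sketch exercising the methods defined just below:

```go
package manager

import "testing"

// Hypothetical in-package test, not part of this patch. With persistence
// disabled, Save and Delete are silent no-ops (Save returns before even
// its nil check), while Load reports the disabled state explicitly.
func TestPersisterDisabledSketch(t *testing.T) {
	p, err := NewInstancePersister("") // empty dir disables persistence
	if err != nil {
		t.Fatal(err)
	}
	if err := p.Save(nil); err != nil {
		t.Fatalf("Save should no-op when disabled: %v", err)
	}
	if err := p.Delete("anything"); err != nil {
		t.Fatalf("Delete should no-op when disabled: %v", err)
	}
	if _, err := p.Load("anything"); err == nil {
		t.Fatal("Load should report that persistence is disabled")
	}
}
```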
+func NewInstancePersister(instancesDir string) (*instancePersister, error) { + if instancesDir == "" { + return &instancePersister{ + enabled: false, + }, nil + } + + // Ensure the instances directory exists + if err := os.MkdirAll(instancesDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create instances directory: %w", err) + } + + return &instancePersister{ + instancesDir: instancesDir, + enabled: true, + }, nil +} + +// Save persists an instance to disk with atomic write +func (p *instancePersister) Save(inst *instance.Instance) error { + if !p.enabled { + return nil + } + + if inst == nil { + return fmt.Errorf("cannot save nil instance") + } + + // Validate instance name to prevent path traversal + if err := p.validateInstanceName(inst.Name); err != nil { + return err + } + + p.mu.Lock() + defer p.mu.Unlock() + + instancePath := filepath.Join(p.instancesDir, inst.Name+".json") + tempPath := instancePath + ".tmp" + + // Serialize instance to JSON + jsonData, err := json.MarshalIndent(inst, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal instance %s: %w", inst.Name, err) + } + + // Create temporary file + tempFile, err := os.OpenFile(tempPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + return fmt.Errorf("failed to create temp file for instance %s: %w", inst.Name, err) + } + + // Write data to temporary file + if _, err := tempFile.Write(jsonData); err != nil { + tempFile.Close() + os.Remove(tempPath) + return fmt.Errorf("failed to write temp file for instance %s: %w", inst.Name, err) + } + + // Sync to disk before rename to ensure durability + if err := tempFile.Sync(); err != nil { + tempFile.Close() + os.Remove(tempPath) + return fmt.Errorf("failed to sync temp file for instance %s: %w", inst.Name, err) + } + + // Close the file + if err := tempFile.Close(); err != nil { + os.Remove(tempPath) + return fmt.Errorf("failed to close temp file for instance %s: %w", inst.Name, err) + } + + // Atomic rename (this is atomic on POSIX systems) + if err := os.Rename(tempPath, instancePath); err != nil { + os.Remove(tempPath) + return fmt.Errorf("failed to rename temp file for instance %s: %w", inst.Name, err) + } + + return nil +} + +// Load loads a single instance from disk by name. +func (p *instancePersister) Load(name string) (*instance.Instance, error) { + if !p.enabled { + return nil, fmt.Errorf("persistence is disabled") + } + + if err := p.validateInstanceName(name); err != nil { + return nil, err + } + + p.mu.Lock() + defer p.mu.Unlock() + + instancePath := filepath.Join(p.instancesDir, name+".json") + + inst, err := p.loadInstanceFile(name, instancePath) + if err != nil { + if os.IsNotExist(err) { + return nil, fmt.Errorf("instance %s not found", name) + } + return nil, err + } + + return inst, nil +} + +// Delete removes an instance's persistence file from disk. +func (p *instancePersister) Delete(name string) error { + if !p.enabled { + return nil + } + + if err := p.validateInstanceName(name); err != nil { + return err + } + + p.mu.Lock() + defer p.mu.Unlock() + + instancePath := filepath.Join(p.instancesDir, name+".json") + + if err := os.Remove(instancePath); err != nil { + if os.IsNotExist(err) { + // Not an error if file doesn't exist + return nil + } + return fmt.Errorf("failed to delete instance file for %s: %w", name, err) + } + + return nil +} + +// LoadAll loads all persisted instances from disk. +// Returns a slice of instances and any errors encountered during loading. 
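The Save path above is the classic write-temp, fsync, rename sequence. The same pattern distilled into a standalone helper; one caveat worth noting is that on POSIX systems the rename itself only becomes durable once the parent directory is synced, a step the code above does not currently take and this sketch adds:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// atomicWrite mirrors Save: write to a temp file, fsync it, rename over
// the destination, then sync the parent directory so the rename itself
// survives a crash.
func atomicWrite(path string, data []byte) error {
	tmp := path + ".tmp"
	f, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	if _, err := f.Write(data); err != nil {
		f.Close()
		os.Remove(tmp)
		return err
	}
	if err := f.Sync(); err != nil { // Flush file contents to disk
		f.Close()
		os.Remove(tmp)
		return err
	}
	if err := f.Close(); err != nil {
		os.Remove(tmp)
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		os.Remove(tmp)
		return err
	}
	// Persist the directory entry created by the rename.
	dir, err := os.Open(filepath.Dir(path))
	if err != nil {
		return err
	}
	defer dir.Close()
	return dir.Sync()
}

func main() {
	if err := atomicWrite(filepath.Join(os.TempDir(), "demo.json"), []byte(`{}`)); err != nil {
		fmt.Println("write failed:", err)
	}
}
```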
+func (p *instancePersister) LoadAll() ([]*instance.Instance, error) { + if !p.enabled { + return nil, nil + } + + p.mu.Lock() + defer p.mu.Unlock() + + // Check if instances directory exists + if _, err := os.Stat(p.instancesDir); os.IsNotExist(err) { + return nil, nil // No instances directory, return empty list + } + + // Read all JSON files from instances directory + files, err := os.ReadDir(p.instancesDir) + if err != nil { + return nil, fmt.Errorf("failed to read instances directory: %w", err) + } + + instances := make([]*instance.Instance, 0) + var loadErrors []string + + for _, file := range files { + if file.IsDir() || !strings.HasSuffix(file.Name(), ".json") { + continue + } + + instanceName := strings.TrimSuffix(file.Name(), ".json") + instancePath := filepath.Join(p.instancesDir, file.Name()) + + inst, err := p.loadInstanceFile(instanceName, instancePath) + if err != nil { + log.Printf("Failed to load instance %s: %v", instanceName, err) + loadErrors = append(loadErrors, fmt.Sprintf("%s: %v", instanceName, err)) + continue + } + + instances = append(instances, inst) + } + + if len(loadErrors) > 0 { + log.Printf("Loaded %d instances with %d errors", len(instances), len(loadErrors)) + } else if len(instances) > 0 { + log.Printf("Loaded %d instances from persistence", len(instances)) + } + + return instances, nil +} + +// loadInstanceFile is an internal helper that loads a single instance file. +// Note: This assumes the mutex is already held by the caller. +func (p *instancePersister) loadInstanceFile(name, path string) (*instance.Instance, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read instance file: %w", err) + } + + var inst instance.Instance + if err := json.Unmarshal(data, &inst); err != nil { + return nil, fmt.Errorf("failed to unmarshal instance: %w", err) + } + + // Validate the instance name matches the filename + if inst.Name != name { + return nil, fmt.Errorf("instance name mismatch: file=%s, instance.Name=%s", name, inst.Name) + } + + return &inst, nil +} + +// validateInstanceName ensures the instance name is safe for filesystem operations. +func (p *instancePersister) validateInstanceName(name string) error { + if name == "" { + return fmt.Errorf("instance name cannot be empty") + } + + cleaned := filepath.Clean(name) + + // After cleaning, name should not contain any path separators + if cleaned != name || strings.Contains(cleaned, string(filepath.Separator)) { + return fmt.Errorf("invalid instance name: %s", name) + } + + return nil +} diff --git a/pkg/manager/ports.go b/pkg/manager/ports.go new file mode 100644 index 0000000..a77ff0e --- /dev/null +++ b/pkg/manager/ports.go @@ -0,0 +1,184 @@ +package manager + +import ( + "fmt" + "math/bits" + "sync" +) + +// portAllocator provides efficient port allocation using a bitmap for O(1) operations. +// The bitmap approach prevents unbounded memory growth and simplifies port management. +type portAllocator struct { + mu sync.Mutex + + // Bitmap for O(1) allocation/release + // Each bit represents a port (1 = allocated, 0 = free) + bitmap []uint64 // Each uint64 covers 64 ports + + // Map port to instance name for cleanup operations + allocated map[int]string + + minPort int + maxPort int + rangeSize int +} + +// NewPortAllocator creates a new port allocator for the given port range. +// Returns an error if the port range is invalid. 
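To make the memory claim above concrete: a typical range such as 8000-9999 costs 32 uint64 words (256 bytes) up front, independent of how many ports are ever allocated, whereas the previous map[int]bool grew with each allocation:

```go
package main

import "fmt"

func main() {
	minPort, maxPort := 8000, 9999
	rangeSize := maxPort - minPort + 1
	words := (rangeSize + 63) / 64 // Round up to whole uint64 words
	fmt.Printf("%d ports -> %d words -> %d bytes\n", rangeSize, words, words*8)
	// Output: 2000 ports -> 32 words -> 256 bytes
}
```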
+func NewPortAllocator(minPort, maxPort int) (*portAllocator, error) { + if minPort <= 0 || maxPort <= 0 { + return nil, fmt.Errorf("invalid port range: min=%d, max=%d (must be > 0)", minPort, maxPort) + } + if minPort > maxPort { + return nil, fmt.Errorf("invalid port range: min=%d > max=%d", minPort, maxPort) + } + + rangeSize := maxPort - minPort + 1 + bitmapSize := (rangeSize + 63) / 64 // Round up to nearest uint64 + + return &portAllocator{ + bitmap: make([]uint64, bitmapSize), + allocated: make(map[int]string), + minPort: minPort, + maxPort: maxPort, + rangeSize: rangeSize, + }, nil +} + +// Allocate finds and allocates the first available port for the given instance. +// Returns the allocated port or an error if no ports are available. +func (p *portAllocator) Allocate(instanceName string) (int, error) { + if instanceName == "" { + return 0, fmt.Errorf("instance name cannot be empty") + } + + p.mu.Lock() + defer p.mu.Unlock() + + port, err := p.findFirstFreeBit() + if err != nil { + return 0, err + } + + p.setBit(port) + p.allocated[port] = instanceName + + return port, nil +} + +// AllocateSpecific allocates a specific port for the given instance. +// Returns an error if the port is already allocated or out of range. +func (p *portAllocator) AllocateSpecific(port int, instanceName string) error { + if instanceName == "" { + return fmt.Errorf("instance name cannot be empty") + } + if port < p.minPort || port > p.maxPort { + return fmt.Errorf("port %d is out of range [%d-%d]", port, p.minPort, p.maxPort) + } + + p.mu.Lock() + defer p.mu.Unlock() + + if p.isBitSet(port) { + return fmt.Errorf("port %d is already allocated", port) + } + + p.setBit(port) + p.allocated[port] = instanceName + + return nil +} + +// Release releases a specific port, making it available for reuse. +// Returns an error if the port is not allocated. +func (p *portAllocator) Release(port int) error { + if port < p.minPort || port > p.maxPort { + return fmt.Errorf("port %d is out of range [%d-%d]", port, p.minPort, p.maxPort) + } + + p.mu.Lock() + defer p.mu.Unlock() + + if !p.isBitSet(port) { + return fmt.Errorf("port %d is not allocated", port) + } + + p.clearBit(port) + delete(p.allocated, port) + + return nil +} + +// ReleaseByInstance releases all ports allocated to the given instance. +// This is useful for cleanup when deleting or updating an instance. +// Returns the number of ports released. +func (p *portAllocator) ReleaseByInstance(instanceName string) int { + if instanceName == "" { + return 0 + } + + p.mu.Lock() + defer p.mu.Unlock() + + portsToRelease := make([]int, 0) + for port, name := range p.allocated { + if name == instanceName { + portsToRelease = append(portsToRelease, port) + } + } + + for _, port := range portsToRelease { + p.clearBit(port) + delete(p.allocated, port) + } + + return len(portsToRelease) +} + +// --- Internal bitmap operations --- + +// portToBitPos converts a port number to bitmap array index and bit position. +func (p *portAllocator) portToBitPos(port int) (index int, bit uint) { + offset := port - p.minPort + index = offset / 64 + bit = uint(offset % 64) + return +} + +// setBit marks a port as allocated in the bitmap. +func (p *portAllocator) setBit(port int) { + index, bit := p.portToBitPos(port) + p.bitmap[index] |= (1 << bit) +} + +// clearBit marks a port as free in the bitmap. +func (p *portAllocator) clearBit(port int) { + index, bit := p.portToBitPos(port) + p.bitmap[index] &^= (1 << bit) +} + +// isBitSet checks if a port is allocated in the bitmap. 
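A quick tour of the allocation API above, written as a hypothetical in-package test (not part of this patch):

```go
package manager

import "testing"

// Hypothetical in-package test touring the port allocator API.
func TestPortAllocatorSketch(t *testing.T) {
	alloc, err := NewPortAllocator(8000, 8003) // four ports
	if err != nil {
		t.Fatal(err)
	}

	p1, _ := alloc.Allocate("inst-a") // first free bit: 8000
	p2, _ := alloc.Allocate("inst-a") // next free bit: 8001
	if p1 != 8000 || p2 != 8001 {
		t.Fatalf("unexpected ports %d, %d", p1, p2)
	}

	if err := alloc.AllocateSpecific(8003, "inst-b"); err != nil {
		t.Fatal(err)
	}
	if err := alloc.AllocateSpecific(8003, "inst-c"); err == nil {
		t.Fatal("double allocation should fail")
	}

	// Release everything inst-a holds in one call.
	if n := alloc.ReleaseByInstance("inst-a"); n != 2 {
		t.Fatalf("expected 2 ports released, got %d", n)
	}
}
```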
+func (p *portAllocator) isBitSet(port int) bool { + index, bit := p.portToBitPos(port) + return (p.bitmap[index] & (1 << bit)) != 0 +} + +// findFirstFreeBit scans the bitmap to find the first unallocated port. +// Returns the port number or an error if no ports are available. +func (p *portAllocator) findFirstFreeBit() (int, error) { + for i, word := range p.bitmap { + if word != ^uint64(0) { // Not all bits are set (some ports are free) + // Find the first 0 bit in this word + // XOR with all 1s to flip bits, then find first 1 (which was 0) + bit := bits.TrailingZeros64(^word) + port := p.minPort + (i * 64) + bit + + // Ensure we don't go beyond maxPort due to bitmap rounding + if port <= p.maxPort { + return port, nil + } + } + } + + return 0, fmt.Errorf("no available ports in range [%d-%d]", p.minPort, p.maxPort) +} diff --git a/pkg/manager/registry.go b/pkg/manager/registry.go new file mode 100644 index 0000000..41d801b --- /dev/null +++ b/pkg/manager/registry.go @@ -0,0 +1,131 @@ +package manager + +import ( + "fmt" + "llamactl/pkg/instance" + "sync" +) + +// instanceRegistry provides thread-safe storage and lookup of instances +// with running state tracking using lock-free sync.Map for status checks. +type instanceRegistry struct { + mu sync.RWMutex + instances map[string]*instance.Instance + running sync.Map // map[string]struct{} - lock-free for status checks +} + +// NewInstanceRegistry creates a new instance registry. +func NewInstanceRegistry() *instanceRegistry { + return &instanceRegistry{ + instances: make(map[string]*instance.Instance), + } +} + +// Get retrieves an instance by name. +// Returns the instance and true if found, nil and false otherwise. +func (r *instanceRegistry) Get(name string) (*instance.Instance, bool) { + r.mu.RLock() + defer r.mu.RUnlock() + + inst, exists := r.instances[name] + return inst, exists +} + +// List returns a snapshot copy of all instances to prevent external mutation. +func (r *instanceRegistry) List() []*instance.Instance { + r.mu.RLock() + defer r.mu.RUnlock() + + result := make([]*instance.Instance, 0, len(r.instances)) + for _, inst := range r.instances { + result = append(result, inst) + } + return result +} + +// ListRunning returns a snapshot of all currently running instances. +func (r *instanceRegistry) ListRunning() []*instance.Instance { + r.mu.RLock() + defer r.mu.RUnlock() + + result := make([]*instance.Instance, 0) + for name, inst := range r.instances { + if _, isRunning := r.running.Load(name); isRunning { + result = append(result, inst) + } + } + return result +} + +// Add adds a new instance to the registry. +// Returns an error if an instance with the same name already exists. +func (r *instanceRegistry) Add(inst *instance.Instance) error { + if inst == nil { + return fmt.Errorf("cannot add nil instance") + } + + r.mu.Lock() + defer r.mu.Unlock() + + if _, exists := r.instances[inst.Name]; exists { + return fmt.Errorf("instance %s already exists", inst.Name) + } + + r.instances[inst.Name] = inst + + // Initialize running state if the instance is running + if inst.IsRunning() { + r.running.Store(inst.Name, struct{}{}) + } + + return nil +} + +// Remove removes an instance from the registry. +// Returns an error if the instance doesn't exist. 
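+// Removing an instance also clears its entry in the lock-free running map.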
+func (r *instanceRegistry) Remove(name string) error { + r.mu.Lock() + defer r.mu.Unlock() + + if _, exists := r.instances[name]; !exists { + return fmt.Errorf("instance %s not found", name) + } + + delete(r.instances, name) + r.running.Delete(name) + + return nil +} + +// MarkRunning marks an instance as running using lock-free sync.Map. +func (r *instanceRegistry) MarkRunning(name string) { + r.running.Store(name, struct{}{}) +} + +// MarkStopped marks an instance as stopped using lock-free sync.Map. +func (r *instanceRegistry) MarkStopped(name string) { + r.running.Delete(name) +} + +// IsRunning checks if an instance is running using lock-free sync.Map. +func (r *instanceRegistry) IsRunning(name string) bool { + _, isRunning := r.running.Load(name) + return isRunning +} + +// Count returns the total number of instances in the registry. +func (r *instanceRegistry) Count() int { + r.mu.RLock() + defer r.mu.RUnlock() + return len(r.instances) +} + +// CountRunning returns the number of currently running instances. +func (r *instanceRegistry) CountRunning() int { + count := 0 + r.running.Range(func(key, value any) bool { + count++ + return true + }) + return count +} diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go new file mode 100644 index 0000000..9cb147c --- /dev/null +++ b/pkg/manager/remote.go @@ -0,0 +1,283 @@ +package manager + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "llamactl/pkg/config" + "llamactl/pkg/instance" + "net/http" + "sync" + "time" +) + +// remoteManager handles HTTP operations for remote instances. +type remoteManager struct { + mu sync.RWMutex + client *http.Client + nodeMap map[string]*config.NodeConfig // node name -> node config + instanceToNode map[string]*config.NodeConfig // instance name -> node config +} + +// NewRemoteManager creates a new remote manager. +func NewRemoteManager(nodes map[string]config.NodeConfig, timeout time.Duration) *remoteManager { + if timeout <= 0 { + timeout = 30 * time.Second + } + + // Build node config map + nodeMap := make(map[string]*config.NodeConfig) + for name := range nodes { + nodeCopy := nodes[name] + nodeMap[name] = &nodeCopy + } + + return &remoteManager{ + client: &http.Client{ + Timeout: timeout, + }, + nodeMap: nodeMap, + instanceToNode: make(map[string]*config.NodeConfig), + } +} + +// GetNodeForInstance returns the node configuration for a given instance. +// Returns nil if the instance is not mapped to any node. +func (rm *remoteManager) GetNodeForInstance(instanceName string) (*config.NodeConfig, bool) { + rm.mu.RLock() + defer rm.mu.RUnlock() + + node, exists := rm.instanceToNode[instanceName] + return node, exists +} + +// SetInstanceNode maps an instance to a specific node. +// Returns an error if the node doesn't exist. +func (rm *remoteManager) SetInstanceNode(instanceName, nodeName string) error { + rm.mu.Lock() + defer rm.mu.Unlock() + + node, exists := rm.nodeMap[nodeName] + if !exists { + return fmt.Errorf("node %s not found", nodeName) + } + + rm.instanceToNode[instanceName] = node + return nil +} + +// RemoveInstance removes the instance-to-node mapping. +func (rm *remoteManager) RemoveInstance(instanceName string) { + rm.mu.Lock() + defer rm.mu.Unlock() + + delete(rm.instanceToNode, instanceName) +} + +// --- HTTP request helpers --- + +// makeRemoteRequest creates and executes an HTTP request to a remote node with context support. 
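+//
+// Callers bound each request with a context; a sketch (the timeout value here is illustrative):
+//
+//	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+//	defer cancel()
+//	resp, err := rm.makeRemoteRequest(ctx, node, "GET", "/api/v1/instances/", nil)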
+func (rm *remoteManager) makeRemoteRequest(ctx context.Context, nodeConfig *config.NodeConfig, method, path string, body any) (*http.Response, error) { + var reqBody io.Reader + if body != nil { + jsonData, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("failed to marshal request body: %w", err) + } + reqBody = bytes.NewBuffer(jsonData) + } + + url := fmt.Sprintf("%s%s", nodeConfig.Address, path) + req, err := http.NewRequestWithContext(ctx, method, url, reqBody) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + + if nodeConfig.APIKey != "" { + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", nodeConfig.APIKey)) + } + + resp, err := rm.client.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to execute request: %w", err) + } + + return resp, nil +} + +// parseRemoteResponse parses an HTTP response and unmarshals the result. +func parseRemoteResponse(resp *http.Response, result any) error { + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body)) + } + + if result != nil { + if err := json.Unmarshal(body, result); err != nil { + return fmt.Errorf("failed to unmarshal response: %w", err) + } + } + + return nil +} + +// --- Remote CRUD operations --- + +// ListInstances lists all instances on a remote node. +func (rm *remoteManager) ListInstances(ctx context.Context, node *config.NodeConfig) ([]*instance.Instance, error) { + resp, err := rm.makeRemoteRequest(ctx, node, "GET", "/api/v1/instances/", nil) + if err != nil { + return nil, err + } + + var instances []*instance.Instance + if err := parseRemoteResponse(resp, &instances); err != nil { + return nil, err + } + + return instances, nil +} + +// CreateInstance creates a new instance on a remote node. +func (rm *remoteManager) CreateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { + path := fmt.Sprintf("/api/v1/instances/%s/", name) + + resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, opts) + if err != nil { + return nil, err + } + + var inst instance.Instance + if err := parseRemoteResponse(resp, &inst); err != nil { + return nil, err + } + + return &inst, nil +} + +// GetInstance retrieves an instance by name from a remote node. +func (rm *remoteManager) GetInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { + path := fmt.Sprintf("/api/v1/instances/%s/", name) + resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) + if err != nil { + return nil, err + } + + var inst instance.Instance + if err := parseRemoteResponse(resp, &inst); err != nil { + return nil, err + } + + return &inst, nil +} + +// UpdateInstance updates an existing instance on a remote node. 
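+// The remote node responds with the updated instance state, which is parsed
+// and returned to the caller.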
+func (rm *remoteManager) UpdateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { + path := fmt.Sprintf("/api/v1/instances/%s/", name) + + resp, err := rm.makeRemoteRequest(ctx, node, "PUT", path, opts) + if err != nil { + return nil, err + } + + var inst instance.Instance + if err := parseRemoteResponse(resp, &inst); err != nil { + return nil, err + } + + return &inst, nil +} + +// DeleteInstance deletes an instance from a remote node. +func (rm *remoteManager) DeleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { + path := fmt.Sprintf("/api/v1/instances/%s/", name) + resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil) + if err != nil { + return err + } + + return parseRemoteResponse(resp, nil) +} + +// StartInstance starts an instance on a remote node. +func (rm *remoteManager) StartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { + path := fmt.Sprintf("/api/v1/instances/%s/start", name) + resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) + if err != nil { + return nil, err + } + + var inst instance.Instance + if err := parseRemoteResponse(resp, &inst); err != nil { + return nil, err + } + + return &inst, nil +} + +// StopInstance stops an instance on a remote node. +func (rm *remoteManager) StopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { + path := fmt.Sprintf("/api/v1/instances/%s/stop", name) + resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) + if err != nil { + return nil, err + } + + var inst instance.Instance + if err := parseRemoteResponse(resp, &inst); err != nil { + return nil, err + } + + return &inst, nil +} + +// RestartInstance restarts an instance on a remote node. +func (rm *remoteManager) RestartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { + path := fmt.Sprintf("/api/v1/instances/%s/restart", name) + resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) + if err != nil { + return nil, err + } + + var inst instance.Instance + if err := parseRemoteResponse(resp, &inst); err != nil { + return nil, err + } + + return &inst, nil +} + +// GetInstanceLogs retrieves logs for an instance from a remote node. 
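+// Unlike the JSON endpoints above, the body is returned verbatim because the
+// logs endpoint serves plain text rather than a JSON-encoded instance.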
+func (rm *remoteManager) GetInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) { + path := fmt.Sprintf("/api/v1/instances/%s/logs?lines=%d", name, numLines) + resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) + if err != nil { + return "", err + } + + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Logs endpoint returns plain text (Content-Type: text/plain) + return string(body), nil +} diff --git a/pkg/manager/remote_ops.go b/pkg/manager/remote_ops.go deleted file mode 100644 index 143c317..0000000 --- a/pkg/manager/remote_ops.go +++ /dev/null @@ -1,222 +0,0 @@ -package manager - -import ( - "bytes" - "encoding/json" - "fmt" - "io" - "llamactl/pkg/config" - "llamactl/pkg/instance" - "net/http" -) - -// makeRemoteRequest is a helper function to make HTTP requests to a remote node -func (im *instanceManager) makeRemoteRequest(nodeConfig *config.NodeConfig, method, path string, body any) (*http.Response, error) { - var reqBody io.Reader - if body != nil { - jsonData, err := json.Marshal(body) - if err != nil { - return nil, fmt.Errorf("failed to marshal request body: %w", err) - } - reqBody = bytes.NewBuffer(jsonData) - } - - url := fmt.Sprintf("%s%s", nodeConfig.Address, path) - req, err := http.NewRequest(method, url, reqBody) - if err != nil { - return nil, fmt.Errorf("failed to create request: %w", err) - } - - if body != nil { - req.Header.Set("Content-Type", "application/json") - } - - if nodeConfig.APIKey != "" { - req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", nodeConfig.APIKey)) - } - - resp, err := im.httpClient.Do(req) - if err != nil { - return nil, fmt.Errorf("failed to execute request: %w", err) - } - - return resp, nil -} - -// parseRemoteResponse is a helper function to parse API responses -func parseRemoteResponse(resp *http.Response, result any) error { - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return fmt.Errorf("failed to read response body: %w", err) - } - - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body)) - } - - if result != nil { - if err := json.Unmarshal(body, result); err != nil { - return fmt.Errorf("failed to unmarshal response: %w", err) - } - } - - return nil -} - -// ListRemoteInstances lists all instances on the remote node -func (im *instanceManager) ListRemoteInstances(nodeConfig *config.NodeConfig) ([]*instance.Instance, error) { - resp, err := im.makeRemoteRequest(nodeConfig, "GET", "/api/v1/instances/", nil) - if err != nil { - return nil, err - } - - var instances []*instance.Instance - if err := parseRemoteResponse(resp, &instances); err != nil { - return nil, err - } - - return instances, nil -} - -// CreateRemoteInstance creates a new instance on the remote node -func (im *instanceManager) CreateRemoteInstance(nodeConfig *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/", name) - - resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, options) - if err != nil { - return nil, err - } - - var inst instance.Instance - if err := parseRemoteResponse(resp, &inst); err != nil { - return nil, err - 
} - - return &inst, nil -} - -// GetRemoteInstance retrieves an instance by name from the remote node -func (im *instanceManager) GetRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/", name) - resp, err := im.makeRemoteRequest(nodeConfig, "GET", path, nil) - if err != nil { - return nil, err - } - - var inst instance.Instance - if err := parseRemoteResponse(resp, &inst); err != nil { - return nil, err - } - - return &inst, nil -} - -// UpdateRemoteInstance updates an existing instance on the remote node -func (im *instanceManager) UpdateRemoteInstance(nodeConfig *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/", name) - - resp, err := im.makeRemoteRequest(nodeConfig, "PUT", path, options) - if err != nil { - return nil, err - } - - var inst instance.Instance - if err := parseRemoteResponse(resp, &inst); err != nil { - return nil, err - } - - return &inst, nil -} - -// DeleteRemoteInstance deletes an instance from the remote node -func (im *instanceManager) DeleteRemoteInstance(nodeConfig *config.NodeConfig, name string) error { - path := fmt.Sprintf("/api/v1/instances/%s/", name) - resp, err := im.makeRemoteRequest(nodeConfig, "DELETE", path, nil) - if err != nil { - return err - } - - return parseRemoteResponse(resp, nil) -} - -// StartRemoteInstance starts an instance on the remote node -func (im *instanceManager) StartRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/start", name) - resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, nil) - if err != nil { - return nil, err - } - - var inst instance.Instance - if err := parseRemoteResponse(resp, &inst); err != nil { - return nil, err - } - - return &inst, nil -} - -// StopRemoteInstance stops an instance on the remote node -func (im *instanceManager) StopRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/stop", name) - resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, nil) - if err != nil { - return nil, err - } - - var inst instance.Instance - if err := parseRemoteResponse(resp, &inst); err != nil { - return nil, err - } - - return &inst, nil -} - -// RestartRemoteInstance restarts an instance on the remote node -func (im *instanceManager) RestartRemoteInstance(nodeConfig *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/restart", name) - resp, err := im.makeRemoteRequest(nodeConfig, "POST", path, nil) - if err != nil { - return nil, err - } - - var inst instance.Instance - if err := parseRemoteResponse(resp, &inst); err != nil { - return nil, err - } - - return &inst, nil -} - -// GetRemoteInstanceLogs retrieves logs for an instance from the remote node -func (im *instanceManager) GetRemoteInstanceLogs(nodeConfig *config.NodeConfig, name string, numLines int) (string, error) { - path := fmt.Sprintf("/api/v1/instances/%s/logs?lines=%d", name, numLines) - resp, err := im.makeRemoteRequest(nodeConfig, "GET", path, nil) - if err != nil { - return "", err - } - - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return "", fmt.Errorf("failed to read response body: %w", err) - } - - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return "", fmt.Errorf("API request failed with status %d: %s", 
resp.StatusCode, string(body)) - } - - // Logs endpoint might return plain text or JSON - // Try to parse as JSON first (in case it's wrapped in a response object) - var logResponse struct { - Logs string `json:"logs"` - } - if err := json.Unmarshal(body, &logResponse); err == nil && logResponse.Logs != "" { - return logResponse.Logs, nil - } - - // Otherwise, return as plain text - return string(body), nil -} diff --git a/pkg/manager/timeout.go b/pkg/manager/timeout.go deleted file mode 100644 index 2e0314a..0000000 --- a/pkg/manager/timeout.go +++ /dev/null @@ -1,74 +0,0 @@ -package manager - -import ( - "fmt" - "llamactl/pkg/instance" - "log" -) - -func (im *instanceManager) checkAllTimeouts() { - im.mu.RLock() - var timeoutInstances []string - - // Identify instances that should timeout - for _, inst := range im.instances { - // Skip remote instances - they are managed by their respective nodes - if inst.IsRemote() { - continue - } - - if inst.ShouldTimeout() { - timeoutInstances = append(timeoutInstances, inst.Name) - } - } - im.mu.RUnlock() // Release read lock before calling StopInstance - - // Stop the timed-out instances - for _, name := range timeoutInstances { - log.Printf("Instance %s has timed out, stopping it", name) - if _, err := im.StopInstance(name); err != nil { - log.Printf("Error stopping instance %s: %v", name, err) - } else { - log.Printf("Instance %s stopped successfully", name) - } - } -} - -// EvictLRUInstance finds and stops the least recently used running instance. -func (im *instanceManager) EvictLRUInstance() error { - im.mu.RLock() - var lruInstance *instance.Instance - - for name := range im.runningInstances { - inst := im.instances[name] - if inst == nil { - continue - } - - // Skip remote instances - they are managed by their respective nodes - if inst.IsRemote() { - continue - } - - if inst.GetOptions() != nil && inst.GetOptions().IdleTimeout != nil && *inst.GetOptions().IdleTimeout <= 0 { - continue // Skip instances without idle timeout - } - - if lruInstance == nil { - lruInstance = inst - } - - if inst.LastRequestTime() < lruInstance.LastRequestTime() { - lruInstance = inst - } - } - im.mu.RUnlock() - - if lruInstance == nil { - return fmt.Errorf("failed to find lru instance") - } - - // Evict Instance - _, err := im.StopInstance(lruInstance.Name) - return err -} diff --git a/pkg/manager/timeout_test.go b/pkg/manager/timeout_test.go index d1c3a47..e05c400 100644 --- a/pkg/manager/timeout_test.go +++ b/pkg/manager/timeout_test.go @@ -22,7 +22,7 @@ func TestTimeoutFunctionality(t *testing.T) { MaxInstances: 5, } - manager := manager.NewInstanceManager(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + manager := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") if manager == nil { t.Fatal("Manager should be initialized with timeout checker") } From c537bc48b88a60118117ebc4dc34b5256d3de822 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Mon, 20 Oct 2025 22:00:06 +0200 Subject: [PATCH 02/22] Refactor API path handling in remoteManager to use a constant for base path --- pkg/manager/remote.go | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go index 9cb147c..9f9288a 100644 --- a/pkg/manager/remote.go +++ b/pkg/manager/remote.go @@ -13,6 +13,8 @@ import ( "time" ) +const apiBasePath = "/api/v1/instances/" + // remoteManager handles HTTP operations for remote instances. 
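+// Request paths are composed from apiBasePath, e.g.
+// fmt.Sprintf("%s%s/start", apiBasePath, name) yields "/api/v1/instances/<name>/start".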
type remoteManager struct { mu sync.RWMutex @@ -137,7 +139,7 @@ func parseRemoteResponse(resp *http.Response, result any) error { // ListInstances lists all instances on a remote node. func (rm *remoteManager) ListInstances(ctx context.Context, node *config.NodeConfig) ([]*instance.Instance, error) { - resp, err := rm.makeRemoteRequest(ctx, node, "GET", "/api/v1/instances/", nil) + resp, err := rm.makeRemoteRequest(ctx, node, "GET", apiBasePath, nil) if err != nil { return nil, err } @@ -152,7 +154,7 @@ func (rm *remoteManager) ListInstances(ctx context.Context, node *config.NodeCon // CreateInstance creates a new instance on a remote node. func (rm *remoteManager) CreateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/", name) + path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, opts) if err != nil { @@ -169,7 +171,7 @@ func (rm *remoteManager) CreateInstance(ctx context.Context, node *config.NodeCo // GetInstance retrieves an instance by name from a remote node. func (rm *remoteManager) GetInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/", name) + path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) if err != nil { return nil, err @@ -185,7 +187,7 @@ func (rm *remoteManager) GetInstance(ctx context.Context, node *config.NodeConfi // UpdateInstance updates an existing instance on a remote node. func (rm *remoteManager) UpdateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/", name) + path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "PUT", path, opts) if err != nil { @@ -202,7 +204,7 @@ func (rm *remoteManager) UpdateInstance(ctx context.Context, node *config.NodeCo // DeleteInstance deletes an instance from a remote node. func (rm *remoteManager) DeleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { - path := fmt.Sprintf("/api/v1/instances/%s/", name) + path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil) if err != nil { return err @@ -213,7 +215,7 @@ func (rm *remoteManager) DeleteInstance(ctx context.Context, node *config.NodeCo // StartInstance starts an instance on a remote node. func (rm *remoteManager) StartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/start", name) + path := fmt.Sprintf("%s%s/start", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -229,7 +231,7 @@ func (rm *remoteManager) StartInstance(ctx context.Context, node *config.NodeCon // StopInstance stops an instance on a remote node. 
func (rm *remoteManager) StopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/stop", name) + path := fmt.Sprintf("%s%s/stop", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -245,7 +247,7 @@ func (rm *remoteManager) StopInstance(ctx context.Context, node *config.NodeConf // RestartInstance restarts an instance on a remote node. func (rm *remoteManager) RestartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("/api/v1/instances/%s/restart", name) + path := fmt.Sprintf("%s%s/restart", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -261,7 +263,7 @@ func (rm *remoteManager) RestartInstance(ctx context.Context, node *config.NodeC // GetInstanceLogs retrieves logs for an instance from a remote node. func (rm *remoteManager) GetInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) { - path := fmt.Sprintf("/api/v1/instances/%s/logs?lines=%d", name, numLines) + path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, name, numLines) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) if err != nil { return "", err From 1ae28a0b09e65ebafc1733fd5b467cb1f931ec72 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Mon, 20 Oct 2025 22:22:09 +0200 Subject: [PATCH 03/22] Unexport member struct methods --- pkg/manager/lifecycle.go | 12 +++--- pkg/manager/manager.go | 32 +++++++-------- pkg/manager/operations.go | 84 +++++++++++++++++++------------------- pkg/manager/persistence.go | 8 ++-- pkg/manager/ports.go | 16 ++++---- pkg/manager/registry.go | 20 ++++----- pkg/manager/remote.go | 24 +++++------ 7 files changed, 98 insertions(+), 98 deletions(-) diff --git a/pkg/manager/lifecycle.go b/pkg/manager/lifecycle.go index 4045022..73cb073 100644 --- a/pkg/manager/lifecycle.go +++ b/pkg/manager/lifecycle.go @@ -46,13 +46,13 @@ func NewLifecycleManager( } // Start begins the timeout checking loop in a goroutine. -func (l *lifecycleManager) Start() { +func (l *lifecycleManager) start() { go l.timeoutCheckLoop() } // Stop gracefully stops the lifecycle manager. // This ensures the timeout checker completes before instance cleanup begins. -func (l *lifecycleManager) Stop() { +func (l *lifecycleManager) stop() { l.shutdownOnce.Do(func() { close(l.shutdownChan) <-l.shutdownDone // Wait for checker to finish (prevents shutdown race) @@ -77,7 +77,7 @@ func (l *lifecycleManager) timeoutCheckLoop() { // checkTimeouts checks all instances for timeout and stops those that have timed out. func (l *lifecycleManager) checkTimeouts() { // Get all instances from registry - instances := l.registry.List() + instances := l.registry.list() var timeoutInstances []string @@ -89,7 +89,7 @@ func (l *lifecycleManager) checkTimeouts() { } // Only check running instances - if !l.registry.IsRunning(inst.Name) { + if !l.registry.isRunning(inst.Name) { continue } @@ -111,13 +111,13 @@ func (l *lifecycleManager) checkTimeouts() { // EvictLRU finds and stops the least recently used running instance. // This is called when max running instances limit is reached. 
-func (l *lifecycleManager) EvictLRU() error { +func (l *lifecycleManager) evictLRU() error { if !l.enableLRU { return fmt.Errorf("LRU eviction is not enabled") } // Get all running instances - runningInstances := l.registry.ListRunning() + runningInstances := l.registry.listRunning() var lruInstance *instance.Instance diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 5863f46..98aac8f 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -102,23 +102,23 @@ func New(backendsConfig config.BackendConfig, instancesConfig config.InstancesCo } // Start the lifecycle manager - im.lifecycle.Start() + im.lifecycle.start() return im } // persistInstance saves an instance using the persistence component func (im *instanceManager) persistInstance(inst *instance.Instance) error { - return im.persistence.Save(inst) + return im.persistence.save(inst) } func (im *instanceManager) Shutdown() { im.shutdownOnce.Do(func() { // 1. Stop lifecycle manager (stops timeout checker) - im.lifecycle.Stop() + im.lifecycle.stop() // 2. Get running instances (no lock needed - registry handles it) - running := im.registry.ListRunning() + running := im.registry.listRunning() // 3. Stop local instances concurrently var wg sync.WaitGroup @@ -143,7 +143,7 @@ func (im *instanceManager) Shutdown() { // loadInstances restores all instances from disk using the persistence component func (im *instanceManager) loadInstances() error { // Load all instances from persistence - instances, err := im.persistence.LoadAll() + instances, err := im.persistence.loadAll() if err != nil { return fmt.Errorf("failed to load instances: %w", err) } @@ -205,21 +205,21 @@ func (im *instanceManager) loadInstance(persistedInst *instance.Instance) error // Handle remote instance mapping if isRemote { // Map instance to node in remote manager - if err := im.remote.SetInstanceNode(name, nodeName); err != nil { + if err := im.remote.setInstanceNode(name, nodeName); err != nil { return fmt.Errorf("failed to set instance node: %w", err) } } else { // Allocate port for local instances if inst.GetPort() > 0 { port := inst.GetPort() - if err := im.ports.AllocateSpecific(port, name); err != nil { + if err := im.ports.allocateSpecific(port, name); err != nil { return fmt.Errorf("port conflict: instance %s wants port %d which is already in use: %w", name, port, err) } } } // Add instance to registry - if err := im.registry.Add(inst); err != nil { + if err := im.registry.add(inst); err != nil { return fmt.Errorf("failed to add instance to registry: %w", err) } @@ -229,7 +229,7 @@ func (im *instanceManager) loadInstance(persistedInst *instance.Instance) error // autoStartInstances starts instances that were running when persisted and have auto-restart enabled // For instances with auto-restart disabled, it sets their status to Stopped func (im *instanceManager) autoStartInstances() { - instances := im.registry.List() + instances := im.registry.list() var instancesToStart []*instance.Instance var instancesToStop []*instance.Instance @@ -251,7 +251,7 @@ func (im *instanceManager) autoStartInstances() { for _, inst := range instancesToStop { log.Printf("Instance %s was running but auto-restart is disabled, setting status to stopped", inst.Name) inst.SetStatus(instance.Stopped) - im.registry.MarkStopped(inst.Name) + im.registry.markStopped(inst.Name) } // Start instances that have auto-restart enabled @@ -259,13 +259,13 @@ func (im *instanceManager) autoStartInstances() { log.Printf("Auto-starting instance %s", inst.Name) // Reset running 
state before starting (since Start() expects stopped instance) inst.SetStatus(instance.Stopped) - im.registry.MarkStopped(inst.Name) + im.registry.markStopped(inst.Name) // Check if this is a remote instance - if node, exists := im.remote.GetNodeForInstance(inst.Name); exists && node != nil { + if node, exists := im.remote.getNodeForInstance(inst.Name); exists && node != nil { // Remote instance - use remote manager with context ctx := context.Background() - if _, err := im.remote.StartInstance(ctx, node, inst.Name); err != nil { + if _, err := im.remote.startInstance(ctx, node, inst.Name); err != nil { log.Printf("Failed to auto-start remote instance %s: %v", inst.Name, err) } } else { @@ -279,9 +279,9 @@ func (im *instanceManager) autoStartInstances() { func (im *instanceManager) onStatusChange(name string, oldStatus, newStatus instance.Status) { if newStatus == instance.Running { - im.registry.MarkRunning(name) + im.registry.markRunning(name) } else { - im.registry.MarkStopped(name) + im.registry.markStopped(name) } } @@ -293,7 +293,7 @@ func (im *instanceManager) getNodeForInstance(inst *instance.Instance) *config.N } // Check if we have a node mapping in remote manager - if nodeConfig, exists := im.remote.GetNodeForInstance(inst.Name); exists { + if nodeConfig, exists := im.remote.getNodeForInstance(inst.Name); exists { return nodeConfig } diff --git a/pkg/manager/operations.go b/pkg/manager/operations.go index 51099ad..a88ec77 100644 --- a/pkg/manager/operations.go +++ b/pkg/manager/operations.go @@ -30,13 +30,13 @@ func (im *instanceManager) updateLocalInstanceFromRemote(localInst *instance.Ins // ListInstances returns a list of all instances managed by the instance manager. // For remote instances, this fetches the live state from remote nodes and updates local stubs. 
func (im *instanceManager) ListInstances() ([]*instance.Instance, error) { - instances := im.registry.List() + instances := im.registry.list() // Update remote instances with live state ctx := context.Background() for _, inst := range instances { if node := im.getNodeForInstance(inst); node != nil { - remoteInst, err := im.remote.GetInstance(ctx, node, inst.Name) + remoteInst, err := im.remote.getInstance(ctx, node, inst.Name) if err != nil { // Log error but continue with stale data // Don't fail the entire list operation due to one remote failure @@ -69,7 +69,7 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options } // Check if instance with this name already exists (must be globally unique) - if _, exists := im.registry.Get(name); exists { + if _, exists := im.registry.get(name); exists { return nil, fmt.Errorf("instance with name %s already exists", name) } @@ -84,16 +84,16 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options // Create the remote instance on the remote node ctx := context.Background() - nodeConfig, exists := im.remote.GetNodeForInstance(nodeName) + nodeConfig, exists := im.remote.getNodeForInstance(nodeName) if !exists { // Try to set the node if it doesn't exist yet - if err := im.remote.SetInstanceNode(name, nodeName); err != nil { + if err := im.remote.setInstanceNode(name, nodeName); err != nil { return nil, fmt.Errorf("node %s not found", nodeName) } - nodeConfig, _ = im.remote.GetNodeForInstance(name) + nodeConfig, _ = im.remote.getNodeForInstance(name) } - remoteInst, err := im.remote.CreateInstance(ctx, nodeConfig, name, options) + remoteInst, err := im.remote.createInstance(ctx, nodeConfig, name, options) if err != nil { return nil, err } @@ -106,19 +106,19 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options im.updateLocalInstanceFromRemote(inst, remoteInst) // Map instance to node - if err := im.remote.SetInstanceNode(name, nodeName); err != nil { + if err := im.remote.setInstanceNode(name, nodeName); err != nil { return nil, fmt.Errorf("failed to map instance to node: %w", err) } // Add to registry (doesn't count towards local limits) - if err := im.registry.Add(inst); err != nil { + if err := im.registry.add(inst); err != nil { return nil, fmt.Errorf("failed to add instance to registry: %w", err) } // Persist the remote instance locally for tracking across restarts if err := im.persistInstance(inst); err != nil { // Rollback: remove from registry - im.registry.Remove(name) + im.registry.remove(name) return nil, fmt.Errorf("failed to persist remote instance %s: %w", name, err) } @@ -127,9 +127,9 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options // Local instance creation // Check max instances limit for local instances only - totalInstances := im.registry.Count() + totalInstances := im.registry.count() remoteCount := 0 - for _, inst := range im.registry.List() { + for _, inst := range im.registry.list() { if inst.IsRemote() { remoteCount++ } @@ -144,14 +144,14 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options var allocatedPort int if currentPort == 0 { // Allocate a port if not specified - allocatedPort, err = im.ports.Allocate(name) + allocatedPort, err = im.ports.allocate(name) if err != nil { return nil, fmt.Errorf("failed to allocate port: %w", err) } im.setPortInOptions(options, allocatedPort) } else { // Use the specified port - if err := im.ports.AllocateSpecific(currentPort, name); err != nil 
{ + if err := im.ports.allocateSpecific(currentPort, name); err != nil { return nil, fmt.Errorf("port %d is already in use: %w", currentPort, err) } allocatedPort = currentPort @@ -164,9 +164,9 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options inst := instance.New(name, &im.backendsConfig, &im.instancesConfig, options, im.localNodeName, statusCallback) // Add to registry - if err := im.registry.Add(inst); err != nil { + if err := im.registry.add(inst); err != nil { // Rollback: release port - im.ports.Release(allocatedPort) + im.ports.release(allocatedPort) return nil, fmt.Errorf("failed to add instance to registry: %w", err) } @@ -181,7 +181,7 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options // GetInstance retrieves an instance by its name. // For remote instances, this fetches the live state from the remote node and updates the local stub. func (im *instanceManager) GetInstance(name string) (*instance.Instance, error) { - inst, exists := im.registry.Get(name) + inst, exists := im.registry.get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } @@ -189,7 +189,7 @@ func (im *instanceManager) GetInstance(name string) (*instance.Instance, error) // Check if instance is remote and fetch live state if node := im.getNodeForInstance(inst); node != nil { ctx := context.Background() - remoteInst, err := im.remote.GetInstance(ctx, node, name) + remoteInst, err := im.remote.getInstance(ctx, node, name) if err != nil { return nil, err } @@ -207,7 +207,7 @@ func (im *instanceManager) GetInstance(name string) (*instance.Instance, error) // UpdateInstance updates the options of an existing instance and returns it. // If the instance is running, it will be restarted to apply the new options. 
func (im *instanceManager) UpdateInstance(name string, options *instance.Options) (*instance.Instance, error) { - inst, exists := im.registry.Get(name) + inst, exists := im.registry.get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } @@ -215,7 +215,7 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { ctx := context.Background() - remoteInst, err := im.remote.UpdateInstance(ctx, node, name, options) + remoteInst, err := im.remote.updateInstance(ctx, node, name, options) if err != nil { return nil, err } @@ -253,14 +253,14 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options // Port is changing - need to release old and allocate new if newPort == 0 { // Auto-allocate new port - allocatedPort, err = im.ports.Allocate(name) + allocatedPort, err = im.ports.allocate(name) if err != nil { return nil, fmt.Errorf("failed to allocate new port: %w", err) } im.setPortInOptions(options, allocatedPort) } else { // Use specified port - if err := im.ports.AllocateSpecific(newPort, name); err != nil { + if err := im.ports.allocateSpecific(newPort, name); err != nil { return nil, fmt.Errorf("failed to allocate port %d: %w", newPort, err) } allocatedPort = newPort @@ -268,9 +268,9 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options // Release old port if oldPort > 0 { - if err := im.ports.Release(oldPort); err != nil { + if err := im.ports.release(oldPort); err != nil { // Rollback new port allocation - im.ports.Release(allocatedPort) + im.ports.release(allocatedPort) return nil, fmt.Errorf("failed to release old port %d: %w", oldPort, err) } } @@ -305,7 +305,7 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options // DeleteInstance removes stopped instance by its name. 
func (im *instanceManager) DeleteInstance(name string) error { - inst, exists := im.registry.Get(name) + inst, exists := im.registry.get(name) if !exists { return fmt.Errorf("instance with name %s not found", name) } @@ -313,17 +313,17 @@ func (im *instanceManager) DeleteInstance(name string) error { // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { ctx := context.Background() - err := im.remote.DeleteInstance(ctx, node, name) + err := im.remote.deleteInstance(ctx, node, name) if err != nil { return err } // Clean up local tracking - im.remote.RemoveInstance(name) - im.registry.Remove(name) + im.remote.removeInstance(name) + im.registry.remove(name) // Delete the instance's persistence file - if err := im.persistence.Delete(name); err != nil { + if err := im.persistence.delete(name); err != nil { return fmt.Errorf("failed to delete config file for remote instance %s: %w", name, err) } @@ -339,15 +339,15 @@ func (im *instanceManager) DeleteInstance(name string) error { } // Release port (use ReleaseByInstance for proper cleanup) - im.ports.ReleaseByInstance(name) + im.ports.releaseByInstance(name) // Remove from registry - if err := im.registry.Remove(name); err != nil { + if err := im.registry.remove(name); err != nil { return fmt.Errorf("failed to remove instance from registry: %w", err) } // Delete persistence file - if err := im.persistence.Delete(name); err != nil { + if err := im.persistence.delete(name); err != nil { return fmt.Errorf("failed to delete config file for instance %s: %w", name, err) } @@ -357,7 +357,7 @@ func (im *instanceManager) DeleteInstance(name string) error { // StartInstance starts a stopped instance and returns it. // If the instance is already running, it returns an error. func (im *instanceManager) StartInstance(name string) (*instance.Instance, error) { - inst, exists := im.registry.Get(name) + inst, exists := im.registry.get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } @@ -365,7 +365,7 @@ func (im *instanceManager) StartInstance(name string) (*instance.Instance, error // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { ctx := context.Background() - remoteInst, err := im.remote.StartInstance(ctx, node, name) + remoteInst, err := im.remote.startInstance(ctx, node, name) if err != nil { return nil, err } @@ -408,7 +408,7 @@ func (im *instanceManager) IsMaxRunningInstancesReached() bool { // Count only local running instances (each node has its own limits) localRunningCount := 0 - for _, inst := range im.registry.ListRunning() { + for _, inst := range im.registry.listRunning() { if !inst.IsRemote() { localRunningCount++ } @@ -419,7 +419,7 @@ func (im *instanceManager) IsMaxRunningInstancesReached() bool { // StopInstance stops a running instance and returns it. 
func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) { - inst, exists := im.registry.Get(name) + inst, exists := im.registry.get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } @@ -427,7 +427,7 @@ func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { ctx := context.Background() - remoteInst, err := im.remote.StopInstance(ctx, node, name) + remoteInst, err := im.remote.stopInstance(ctx, node, name) if err != nil { return nil, err } @@ -460,7 +460,7 @@ func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) // RestartInstance stops and then starts an instance, returning the updated instance. func (im *instanceManager) RestartInstance(name string) (*instance.Instance, error) { - inst, exists := im.registry.Get(name) + inst, exists := im.registry.get(name) if !exists { return nil, fmt.Errorf("instance with name %s not found", name) } @@ -468,7 +468,7 @@ func (im *instanceManager) RestartInstance(name string) (*instance.Instance, err // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { ctx := context.Background() - remoteInst, err := im.remote.RestartInstance(ctx, node, name) + remoteInst, err := im.remote.restartInstance(ctx, node, name) if err != nil { return nil, err } @@ -505,7 +505,7 @@ func (im *instanceManager) RestartInstance(name string) (*instance.Instance, err // GetInstanceLogs retrieves the logs for a specific instance by its name. func (im *instanceManager) GetInstanceLogs(name string, numLines int) (string, error) { - inst, exists := im.registry.Get(name) + inst, exists := im.registry.get(name) if !exists { return "", fmt.Errorf("instance with name %s not found", name) } @@ -513,7 +513,7 @@ func (im *instanceManager) GetInstanceLogs(name string, numLines int) (string, e // Check if instance is remote and delegate to remote operation if node := im.getNodeForInstance(inst); node != nil { ctx := context.Background() - return im.remote.GetInstanceLogs(ctx, node, name, numLines) + return im.remote.getInstanceLogs(ctx, node, name, numLines) } // Get logs from the local instance @@ -532,5 +532,5 @@ func (im *instanceManager) setPortInOptions(options *instance.Options, port int) // EvictLRUInstance finds and stops the least recently used running instance. func (im *instanceManager) EvictLRUInstance() error { - return im.lifecycle.EvictLRU() + return im.lifecycle.evictLRU() } diff --git a/pkg/manager/persistence.go b/pkg/manager/persistence.go index a2853cb..c1c93e5 100644 --- a/pkg/manager/persistence.go +++ b/pkg/manager/persistence.go @@ -39,7 +39,7 @@ func NewInstancePersister(instancesDir string) (*instancePersister, error) { } // Save persists an instance to disk with atomic write -func (p *instancePersister) Save(inst *instance.Instance) error { +func (p *instancePersister) save(inst *instance.Instance) error { if !p.enabled { return nil } @@ -101,7 +101,7 @@ func (p *instancePersister) Save(inst *instance.Instance) error { } // Load loads a single instance from disk by name. 
-func (p *instancePersister) Load(name string) (*instance.Instance, error) { +func (p *instancePersister) load(name string) (*instance.Instance, error) { if !p.enabled { return nil, fmt.Errorf("persistence is disabled") } @@ -127,7 +127,7 @@ func (p *instancePersister) Load(name string) (*instance.Instance, error) { } // Delete removes an instance's persistence file from disk. -func (p *instancePersister) Delete(name string) error { +func (p *instancePersister) delete(name string) error { if !p.enabled { return nil } @@ -154,7 +154,7 @@ func (p *instancePersister) Delete(name string) error { // LoadAll loads all persisted instances from disk. // Returns a slice of instances and any errors encountered during loading. -func (p *instancePersister) LoadAll() ([]*instance.Instance, error) { +func (p *instancePersister) loadAll() ([]*instance.Instance, error) { if !p.enabled { return nil, nil } diff --git a/pkg/manager/ports.go b/pkg/manager/ports.go index a77ff0e..c5aac3e 100644 --- a/pkg/manager/ports.go +++ b/pkg/manager/ports.go @@ -45,9 +45,9 @@ func NewPortAllocator(minPort, maxPort int) (*portAllocator, error) { }, nil } -// Allocate finds and allocates the first available port for the given instance. +// allocate finds and allocates the first available port for the given instance. // Returns the allocated port or an error if no ports are available. -func (p *portAllocator) Allocate(instanceName string) (int, error) { +func (p *portAllocator) allocate(instanceName string) (int, error) { if instanceName == "" { return 0, fmt.Errorf("instance name cannot be empty") } @@ -66,9 +66,9 @@ func (p *portAllocator) Allocate(instanceName string) (int, error) { return port, nil } -// AllocateSpecific allocates a specific port for the given instance. +// allocateSpecific allocates a specific port for the given instance. // Returns an error if the port is already allocated or out of range. -func (p *portAllocator) AllocateSpecific(port int, instanceName string) error { +func (p *portAllocator) allocateSpecific(port int, instanceName string) error { if instanceName == "" { return fmt.Errorf("instance name cannot be empty") } @@ -89,9 +89,9 @@ func (p *portAllocator) AllocateSpecific(port int, instanceName string) error { return nil } -// Release releases a specific port, making it available for reuse. +// release releases a specific port, making it available for reuse. // Returns an error if the port is not allocated. -func (p *portAllocator) Release(port int) error { +func (p *portAllocator) release(port int) error { if port < p.minPort || port > p.maxPort { return fmt.Errorf("port %d is out of range [%d-%d]", port, p.minPort, p.maxPort) } @@ -109,10 +109,10 @@ func (p *portAllocator) Release(port int) error { return nil } -// ReleaseByInstance releases all ports allocated to the given instance. +// releaseByInstance releases all ports allocated to the given instance. // This is useful for cleanup when deleting or updating an instance. // Returns the number of ports released. -func (p *portAllocator) ReleaseByInstance(instanceName string) int { +func (p *portAllocator) releaseByInstance(instanceName string) int { if instanceName == "" { return 0 } diff --git a/pkg/manager/registry.go b/pkg/manager/registry.go index 41d801b..aa30ada 100644 --- a/pkg/manager/registry.go +++ b/pkg/manager/registry.go @@ -23,7 +23,7 @@ func NewInstanceRegistry() *instanceRegistry { // Get retrieves an instance by name. // Returns the instance and true if found, nil and false otherwise. 
-func (r *instanceRegistry) Get(name string) (*instance.Instance, bool) { +func (r *instanceRegistry) get(name string) (*instance.Instance, bool) { r.mu.RLock() defer r.mu.RUnlock() @@ -32,7 +32,7 @@ func (r *instanceRegistry) Get(name string) (*instance.Instance, bool) { } // List returns a snapshot copy of all instances to prevent external mutation. -func (r *instanceRegistry) List() []*instance.Instance { +func (r *instanceRegistry) list() []*instance.Instance { r.mu.RLock() defer r.mu.RUnlock() @@ -44,7 +44,7 @@ func (r *instanceRegistry) List() []*instance.Instance { } // ListRunning returns a snapshot of all currently running instances. -func (r *instanceRegistry) ListRunning() []*instance.Instance { +func (r *instanceRegistry) listRunning() []*instance.Instance { r.mu.RLock() defer r.mu.RUnlock() @@ -59,7 +59,7 @@ func (r *instanceRegistry) ListRunning() []*instance.Instance { // Add adds a new instance to the registry. // Returns an error if an instance with the same name already exists. -func (r *instanceRegistry) Add(inst *instance.Instance) error { +func (r *instanceRegistry) add(inst *instance.Instance) error { if inst == nil { return fmt.Errorf("cannot add nil instance") } @@ -83,7 +83,7 @@ func (r *instanceRegistry) Add(inst *instance.Instance) error { // Remove removes an instance from the registry. // Returns an error if the instance doesn't exist. -func (r *instanceRegistry) Remove(name string) error { +func (r *instanceRegistry) remove(name string) error { r.mu.Lock() defer r.mu.Unlock() @@ -98,30 +98,30 @@ func (r *instanceRegistry) Remove(name string) error { } // MarkRunning marks an instance as running using lock-free sync.Map. -func (r *instanceRegistry) MarkRunning(name string) { +func (r *instanceRegistry) markRunning(name string) { r.running.Store(name, struct{}{}) } // MarkStopped marks an instance as stopped using lock-free sync.Map. -func (r *instanceRegistry) MarkStopped(name string) { +func (r *instanceRegistry) markStopped(name string) { r.running.Delete(name) } // IsRunning checks if an instance is running using lock-free sync.Map. -func (r *instanceRegistry) IsRunning(name string) bool { +func (r *instanceRegistry) isRunning(name string) bool { _, isRunning := r.running.Load(name) return isRunning } // Count returns the total number of instances in the registry. -func (r *instanceRegistry) Count() int { +func (r *instanceRegistry) count() int { r.mu.RLock() defer r.mu.RUnlock() return len(r.instances) } // CountRunning returns the number of currently running instances. -func (r *instanceRegistry) CountRunning() int { +func (r *instanceRegistry) countRunning() int { count := 0 r.running.Range(func(key, value any) bool { count++ diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go index 9f9288a..b466a3b 100644 --- a/pkg/manager/remote.go +++ b/pkg/manager/remote.go @@ -47,7 +47,7 @@ func NewRemoteManager(nodes map[string]config.NodeConfig, timeout time.Duration) // GetNodeForInstance returns the node configuration for a given instance. // Returns nil if the instance is not mapped to any node. -func (rm *remoteManager) GetNodeForInstance(instanceName string) (*config.NodeConfig, bool) { +func (rm *remoteManager) getNodeForInstance(instanceName string) (*config.NodeConfig, bool) { rm.mu.RLock() defer rm.mu.RUnlock() @@ -57,7 +57,7 @@ func (rm *remoteManager) GetNodeForInstance(instanceName string) (*config.NodeCo // SetInstanceNode maps an instance to a specific node. // Returns an error if the node doesn't exist. 
-func (rm *remoteManager) SetInstanceNode(instanceName, nodeName string) error { +func (rm *remoteManager) setInstanceNode(instanceName, nodeName string) error { rm.mu.Lock() defer rm.mu.Unlock() @@ -71,7 +71,7 @@ func (rm *remoteManager) SetInstanceNode(instanceName, nodeName string) error { } // RemoveInstance removes the instance-to-node mapping. -func (rm *remoteManager) RemoveInstance(instanceName string) { +func (rm *remoteManager) removeInstance(instanceName string) { rm.mu.Lock() defer rm.mu.Unlock() @@ -138,7 +138,7 @@ func parseRemoteResponse(resp *http.Response, result any) error { // --- Remote CRUD operations --- // ListInstances lists all instances on a remote node. -func (rm *remoteManager) ListInstances(ctx context.Context, node *config.NodeConfig) ([]*instance.Instance, error) { +func (rm *remoteManager) listInstances(ctx context.Context, node *config.NodeConfig) ([]*instance.Instance, error) { resp, err := rm.makeRemoteRequest(ctx, node, "GET", apiBasePath, nil) if err != nil { return nil, err @@ -153,7 +153,7 @@ func (rm *remoteManager) ListInstances(ctx context.Context, node *config.NodeCon } // CreateInstance creates a new instance on a remote node. -func (rm *remoteManager) CreateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { +func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, opts) @@ -170,7 +170,7 @@ func (rm *remoteManager) CreateInstance(ctx context.Context, node *config.NodeCo } // GetInstance retrieves an instance by name from a remote node. -func (rm *remoteManager) GetInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { +func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) if err != nil { @@ -186,7 +186,7 @@ func (rm *remoteManager) GetInstance(ctx context.Context, node *config.NodeConfi } // UpdateInstance updates an existing instance on a remote node. -func (rm *remoteManager) UpdateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { +func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "PUT", path, opts) @@ -203,7 +203,7 @@ func (rm *remoteManager) UpdateInstance(ctx context.Context, node *config.NodeCo } // DeleteInstance deletes an instance from a remote node. -func (rm *remoteManager) DeleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { +func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { path := fmt.Sprintf("%s%s/", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil) if err != nil { @@ -214,7 +214,7 @@ func (rm *remoteManager) DeleteInstance(ctx context.Context, node *config.NodeCo } // StartInstance starts an instance on a remote node. 
-func (rm *remoteManager) StartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { +func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { path := fmt.Sprintf("%s%s/start", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { @@ -230,7 +230,7 @@ func (rm *remoteManager) StartInstance(ctx context.Context, node *config.NodeCon } // StopInstance stops an instance on a remote node. -func (rm *remoteManager) StopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { +func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { path := fmt.Sprintf("%s%s/stop", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { @@ -246,7 +246,7 @@ func (rm *remoteManager) StopInstance(ctx context.Context, node *config.NodeConf } // RestartInstance restarts an instance on a remote node. -func (rm *remoteManager) RestartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { +func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { path := fmt.Sprintf("%s%s/restart", apiBasePath, name) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { @@ -262,7 +262,7 @@ func (rm *remoteManager) RestartInstance(ctx context.Context, node *config.NodeC } // GetInstanceLogs retrieves logs for an instance from a remote node. -func (rm *remoteManager) GetInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) { +func (rm *remoteManager) getInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) { path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, name, numLines) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) if err != nil { From d923732aba4dddbfa3fb47d552c491fb7e8bba5d Mon Sep 17 00:00:00 2001 From: LordMathis Date: Mon, 20 Oct 2025 22:27:22 +0200 Subject: [PATCH 04/22] Delete unused code --- pkg/manager/manager.go | 12 ------------ pkg/manager/persistence.go | 26 -------------------------- pkg/manager/registry.go | 10 ---------- pkg/manager/remote.go | 15 --------------- 4 files changed, 63 deletions(-) diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 98aac8f..171d6ca 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -26,18 +26,6 @@ type InstanceManager interface { Shutdown() } -type RemoteManager interface { - ListRemoteInstances(node *config.NodeConfig) ([]*instance.Instance, error) - CreateRemoteInstance(node *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error) - GetRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error) - UpdateRemoteInstance(node *config.NodeConfig, name string, options *instance.Options) (*instance.Instance, error) - DeleteRemoteInstance(node *config.NodeConfig, name string) error - StartRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error) - StopRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error) - RestartRemoteInstance(node *config.NodeConfig, name string) (*instance.Instance, error) - GetRemoteInstanceLogs(node *config.NodeConfig, name string, numLines int) (string, error) -} - type 
instanceManager struct { // Components (each with own synchronization) registry *instanceRegistry diff --git a/pkg/manager/persistence.go b/pkg/manager/persistence.go index c1c93e5..21f4588 100644 --- a/pkg/manager/persistence.go +++ b/pkg/manager/persistence.go @@ -100,32 +100,6 @@ func (p *instancePersister) save(inst *instance.Instance) error { return nil } -// Load loads a single instance from disk by name. -func (p *instancePersister) load(name string) (*instance.Instance, error) { - if !p.enabled { - return nil, fmt.Errorf("persistence is disabled") - } - - if err := p.validateInstanceName(name); err != nil { - return nil, err - } - - p.mu.Lock() - defer p.mu.Unlock() - - instancePath := filepath.Join(p.instancesDir, name+".json") - - inst, err := p.loadInstanceFile(name, instancePath) - if err != nil { - if os.IsNotExist(err) { - return nil, fmt.Errorf("instance %s not found", name) - } - return nil, err - } - - return inst, nil -} - // Delete removes an instance's persistence file from disk. func (p *instancePersister) delete(name string) error { if !p.enabled { diff --git a/pkg/manager/registry.go b/pkg/manager/registry.go index aa30ada..c8758f9 100644 --- a/pkg/manager/registry.go +++ b/pkg/manager/registry.go @@ -119,13 +119,3 @@ func (r *instanceRegistry) count() int { defer r.mu.RUnlock() return len(r.instances) } - -// CountRunning returns the number of currently running instances. -func (r *instanceRegistry) countRunning() int { - count := 0 - r.running.Range(func(key, value any) bool { - count++ - return true - }) - return count -} diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go index b466a3b..50d2463 100644 --- a/pkg/manager/remote.go +++ b/pkg/manager/remote.go @@ -137,21 +137,6 @@ func parseRemoteResponse(resp *http.Response, result any) error { // --- Remote CRUD operations --- -// ListInstances lists all instances on a remote node. -func (rm *remoteManager) listInstances(ctx context.Context, node *config.NodeConfig) ([]*instance.Instance, error) { - resp, err := rm.makeRemoteRequest(ctx, node, "GET", apiBasePath, nil) - if err != nil { - return nil, err - } - - var instances []*instance.Instance - if err := parseRemoteResponse(resp, &instances); err != nil { - return nil, err - } - - return instances, nil -} - // CreateInstance creates a new instance on a remote node. 
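// ---- Editor's note (not part of the patch) ----------------------------
// On the persistence.go deletion above: the persister stores one JSON
// document per instance at <instancesDir>/<name>.json, so the removed
// load() was just name validation plus a single-file read. Roughly what
// that read amounts to — a sketch under the assumption that instances
// marshal to plain JSON, not the actual implementation:

func loadInstanceSketch(dir, name string) (*instance.Instance, error) {
	data, err := os.ReadFile(filepath.Join(dir, name+".json"))
	if err != nil {
		return nil, err // os.IsNotExist(err) maps to "instance not found"
	}
	var inst instance.Instance
	if err := json.Unmarshal(data, &inst); err != nil {
		return nil, err
	}
	return &inst, nil
}
// -----------------------------------------------------------------------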
func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { path := fmt.Sprintf("%s%s/", apiBasePath, name) From a2d4622486ef9f6834c57774f3c1caaf8b132820 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Mon, 20 Oct 2025 22:59:31 +0200 Subject: [PATCH 05/22] Refactor instance locking mechanism to use per-instance locks for concurrency --- pkg/manager/manager.go | 22 ++++++++++++++++++++-- pkg/manager/operations.go | 35 ++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 171d6ca..551b1dd 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -40,8 +40,9 @@ type instanceManager struct { localNodeName string // Name of the local node // Synchronization - operationMu sync.Mutex // Protects start/stop/update/delete/restart operations - shutdownOnce sync.Once + operationMu sync.Mutex // DEPRECATED: Use instanceLocks for per-instance operations + instanceLocks sync.Map // map[string]*sync.Mutex - per-instance locks for concurrent operations + shutdownOnce sync.Once } // New creates a new instance of InstanceManager. @@ -287,3 +288,20 @@ func (im *instanceManager) getNodeForInstance(inst *instance.Instance) *config.N return nil } + +// lockInstance returns the lock for a specific instance, creating one if needed. +// This allows concurrent operations on different instances while preventing +// concurrent operations on the same instance. +func (im *instanceManager) lockInstance(name string) *sync.Mutex { + lock, _ := im.instanceLocks.LoadOrStore(name, &sync.Mutex{}) + return lock.(*sync.Mutex) +} + +// unlockAndCleanup unlocks the instance lock and removes it from the map. +// This should only be called when deleting an instance to prevent memory leaks. 
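// ---- Editor's note (not part of the patch) ----------------------------
// Usage sketch of the per-instance locking introduced in this commit.
// sync.Map.LoadOrStore guarantees that concurrent callers passing the same
// name always receive the same *sync.Mutex, so operations on one instance
// serialize while operations on different instances proceed in parallel:
//
//	lock := im.lockInstance(name)
//	lock.Lock()
//	defer lock.Unlock()             // start/stop/update/restart paths
//
//	lock := im.lockInstance(name)
//	lock.Lock()
//	defer im.unlockAndCleanup(name) // delete path: also drops the map entry
// -----------------------------------------------------------------------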
+func (im *instanceManager) unlockAndCleanup(name string) { + if lock, ok := im.instanceLocks.Load(name); ok { + lock.(*sync.Mutex).Unlock() + im.instanceLocks.Delete(name) + } +} diff --git a/pkg/manager/operations.go b/pkg/manager/operations.go index a88ec77..4b2e719 100644 --- a/pkg/manager/operations.go +++ b/pkg/manager/operations.go @@ -240,9 +240,10 @@ func (im *instanceManager) UpdateInstance(name string, options *instance.Options return nil, err } - // Lock for local instance operations to prevent races - im.operationMu.Lock() - defer im.operationMu.Unlock() + // Lock this specific instance only + lock := im.lockInstance(name) + lock.Lock() + defer lock.Unlock() // Handle port changes oldPort := inst.GetPort() @@ -330,9 +331,10 @@ func (im *instanceManager) DeleteInstance(name string) error { return nil } - // Lock for local instance operations to prevent races - im.operationMu.Lock() - defer im.operationMu.Unlock() + // Lock this specific instance and clean up the lock on completion + lock := im.lockInstance(name) + lock.Lock() + defer im.unlockAndCleanup(name) if inst.IsRunning() { return fmt.Errorf("instance with name %s is still running, stop it before deleting", name) @@ -376,9 +378,10 @@ func (im *instanceManager) StartInstance(name string) (*instance.Instance, error return inst, nil } - // Lock for local instance operations to prevent races - im.operationMu.Lock() - defer im.operationMu.Unlock() + // Lock this specific instance only + lock := im.lockInstance(name) + lock.Lock() + defer lock.Unlock() if inst.IsRunning() { return inst, fmt.Errorf("instance with name %s is already running", name) @@ -438,9 +441,10 @@ func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) return inst, nil } - // Lock for local instance operations to prevent races - im.operationMu.Lock() - defer im.operationMu.Unlock() + // Lock this specific instance only + lock := im.lockInstance(name) + lock.Lock() + defer lock.Unlock() if !inst.IsRunning() { return inst, fmt.Errorf("instance with name %s is already stopped", name) @@ -479,9 +483,10 @@ func (im *instanceManager) RestartInstance(name string) (*instance.Instance, err return inst, nil } - // Lock for the entire restart operation to ensure atomicity - im.operationMu.Lock() - defer im.operationMu.Unlock() + // Lock this specific instance for the entire restart operation to ensure atomicity + lock := im.lockInstance(name) + lock.Lock() + defer lock.Unlock() // Stop the instance if inst.IsRunning() { From 7c64ab9cc6041a47cc5f91a074b7077c4257bb92 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 18:49:49 +0200 Subject: [PATCH 06/22] Make StartInstance and StopInstance idempotent --- pkg/manager/operations.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/manager/operations.go b/pkg/manager/operations.go index 4b2e719..09fe06b 100644 --- a/pkg/manager/operations.go +++ b/pkg/manager/operations.go @@ -383,8 +383,9 @@ func (im *instanceManager) StartInstance(name string) (*instance.Instance, error lock.Lock() defer lock.Unlock() + // Idempotent: if already running, just return success if inst.IsRunning() { - return inst, fmt.Errorf("instance with name %s is already running", name) + return inst, nil } // Check max running instances limit for local instances only @@ -446,8 +447,9 @@ func (im *instanceManager) StopInstance(name string) (*instance.Instance, error) lock.Lock() defer lock.Unlock() + // Idempotent: if already stopped, just return success if !inst.IsRunning() { - return 
inst, fmt.Errorf("instance with name %s is already stopped", name) + return inst, nil } if err := inst.Stop(); err != nil { From 4d05fcea46c2294cab328b17d07efe143d013d87 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 21:39:01 +0200 Subject: [PATCH 07/22] Improve manager tests --- pkg/manager/lifecycle_test.go | 220 +++++++++++++++++++++ pkg/manager/manager_test.go | 111 +++++------ pkg/manager/operations_test.go | 260 ++++++++++++++++--------- pkg/manager/timeout_test.go | 343 --------------------------------- 4 files changed, 440 insertions(+), 494 deletions(-) create mode 100644 pkg/manager/lifecycle_test.go delete mode 100644 pkg/manager/timeout_test.go diff --git a/pkg/manager/lifecycle_test.go b/pkg/manager/lifecycle_test.go new file mode 100644 index 0000000..520f445 --- /dev/null +++ b/pkg/manager/lifecycle_test.go @@ -0,0 +1,220 @@ +package manager_test + +import ( + "llamactl/pkg/backends" + "llamactl/pkg/instance" + "llamactl/pkg/manager" + "sync" + "testing" + "time" +) + +func TestInstanceTimeoutLogic(t *testing.T) { + testManager := createTestManager() + defer testManager.Shutdown() + + idleTimeout := 1 // 1 minute + inst := createInstanceWithTimeout(t, testManager, "timeout-test", "/path/to/model.gguf", &idleTimeout) + + // Test timeout logic with mock time provider + mockTime := NewMockTimeProvider(time.Now()) + inst.SetTimeProvider(mockTime) + + // Set instance to running state so timeout logic can work + inst.SetStatus(instance.Running) + defer inst.SetStatus(instance.Stopped) + + // Update last request time + inst.UpdateLastRequestTime() + + // Initially should not timeout (just updated) + if inst.ShouldTimeout() { + t.Error("Instance should not timeout immediately after request") + } + + // Advance time to trigger timeout + mockTime.SetTime(time.Now().Add(2 * time.Minute)) + + // Now it should timeout + if !inst.ShouldTimeout() { + t.Error("Instance should timeout after idle period") + } +} + +func TestInstanceWithoutTimeoutNeverExpires(t *testing.T) { + testManager := createTestManager() + defer testManager.Shutdown() + + noTimeoutInst := createInstanceWithTimeout(t, testManager, "no-timeout-test", "/path/to/model.gguf", nil) + + mockTime := NewMockTimeProvider(time.Now()) + noTimeoutInst.SetTimeProvider(mockTime) + noTimeoutInst.SetStatus(instance.Running) + defer noTimeoutInst.SetStatus(instance.Stopped) + + noTimeoutInst.UpdateLastRequestTime() + + // Advance time significantly + mockTime.SetTime(mockTime.Now().Add(24 * time.Hour)) + + // Even with time advanced, should not timeout + if noTimeoutInst.ShouldTimeout() { + t.Error("Instance without timeout configuration should never timeout") + } +} + +func TestEvictLRUInstance_Success(t *testing.T) { + manager := createTestManager() + defer manager.Shutdown() + + // Create 3 instances with idle timeout enabled (value doesn't matter for LRU logic) + validTimeout := 1 + inst1 := createInstanceWithTimeout(t, manager, "instance-1", "/path/to/model1.gguf", &validTimeout) + inst2 := createInstanceWithTimeout(t, manager, "instance-2", "/path/to/model2.gguf", &validTimeout) + inst3 := createInstanceWithTimeout(t, manager, "instance-3", "/path/to/model3.gguf", &validTimeout) + + // Set up mock time and set instances to running + mockTime := NewMockTimeProvider(time.Now()) + inst1.SetTimeProvider(mockTime) + inst2.SetTimeProvider(mockTime) + inst3.SetTimeProvider(mockTime) + + inst1.SetStatus(instance.Running) + inst2.SetStatus(instance.Running) + inst3.SetStatus(instance.Running) + defer func() { + // Clean up 
- ensure all instances are stopped + for _, inst := range []*instance.Instance{inst1, inst2, inst3} { + if inst.IsRunning() { + inst.SetStatus(instance.Stopped) + } + } + }() + + // Set different last request times (oldest to newest) + // inst1: oldest (will be evicted) + inst1.UpdateLastRequestTime() + + mockTime.SetTime(mockTime.Now().Add(1 * time.Minute)) + inst2.UpdateLastRequestTime() + + mockTime.SetTime(mockTime.Now().Add(1 * time.Minute)) + inst3.UpdateLastRequestTime() + + // Evict LRU instance (should be inst1) + if err := manager.EvictLRUInstance(); err != nil { + t.Fatalf("EvictLRUInstance failed: %v", err) + } + + // Verify inst1 is stopped + if inst1.IsRunning() { + t.Error("Expected instance-1 to be stopped after eviction") + } + + // Verify inst2 and inst3 are still running + if !inst2.IsRunning() { + t.Error("Expected instance-2 to still be running") + } + if !inst3.IsRunning() { + t.Error("Expected instance-3 to still be running") + } +} + +func TestEvictLRUInstance_NoRunningInstances(t *testing.T) { + manager := createTestManager() + defer manager.Shutdown() + + err := manager.EvictLRUInstance() + if err == nil { + t.Error("Expected error when no running instances exist") + } + if err.Error() != "failed to find lru instance" { + t.Errorf("Expected 'failed to find lru instance' error, got: %v", err) + } +} + +func TestEvictLRUInstance_OnlyEvictsTimeoutEnabledInstances(t *testing.T) { + manager := createTestManager() + defer manager.Shutdown() + + // Create mix of instances: some with timeout enabled, some disabled + // Only timeout-enabled instances should be eligible for eviction + validTimeout := 1 + zeroTimeout := 0 + instWithTimeout := createInstanceWithTimeout(t, manager, "with-timeout", "/path/to/model-with-timeout.gguf", &validTimeout) + instNoTimeout1 := createInstanceWithTimeout(t, manager, "no-timeout-1", "/path/to/model-no-timeout1.gguf", &zeroTimeout) + instNoTimeout2 := createInstanceWithTimeout(t, manager, "no-timeout-2", "/path/to/model-no-timeout2.gguf", nil) + + // Set all instances to running + instances := []*instance.Instance{instWithTimeout, instNoTimeout1, instNoTimeout2} + for _, inst := range instances { + inst.SetStatus(instance.Running) + inst.UpdateLastRequestTime() + } + defer func() { + // Reset instances to stopped to avoid shutdown panics + for _, inst := range instances { + if inst.IsRunning() { + inst.SetStatus(instance.Stopped) + } + } + }() + + // Evict LRU instance - should only consider the one with timeout + err := manager.EvictLRUInstance() + if err != nil { + t.Fatalf("EvictLRUInstance failed: %v", err) + } + + // Verify only the instance with timeout was evicted + if instWithTimeout.IsRunning() { + t.Error("Expected with-timeout instance to be stopped after eviction") + } + if !instNoTimeout1.IsRunning() { + t.Error("Expected no-timeout-1 instance to still be running") + } + if !instNoTimeout2.IsRunning() { + t.Error("Expected no-timeout-2 instance to still be running") + } +} + +// Helper function to create instances with different timeout configurations +func createInstanceWithTimeout(t *testing.T, manager manager.InstanceManager, name, model string, timeout *int) *instance.Instance { + t.Helper() + options := &instance.Options{ + IdleTimeout: timeout, + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: model, + }, + }, + } + inst, err := manager.CreateInstance(name, options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } 
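// ---- Editor's note (not part of the patch) ----------------------------
// The timeout tests above avoid real sleeps by injecting a clock: the
// instance presumably consults its TimeProvider's Now() when evaluating
// ShouldTimeout(), so advancing the MockTimeProvider defined at the bottom
// of this file simulates idle time deterministically:
//
//	mockTime := NewMockTimeProvider(time.Now())
//	inst.SetTimeProvider(mockTime)
//	inst.UpdateLastRequestTime()
//	mockTime.SetTime(mockTime.Now().Add(2 * time.Minute))
//	_ = inst.ShouldTimeout() // true once idle time exceeds IdleTimeout
// -----------------------------------------------------------------------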
+ return inst +} + +// Helper for timeout tests +type MockTimeProvider struct { + currentTime time.Time + mu sync.RWMutex +} + +func NewMockTimeProvider(t time.Time) *MockTimeProvider { + return &MockTimeProvider{currentTime: t} +} + +func (m *MockTimeProvider) Now() time.Time { + m.mu.RLock() + defer m.mu.RUnlock() + return m.currentTime +} + +func (m *MockTimeProvider) SetTime(t time.Time) { + m.mu.Lock() + defer m.mu.Unlock() + m.currentTime = t +} diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go index 997e0f6..b85c691 100644 --- a/pkg/manager/manager_test.go +++ b/pkg/manager/manager_test.go @@ -8,37 +8,16 @@ import ( "llamactl/pkg/manager" "os" "path/filepath" - "strings" "sync" "testing" ) func TestNewInstanceManager(t *testing.T) { - backendConfig := config.BackendConfig{ - LlamaCpp: config.BackendSettings{ - Command: "llama-server", - }, - MLX: config.BackendSettings{ - Command: "mlx_lm.server", - }, - } - - cfg := config.InstancesConfig{ - PortRange: [2]int{8000, 9000}, - LogsDir: "/tmp/test", - MaxInstances: 5, - DefaultAutoRestart: true, - DefaultMaxRestarts: 3, - DefaultRestartDelay: 5, - TimeoutCheckInterval: 5, - } - - mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + mgr := createTestManager() if mgr == nil { t.Fatal("NewInstanceManager returned nil") } - // Test initial state instances, err := mgr.ListInstances() if err != nil { t.Fatalf("ListInstances failed: %v", err) @@ -48,26 +27,12 @@ func TestNewInstanceManager(t *testing.T) { } } -func TestPersistence(t *testing.T) { +func TestPersistence_SaveAndLoad(t *testing.T) { tempDir := t.TempDir() + cfg := createPersistenceConfig(tempDir) + backendConfig := createBackendConfig() - backendConfig := config.BackendConfig{ - LlamaCpp: config.BackendSettings{ - Command: "llama-server", - }, - MLX: config.BackendSettings{ - Command: "mlx_lm.server", - }, - } - - cfg := config.InstancesConfig{ - PortRange: [2]int{8000, 9000}, - InstancesDir: tempDir, - MaxInstances: 10, - TimeoutCheckInterval: 5, - } - - // Test instance persistence on creation + // Create instance and check file was created manager1 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") options := &instance.Options{ BackendOptions: backends.Options{ @@ -84,13 +49,12 @@ func TestPersistence(t *testing.T) { t.Fatalf("CreateInstance failed: %v", err) } - // Check that JSON file was created expectedPath := filepath.Join(tempDir, "test-instance.json") if _, err := os.Stat(expectedPath); os.IsNotExist(err) { t.Errorf("Expected persistence file %s to exist", expectedPath) } - // Test loading instances from disk + // Load instances from disk manager2 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") instances, err := manager2.ListInstances() if err != nil { @@ -102,15 +66,32 @@ func TestPersistence(t *testing.T) { if instances[0].Name != "test-instance" { t.Errorf("Expected loaded instance name 'test-instance', got %q", instances[0].Name) } +} - // Test port map populated from loaded instances (port conflict should be detected) - _, err = manager2.CreateInstance("new-instance", options) // Same port - if err == nil || !strings.Contains(err.Error(), "port") { - t.Errorf("Expected port conflict error, got: %v", err) +func TestPersistence_DeleteRemovesFile(t *testing.T) { + tempDir := t.TempDir() + cfg := createPersistenceConfig(tempDir) + backendConfig := createBackendConfig() + + mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + options := 
&instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8080, + }, + }, } - // Test file deletion on instance deletion - err = manager2.DeleteInstance("test-instance") + _, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + expectedPath := filepath.Join(tempDir, "test-instance.json") + + err = mgr.DeleteInstance("test-instance") if err != nil { t.Fatalf("DeleteInstance failed: %v", err) } @@ -192,9 +173,9 @@ func TestShutdown(t *testing.T) { mgr.Shutdown() } -// Helper function to create a test manager with standard config -func createTestManager() manager.InstanceManager { - backendConfig := config.BackendConfig{ +// Helper functions for test configuration +func createBackendConfig() config.BackendConfig { + return config.BackendConfig{ LlamaCpp: config.BackendSettings{ Command: "llama-server", }, @@ -202,7 +183,18 @@ func createTestManager() manager.InstanceManager { Command: "mlx_lm.server", }, } +} +func createPersistenceConfig(dir string) config.InstancesConfig { + return config.InstancesConfig{ + PortRange: [2]int{8000, 9000}, + InstancesDir: dir, + MaxInstances: 10, + TimeoutCheckInterval: 5, + } +} + +func createTestManager() manager.InstanceManager { cfg := config.InstancesConfig{ PortRange: [2]int{8000, 9000}, LogsDir: "/tmp/test", @@ -212,24 +204,13 @@ func createTestManager() manager.InstanceManager { DefaultRestartDelay: 5, TimeoutCheckInterval: 5, } - return manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + return manager.New(createBackendConfig(), cfg, map[string]config.NodeConfig{}, "main") } func TestAutoRestartDisabledInstanceStatus(t *testing.T) { tempDir := t.TempDir() - - backendConfig := config.BackendConfig{ - LlamaCpp: config.BackendSettings{ - Command: "llama-server", - }, - } - - cfg := config.InstancesConfig{ - PortRange: [2]int{8000, 9000}, - InstancesDir: tempDir, - MaxInstances: 10, - TimeoutCheckInterval: 5, - } + cfg := createPersistenceConfig(tempDir) + backendConfig := createBackendConfig() // Create first manager and instance with auto-restart disabled manager1 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") diff --git a/pkg/manager/operations_test.go b/pkg/manager/operations_test.go index 7523c5b..d078145 100644 --- a/pkg/manager/operations_test.go +++ b/pkg/manager/operations_test.go @@ -38,8 +38,7 @@ func TestCreateInstance_Success(t *testing.T) { } } -func TestCreateInstance_ValidationAndLimits(t *testing.T) { - // Test duplicate names +func TestCreateInstance_DuplicateName(t *testing.T) { mngr := createTestManager() options := &instance.Options{ BackendOptions: backends.Options{ @@ -63,15 +62,13 @@ func TestCreateInstance_ValidationAndLimits(t *testing.T) { if !strings.Contains(err.Error(), "already exists") { t.Errorf("Expected duplicate name error, got: %v", err) } +} - // Test max instances limit +func TestCreateInstance_MaxInstancesLimit(t *testing.T) { backendConfig := config.BackendConfig{ LlamaCpp: config.BackendSettings{ Command: "llama-server", }, - MLX: config.BackendSettings{ - Command: "mlx_lm.server", - }, } cfg := config.InstancesConfig{ PortRange: [2]int{8000, 9000}, @@ -80,7 +77,16 @@ func TestCreateInstance_ValidationAndLimits(t *testing.T) { } limitedManager := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") - _, err = 
limitedManager.CreateInstance("instance1", options) + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + _, err := limitedManager.CreateInstance("instance1", options) if err != nil { t.Fatalf("CreateInstance 1 failed: %v", err) } @@ -95,78 +101,98 @@ func TestCreateInstance_ValidationAndLimits(t *testing.T) { } } -func TestPortManagement(t *testing.T) { - manager := createTestManager() - - // Test auto port assignment - options1 := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - inst1, err := manager.CreateInstance("instance1", options1) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - port1 := inst1.GetPort() - if port1 < 8000 || port1 > 9000 { - t.Errorf("Expected port in range 8000-9000, got %d", port1) - } - - // Test port conflict detection - options2 := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model2.gguf", - Port: port1, // Same port - should conflict - }, - }, - } - - _, err = manager.CreateInstance("instance2", options2) - if err == nil { - t.Error("Expected error for port conflict") - } - if !strings.Contains(err.Error(), "port") && !strings.Contains(err.Error(), "in use") { - t.Errorf("Expected port conflict error, got: %v", err) - } - - // Test port release on deletion - specificPort := 8080 - options3 := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - Port: specificPort, - }, - }, - } - - _, err = manager.CreateInstance("port-test", options3) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - err = manager.DeleteInstance("port-test") - if err != nil { - t.Fatalf("DeleteInstance failed: %v", err) - } - - // Should be able to create new instance with same port - _, err = manager.CreateInstance("new-port-test", options3) - if err != nil { - t.Errorf("Expected to reuse port after deletion, got error: %v", err) - } -} - -func TestInstanceOperations(t *testing.T) { +func TestPort_AutoAssignment(t *testing.T) { + manager := createTestManager() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + inst, err := manager.CreateInstance("instance1", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + port := inst.GetPort() + if port < 8000 || port > 9000 { + t.Errorf("Expected port in range 8000-9000, got %d", port) + } +} + +func TestPort_ConflictDetection(t *testing.T) { + manager := createTestManager() + + options1 := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8080, + }, + }, + } + + _, err := manager.CreateInstance("instance1", options1) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + // Try to create instance with same port + options2 := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: 
backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model2.gguf", + Port: 8080, // Same port - should conflict + }, + }, + } + + _, err = manager.CreateInstance("instance2", options2) + if err == nil { + t.Error("Expected error for port conflict") + } + if !strings.Contains(err.Error(), "port") && !strings.Contains(err.Error(), "in use") { + t.Errorf("Expected port conflict error, got: %v", err) + } +} + +func TestPort_ReleaseOnDeletion(t *testing.T) { + manager := createTestManager() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8080, + }, + }, + } + + _, err := manager.CreateInstance("port-test", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + err = manager.DeleteInstance("port-test") + if err != nil { + t.Fatalf("DeleteInstance failed: %v", err) + } + + // Should be able to create new instance with same port + _, err = manager.CreateInstance("new-port-test", options) + if err != nil { + t.Errorf("Expected to reuse port after deletion, got error: %v", err) + } +} + +func TestGetInstance(t *testing.T) { manager := createTestManager() options := &instance.Options{ @@ -178,13 +204,11 @@ func TestInstanceOperations(t *testing.T) { }, } - // Create instance created, err := manager.CreateInstance("test-instance", options) if err != nil { t.Fatalf("CreateInstance failed: %v", err) } - // Get instance retrieved, err := manager.GetInstance("test-instance") if err != nil { t.Fatalf("GetInstance failed: %v", err) @@ -192,8 +216,26 @@ func TestInstanceOperations(t *testing.T) { if retrieved.Name != created.Name { t.Errorf("Expected name %q, got %q", created.Name, retrieved.Name) } +} + +func TestUpdateInstance(t *testing.T) { + manager := createTestManager() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8080, + }, + }, + } + + _, err := manager.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } - // Update instance newOptions := &instance.Options{ BackendOptions: backends.Options{ BackendType: backends.BackendTypeLlamaCpp, @@ -211,8 +253,25 @@ func TestInstanceOperations(t *testing.T) { if updated.GetOptions().BackendOptions.LlamaServerOptions.Model != "/path/to/new-model.gguf" { t.Errorf("Expected model '/path/to/new-model.gguf', got %q", updated.GetOptions().BackendOptions.LlamaServerOptions.Model) } +} + +func TestListInstances(t *testing.T) { + manager := createTestManager() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + _, err := manager.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } - // List instances instances, err := manager.ListInstances() if err != nil { t.Fatalf("ListInstances failed: %v", err) @@ -220,8 +279,25 @@ func TestInstanceOperations(t *testing.T) { if len(instances) != 1 { t.Errorf("Expected 1 instance, got %d", len(instances)) } +} + +func TestDeleteInstance(t *testing.T) { + manager := createTestManager() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: 
backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + _, err := manager.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } - // Delete instance err = manager.DeleteInstance("test-instance") if err != nil { t.Fatalf("DeleteInstance failed: %v", err) @@ -231,9 +307,21 @@ func TestInstanceOperations(t *testing.T) { if err == nil { t.Error("Instance should not exist after deletion") } +} - // Test operations on non-existent instances - _, err = manager.GetInstance("nonexistent") +func TestInstanceOperations_NonExistentInstance(t *testing.T) { + manager := createTestManager() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + _, err := manager.GetInstance("nonexistent") if err == nil || !strings.Contains(err.Error(), "not found") { t.Errorf("Expected 'not found' error, got: %v", err) } diff --git a/pkg/manager/timeout_test.go b/pkg/manager/timeout_test.go deleted file mode 100644 index e05c400..0000000 --- a/pkg/manager/timeout_test.go +++ /dev/null @@ -1,343 +0,0 @@ -package manager_test - -import ( - "llamactl/pkg/backends" - "llamactl/pkg/config" - "llamactl/pkg/instance" - "llamactl/pkg/manager" - "sync" - "testing" - "time" -) - -func TestTimeoutFunctionality(t *testing.T) { - // Test timeout checker initialization - backendConfig := config.BackendConfig{ - LlamaCpp: config.BackendSettings{Command: "llama-server"}, - MLX: config.BackendSettings{Command: "mlx_lm.server"}, - } - cfg := config.InstancesConfig{ - PortRange: [2]int{8000, 9000}, - TimeoutCheckInterval: 10, - MaxInstances: 5, - } - - manager := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") - if manager == nil { - t.Fatal("Manager should be initialized with timeout checker") - } - manager.Shutdown() // Clean up - - // Test timeout configuration and logic without starting the actual process - testManager := createTestManager() - defer testManager.Shutdown() - - idleTimeout := 1 // 1 minute - options := &instance.Options{ - IdleTimeout: &idleTimeout, - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - inst, err := testManager.CreateInstance("timeout-test", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - // Test timeout configuration is properly set - if inst.GetOptions().IdleTimeout == nil { - t.Fatal("Instance should have idle timeout configured") - } - if *inst.GetOptions().IdleTimeout != 1 { - t.Errorf("Expected idle timeout 1 minute, got %d", *inst.GetOptions().IdleTimeout) - } - - // Test timeout logic without actually starting the process - // Create a mock time provider to simulate timeout - mockTime := NewMockTimeProvider(time.Now()) - inst.SetTimeProvider(mockTime) - - // Set instance to running state so timeout logic can work - inst.SetStatus(instance.Running) - - // Simulate instance being "running" for timeout check (without actual process) - // We'll test the ShouldTimeout logic directly - inst.UpdateLastRequestTime() - - // Initially should not timeout (just updated) - if inst.ShouldTimeout() { - t.Error("Instance should not timeout immediately after request") - } - - // Advance time to trigger timeout - mockTime.SetTime(time.Now().Add(2 
* time.Minute)) - - // Now it should timeout - if !inst.ShouldTimeout() { - t.Error("Instance should timeout after idle period") - } - - // Reset running state to avoid shutdown issues - inst.SetStatus(instance.Stopped) - - // Test that instance without timeout doesn't timeout - noTimeoutOptions := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - // No IdleTimeout set - } - - noTimeoutInst, err := testManager.CreateInstance("no-timeout-test", noTimeoutOptions) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - noTimeoutInst.SetTimeProvider(mockTime) - noTimeoutInst.SetStatus(instance.Running) // Set to running for timeout check - noTimeoutInst.UpdateLastRequestTime() - - // Even with time advanced, should not timeout - if noTimeoutInst.ShouldTimeout() { - t.Error("Instance without timeout configuration should never timeout") - } - - // Reset running state to avoid shutdown issues - noTimeoutInst.SetStatus(instance.Stopped) -} - -func TestEvictLRUInstance_Success(t *testing.T) { - manager := createTestManager() - // Don't defer manager.Shutdown() - we'll handle cleanup manually - - // Create 3 instances with idle timeout enabled (value doesn't matter for LRU logic) - options1 := &instance.Options{ - IdleTimeout: func() *int { timeout := 1; return &timeout }(), // Any value > 0 - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model1.gguf", - }, - }, - } - options2 := &instance.Options{ - IdleTimeout: func() *int { timeout := 1; return &timeout }(), // Any value > 0 - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model2.gguf", - }, - }, - } - options3 := &instance.Options{ - IdleTimeout: func() *int { timeout := 1; return &timeout }(), // Any value > 0 - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model3.gguf", - }, - }, - } - - inst1, err := manager.CreateInstance("instance-1", options1) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - inst2, err := manager.CreateInstance("instance-2", options2) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - inst3, err := manager.CreateInstance("instance-3", options3) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - // Set up mock time and set instances to running - mockTime := NewMockTimeProvider(time.Now()) - inst1.SetTimeProvider(mockTime) - inst2.SetTimeProvider(mockTime) - inst3.SetTimeProvider(mockTime) - - inst1.SetStatus(instance.Running) - inst2.SetStatus(instance.Running) - inst3.SetStatus(instance.Running) - - // Set different last request times (oldest to newest) - // inst1: oldest (will be evicted) - inst1.UpdateLastRequestTime() - - mockTime.SetTime(mockTime.Now().Add(1 * time.Minute)) - inst2.UpdateLastRequestTime() - - mockTime.SetTime(mockTime.Now().Add(1 * time.Minute)) - inst3.UpdateLastRequestTime() - - // Evict LRU instance (should be inst1) - err = manager.EvictLRUInstance() - if err != nil { - t.Fatalf("EvictLRUInstance failed: %v", err) - } - - // Verify inst1 is stopped - if inst1.IsRunning() { - t.Error("Expected instance-1 to be stopped after eviction") - } - - // Verify inst2 and inst3 are still 
running - if !inst2.IsRunning() { - t.Error("Expected instance-2 to still be running") - } - if !inst3.IsRunning() { - t.Error("Expected instance-3 to still be running") - } - - // Clean up manually - set all to stopped and then shutdown - inst2.SetStatus(instance.Stopped) - inst3.SetStatus(instance.Stopped) -} - -func TestEvictLRUInstance_NoEligibleInstances(t *testing.T) { - // Helper function to create instances with different timeout configurations - createInstanceWithTimeout := func(manager manager.InstanceManager, name, model string, timeout *int) *instance.Instance { - options := &instance.Options{ - IdleTimeout: timeout, - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: model, - }, - }, - } - inst, err := manager.CreateInstance(name, options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - return inst - } - - t.Run("no running instances", func(t *testing.T) { - manager := createTestManager() - defer manager.Shutdown() - - err := manager.EvictLRUInstance() - if err == nil { - t.Error("Expected error when no running instances exist") - } - if err.Error() != "failed to find lru instance" { - t.Errorf("Expected 'failed to find lru instance' error, got: %v", err) - } - }) - - t.Run("only instances without timeout", func(t *testing.T) { - manager := createTestManager() - defer manager.Shutdown() - - // Create instances with various non-eligible timeout configurations - zeroTimeout := 0 - negativeTimeout := -1 - inst1 := createInstanceWithTimeout(manager, "no-timeout-1", "/path/to/model1.gguf", &zeroTimeout) - inst2 := createInstanceWithTimeout(manager, "no-timeout-2", "/path/to/model2.gguf", &negativeTimeout) - inst3 := createInstanceWithTimeout(manager, "no-timeout-3", "/path/to/model3.gguf", nil) - - // Set instances to running - instances := []*instance.Instance{inst1, inst2, inst3} - for _, inst := range instances { - inst.SetStatus(instance.Running) - } - defer func() { - // Reset instances to stopped to avoid shutdown panics - for _, inst := range instances { - inst.SetStatus(instance.Stopped) - } - }() - - // Try to evict - should fail because no eligible instances - err := manager.EvictLRUInstance() - if err == nil { - t.Error("Expected error when no eligible instances exist") - } - if err.Error() != "failed to find lru instance" { - t.Errorf("Expected 'failed to find lru instance' error, got: %v", err) - } - - // Verify all instances are still running - for i, inst := range instances { - if !inst.IsRunning() { - t.Errorf("Expected instance %d to still be running", i+1) - } - } - }) - - t.Run("mixed instances - evicts only eligible ones", func(t *testing.T) { - manager := createTestManager() - defer manager.Shutdown() - - // Create mix of instances: some with timeout enabled, some disabled - validTimeout := 1 - zeroTimeout := 0 - instWithTimeout := createInstanceWithTimeout(manager, "with-timeout", "/path/to/model-with-timeout.gguf", &validTimeout) - instNoTimeout1 := createInstanceWithTimeout(manager, "no-timeout-1", "/path/to/model-no-timeout1.gguf", &zeroTimeout) - instNoTimeout2 := createInstanceWithTimeout(manager, "no-timeout-2", "/path/to/model-no-timeout2.gguf", nil) - - // Set all instances to running - instances := []*instance.Instance{instWithTimeout, instNoTimeout1, instNoTimeout2} - for _, inst := range instances { - inst.SetStatus(instance.Running) - inst.UpdateLastRequestTime() - } - defer func() { - // Reset instances to stopped to avoid shutdown panics 
- for _, inst := range instances { - if inst.IsRunning() { - inst.SetStatus(instance.Stopped) - } - } - }() - - // Evict LRU instance - should only consider the one with timeout - err := manager.EvictLRUInstance() - if err != nil { - t.Fatalf("EvictLRUInstance failed: %v", err) - } - - // Verify only the instance with timeout was evicted - if instWithTimeout.IsRunning() { - t.Error("Expected with-timeout instance to be stopped after eviction") - } - if !instNoTimeout1.IsRunning() { - t.Error("Expected no-timeout-1 instance to still be running") - } - if !instNoTimeout2.IsRunning() { - t.Error("Expected no-timeout-2 instance to still be running") - } - }) -} - -// Helper for timeout tests -type MockTimeProvider struct { - currentTime time.Time - mu sync.RWMutex -} - -func NewMockTimeProvider(t time.Time) *MockTimeProvider { - return &MockTimeProvider{currentTime: t} -} - -func (m *MockTimeProvider) Now() time.Time { - m.mu.RLock() - defer m.mu.RUnlock() - return m.currentTime -} - -func (m *MockTimeProvider) SetTime(t time.Time) { - m.mu.Lock() - defer m.mu.Unlock() - m.currentTime = t -} From 6afe120a0ef24c22446d682daa683706244ac798 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 22:07:10 +0200 Subject: [PATCH 08/22] Implement more manager tests --- pkg/manager/manager_test.go | 14 +- pkg/manager/operations_test.go | 319 ++++++++++++++++++++++++++++++++- 2 files changed, 325 insertions(+), 8 deletions(-) diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go index b85c691..ed0ed25 100644 --- a/pkg/manager/manager_test.go +++ b/pkg/manager/manager_test.go @@ -27,7 +27,7 @@ func TestNewInstanceManager(t *testing.T) { } } -func TestPersistence_SaveAndLoad(t *testing.T) { +func TestManager_PersistsAndLoadsInstances(t *testing.T) { tempDir := t.TempDir() cfg := createPersistenceConfig(tempDir) backendConfig := createBackendConfig() @@ -68,7 +68,7 @@ func TestPersistence_SaveAndLoad(t *testing.T) { } } -func TestPersistence_DeleteRemovesFile(t *testing.T) { +func TestDeleteInstance_RemovesPersistenceFile(t *testing.T) { tempDir := t.TempDir() cfg := createPersistenceConfig(tempDir) backendConfig := createBackendConfig() @@ -175,12 +175,15 @@ func TestShutdown(t *testing.T) { // Helper functions for test configuration func createBackendConfig() config.BackendConfig { + // Use 'sleep' as a test command instead of 'llama-server' + // This allows tests to run in CI environments without requiring actual LLM binaries + // The sleep command will be invoked with model paths and other args, which it ignores return config.BackendConfig{ LlamaCpp: config.BackendSettings{ - Command: "llama-server", + Command: "sleep", }, MLX: config.BackendSettings{ - Command: "mlx_lm.server", + Command: "sleep", }, } } @@ -199,6 +202,7 @@ func createTestManager() manager.InstanceManager { PortRange: [2]int{8000, 9000}, LogsDir: "/tmp/test", MaxInstances: 10, + MaxRunningInstances: 10, DefaultAutoRestart: true, DefaultMaxRestarts: 3, DefaultRestartDelay: 5, @@ -207,7 +211,7 @@ func createTestManager() manager.InstanceManager { return manager.New(createBackendConfig(), cfg, map[string]config.NodeConfig{}, "main") } -func TestAutoRestartDisabledInstanceStatus(t *testing.T) { +func TestManager_DoesNotAutoRestartWhenDisabled(t *testing.T) { tempDir := t.TempDir() cfg := createPersistenceConfig(tempDir) backendConfig := createBackendConfig() diff --git a/pkg/manager/operations_test.go b/pkg/manager/operations_test.go index d078145..dc3ae1e 100644 --- a/pkg/manager/operations_test.go +++ 
b/pkg/manager/operations_test.go @@ -101,7 +101,7 @@ func TestCreateInstance_MaxInstancesLimit(t *testing.T) { } } -func TestPort_AutoAssignment(t *testing.T) { +func TestCreateInstance_AutoAssignsPort(t *testing.T) { manager := createTestManager() options := &instance.Options{ @@ -124,7 +124,7 @@ func TestPort_AutoAssignment(t *testing.T) { } } -func TestPort_ConflictDetection(t *testing.T) { +func TestCreateInstance_PortConflict(t *testing.T) { manager := createTestManager() options1 := &instance.Options{ @@ -162,7 +162,7 @@ func TestPort_ConflictDetection(t *testing.T) { } } -func TestPort_ReleaseOnDeletion(t *testing.T) { +func TestDeleteInstance_ReleasesPort(t *testing.T) { manager := createTestManager() options := &instance.Options{ @@ -336,3 +336,316 @@ func TestInstanceOperations_NonExistentInstance(t *testing.T) { t.Errorf("Expected 'not found' error, got: %v", err) } } + +func TestStartInstance(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + inst, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + if inst.IsRunning() { + t.Error("New instance should not be running") + } + + // Start the instance + started, err := mgr.StartInstance("test-instance") + if err != nil { + t.Fatalf("StartInstance failed: %v", err) + } + + if !started.IsRunning() { + t.Error("Instance should be running after start") + } +} + +func TestStartInstance_Idempotent(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + inst, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + // Start the instance + _, err = mgr.StartInstance("test-instance") + if err != nil { + t.Fatalf("First StartInstance failed: %v", err) + } + + // Start again - should be idempotent + started, err := mgr.StartInstance("test-instance") + if err != nil { + t.Fatalf("Second StartInstance failed: %v", err) + } + + if !started.IsRunning() { + t.Error("Instance should still be running") + } + + if inst.GetStatus() != instance.Running { + t.Errorf("Expected Running status, got %v", inst.GetStatus()) + } +} + +func TestStopInstance(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + _, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + _, err = mgr.StartInstance("test-instance") + if err != nil { + t.Fatalf("StartInstance failed: %v", err) + } + + // Stop the instance + stopped, err := mgr.StopInstance("test-instance") + if err != nil { + t.Fatalf("StopInstance failed: %v", err) + } + + if stopped.IsRunning() { + t.Error("Instance should not be running after stop") + } +} + +func TestStopInstance_Idempotent(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + 
BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + inst, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + // Stop when already stopped - should be idempotent + stopped, err := mgr.StopInstance("test-instance") + if err != nil { + t.Fatalf("StopInstance failed: %v", err) + } + + if stopped.IsRunning() { + t.Error("Instance should not be running") + } + + if inst.GetStatus() != instance.Stopped { + t.Errorf("Expected Stopped status, got %v", inst.GetStatus()) + } +} + +func TestRestartInstance(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + _, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + _, err = mgr.StartInstance("test-instance") + if err != nil { + t.Fatalf("StartInstance failed: %v", err) + } + + // Restart the instance + restarted, err := mgr.RestartInstance("test-instance") + if err != nil { + t.Fatalf("RestartInstance failed: %v", err) + } + + if !restarted.IsRunning() { + t.Error("Instance should be running after restart") + } +} + +func TestDeleteInstance_RunningInstanceFails(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + _, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + _, err = mgr.StartInstance("test-instance") + if err != nil { + t.Fatalf("StartInstance failed: %v", err) + } + + // Should fail to delete running instance + err = mgr.DeleteInstance("test-instance") + if err == nil { + t.Error("Expected error when deleting running instance") + } +} + +func TestUpdateInstance_OnRunningInstance(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8080, + }, + }, + } + + _, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + _, err = mgr.StartInstance("test-instance") + if err != nil { + t.Fatalf("StartInstance failed: %v", err) + } + + // Update running instance with new model + newOptions := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/new-model.gguf", + Port: 8080, + }, + }, + } + + updated, err := mgr.UpdateInstance("test-instance", newOptions) + if err != nil { + t.Fatalf("UpdateInstance failed: %v", err) + } + + // Should still be running after update + if !updated.IsRunning() { + t.Error("Instance should be running after update") + } + + if updated.GetOptions().BackendOptions.LlamaServerOptions.Model != "/path/to/new-model.gguf" { + t.Errorf("Expected model to be updated") + } +} + +func TestUpdateInstance_PortChange(t *testing.T) { + mgr := createTestManager() + defer mgr.Shutdown() + 
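// ---- Editor's note (not part of the patch) ----------------------------
// This test pins down the port-accounting contract of UpdateInstance:
// changing Port from 8080 to 8081 must release 8080 back to the pool so a
// later CreateInstance can claim it. In sketch form (withPort is a
// hypothetical helper, not part of the test suite):
//
//	inst, _ := mgr.CreateInstance("a", withPort(8080))
//	_, _ = mgr.UpdateInstance("a", withPort(8081))    // frees 8080
//	_, err := mgr.CreateInstance("b", withPort(8080)) // must succeed
// -----------------------------------------------------------------------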
+ options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8080, + }, + }, + } + + inst, err := mgr.CreateInstance("test-instance", options) + if err != nil { + t.Fatalf("CreateInstance failed: %v", err) + } + + if inst.GetPort() != 8080 { + t.Errorf("Expected port 8080, got %d", inst.GetPort()) + } + + // Update with new port + newOptions := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8081, + }, + }, + } + + updated, err := mgr.UpdateInstance("test-instance", newOptions) + if err != nil { + t.Fatalf("UpdateInstance failed: %v", err) + } + + if updated.GetPort() != 8081 { + t.Errorf("Expected port 8081, got %d", updated.GetPort()) + } + + // Old port should be released - try to create new instance with old port + options2 := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model2.gguf", + Port: 8080, + }, + }, + } + + _, err = mgr.CreateInstance("test-instance-2", options2) + if err != nil { + t.Errorf("Should be able to use old port 8080: %v", err) + } +} From c44712e8130d602f6696336b67a5d69e3b2fcf9f Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 22:15:12 +0200 Subject: [PATCH 09/22] Remove redundant instance manager tests --- pkg/manager/manager_test.go | 15 ------- pkg/manager/operations_test.go | 80 ---------------------------------- 2 files changed, 95 deletions(-) diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go index ed0ed25..34f63cf 100644 --- a/pkg/manager/manager_test.go +++ b/pkg/manager/manager_test.go @@ -12,21 +12,6 @@ import ( "testing" ) -func TestNewInstanceManager(t *testing.T) { - mgr := createTestManager() - if mgr == nil { - t.Fatal("NewInstanceManager returned nil") - } - - instances, err := mgr.ListInstances() - if err != nil { - t.Fatalf("ListInstances failed: %v", err) - } - if len(instances) != 0 { - t.Errorf("Expected empty instance list, got %d instances", len(instances)) - } -} - func TestManager_PersistsAndLoadsInstances(t *testing.T) { tempDir := t.TempDir() cfg := createPersistenceConfig(tempDir) diff --git a/pkg/manager/operations_test.go b/pkg/manager/operations_test.go index dc3ae1e..976fb39 100644 --- a/pkg/manager/operations_test.go +++ b/pkg/manager/operations_test.go @@ -192,32 +192,6 @@ func TestDeleteInstance_ReleasesPort(t *testing.T) { } } -func TestGetInstance(t *testing.T) { - manager := createTestManager() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - created, err := manager.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - retrieved, err := manager.GetInstance("test-instance") - if err != nil { - t.Fatalf("GetInstance failed: %v", err) - } - if retrieved.Name != created.Name { - t.Errorf("Expected name %q, got %q", created.Name, retrieved.Name) - } -} - func TestUpdateInstance(t *testing.T) { manager := createTestManager() @@ -255,60 +229,6 @@ func TestUpdateInstance(t *testing.T) { } } -func TestListInstances(t *testing.T) { - manager := createTestManager() - - 
options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - _, err := manager.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - instances, err := manager.ListInstances() - if err != nil { - t.Fatalf("ListInstances failed: %v", err) - } - if len(instances) != 1 { - t.Errorf("Expected 1 instance, got %d", len(instances)) - } -} - -func TestDeleteInstance(t *testing.T) { - manager := createTestManager() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - _, err := manager.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - err = manager.DeleteInstance("test-instance") - if err != nil { - t.Fatalf("DeleteInstance failed: %v", err) - } - - _, err = manager.GetInstance("test-instance") - if err == nil { - t.Error("Instance should not exist after deletion") - } -} - func TestInstanceOperations_NonExistentInstance(t *testing.T) { manager := createTestManager() From 2b51b4a47feb77e5f013ef95903e3be680289e7d Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 22:30:08 +0200 Subject: [PATCH 10/22] Simplify manager tests --- pkg/manager/manager_test.go | 78 --------- pkg/manager/operations_test.go | 304 +-------------------------------- 2 files changed, 6 insertions(+), 376 deletions(-) diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go index 34f63cf..ed9cdcb 100644 --- a/pkg/manager/manager_test.go +++ b/pkg/manager/manager_test.go @@ -134,30 +134,6 @@ func TestConcurrentAccess(t *testing.T) { } } -func TestShutdown(t *testing.T) { - mgr := createTestManager() - - // Create test instance - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - _, err := mgr.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - // Shutdown should not panic - mgr.Shutdown() - - // Multiple shutdowns should not panic - mgr.Shutdown() -} - // Helper functions for test configuration func createBackendConfig() config.BackendConfig { // Use 'sleep' as a test command instead of 'llama-server' @@ -195,57 +171,3 @@ func createTestManager() manager.InstanceManager { } return manager.New(createBackendConfig(), cfg, map[string]config.NodeConfig{}, "main") } - -func TestManager_DoesNotAutoRestartWhenDisabled(t *testing.T) { - tempDir := t.TempDir() - cfg := createPersistenceConfig(tempDir) - backendConfig := createBackendConfig() - - // Create first manager and instance with auto-restart disabled - manager1 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") - - autoRestart := false - options := &instance.Options{ - AutoRestart: &autoRestart, - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - Port: 8080, - }, - }, - } - - inst, err := manager1.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - // Simulate instance being in running state when persisted - // (this 
would happen if the instance was running when llamactl was stopped) - inst.SetStatus(instance.Running) - - // Shutdown first manager - manager1.Shutdown() - - // Create second manager (simulating restart of llamactl) - manager2 := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") - - // Get the loaded instance - loadedInst, err := manager2.GetInstance("test-instance") - if err != nil { - t.Fatalf("GetInstance failed: %v", err) - } - - // The instance should be marked as Stopped, not Running - // because auto-restart is disabled - if loadedInst.IsRunning() { - t.Errorf("Expected instance with auto-restart disabled to be stopped after manager restart, but it was running") - } - - if loadedInst.GetStatus() != instance.Stopped { - t.Errorf("Expected instance status to be Stopped, got %v", loadedInst.GetStatus()) - } - - manager2.Shutdown() -} diff --git a/pkg/manager/operations_test.go b/pkg/manager/operations_test.go index 976fb39..47396fa 100644 --- a/pkg/manager/operations_test.go +++ b/pkg/manager/operations_test.go @@ -9,36 +9,7 @@ import ( "testing" ) -func TestCreateInstance_Success(t *testing.T) { - manager := createTestManager() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - Port: 8080, - }, - }, - } - - inst, err := manager.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - if inst.Name != "test-instance" { - t.Errorf("Expected instance name 'test-instance', got %q", inst.Name) - } - if inst.GetStatus() != instance.Stopped { - t.Error("New instance should not be running") - } - if inst.GetPort() != 8080 { - t.Errorf("Expected port 8080, got %d", inst.GetPort()) - } -} - -func TestCreateInstance_DuplicateName(t *testing.T) { +func TestCreateInstance_FailsWithDuplicateName(t *testing.T) { mngr := createTestManager() options := &instance.Options{ BackendOptions: backends.Options{ @@ -64,7 +35,7 @@ func TestCreateInstance_DuplicateName(t *testing.T) { } } -func TestCreateInstance_MaxInstancesLimit(t *testing.T) { +func TestCreateInstance_FailsWhenMaxInstancesReached(t *testing.T) { backendConfig := config.BackendConfig{ LlamaCpp: config.BackendSettings{ Command: "llama-server", @@ -101,30 +72,7 @@ func TestCreateInstance_MaxInstancesLimit(t *testing.T) { } } -func TestCreateInstance_AutoAssignsPort(t *testing.T) { - manager := createTestManager() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - inst, err := manager.CreateInstance("instance1", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - port := inst.GetPort() - if port < 8000 || port > 9000 { - t.Errorf("Expected port in range 8000-9000, got %d", port) - } -} - -func TestCreateInstance_PortConflict(t *testing.T) { +func TestCreateInstance_FailsWithPortConflict(t *testing.T) { manager := createTestManager() options1 := &instance.Options{ @@ -162,74 +110,7 @@ func TestCreateInstance_PortConflict(t *testing.T) { } } -func TestDeleteInstance_ReleasesPort(t *testing.T) { - manager := createTestManager() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - Port: 
8080, - }, - }, - } - - _, err := manager.CreateInstance("port-test", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - err = manager.DeleteInstance("port-test") - if err != nil { - t.Fatalf("DeleteInstance failed: %v", err) - } - - // Should be able to create new instance with same port - _, err = manager.CreateInstance("new-port-test", options) - if err != nil { - t.Errorf("Expected to reuse port after deletion, got error: %v", err) - } -} - -func TestUpdateInstance(t *testing.T) { - manager := createTestManager() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - Port: 8080, - }, - }, - } - - _, err := manager.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - newOptions := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/new-model.gguf", - Port: 8081, - }, - }, - } - - updated, err := manager.UpdateInstance("test-instance", newOptions) - if err != nil { - t.Fatalf("UpdateInstance failed: %v", err) - } - if updated.GetOptions().BackendOptions.LlamaServerOptions.Model != "/path/to/new-model.gguf" { - t.Errorf("Expected model '/path/to/new-model.gguf', got %q", updated.GetOptions().BackendOptions.LlamaServerOptions.Model) - } -} - -func TestInstanceOperations_NonExistentInstance(t *testing.T) { +func TestInstanceOperations_FailWithNonExistentInstance(t *testing.T) { manager := createTestManager() options := &instance.Options{ @@ -257,179 +138,6 @@ func TestInstanceOperations_NonExistentInstance(t *testing.T) { } } -func TestStartInstance(t *testing.T) { - mgr := createTestManager() - defer mgr.Shutdown() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - inst, err := mgr.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - if inst.IsRunning() { - t.Error("New instance should not be running") - } - - // Start the instance - started, err := mgr.StartInstance("test-instance") - if err != nil { - t.Fatalf("StartInstance failed: %v", err) - } - - if !started.IsRunning() { - t.Error("Instance should be running after start") - } -} - -func TestStartInstance_Idempotent(t *testing.T) { - mgr := createTestManager() - defer mgr.Shutdown() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - inst, err := mgr.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - // Start the instance - _, err = mgr.StartInstance("test-instance") - if err != nil { - t.Fatalf("First StartInstance failed: %v", err) - } - - // Start again - should be idempotent - started, err := mgr.StartInstance("test-instance") - if err != nil { - t.Fatalf("Second StartInstance failed: %v", err) - } - - if !started.IsRunning() { - t.Error("Instance should still be running") - } - - if inst.GetStatus() != instance.Running { - t.Errorf("Expected Running status, got %v", inst.GetStatus()) - } -} - -func TestStopInstance(t *testing.T) { - mgr := 
createTestManager() - defer mgr.Shutdown() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - _, err := mgr.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - _, err = mgr.StartInstance("test-instance") - if err != nil { - t.Fatalf("StartInstance failed: %v", err) - } - - // Stop the instance - stopped, err := mgr.StopInstance("test-instance") - if err != nil { - t.Fatalf("StopInstance failed: %v", err) - } - - if stopped.IsRunning() { - t.Error("Instance should not be running after stop") - } -} - -func TestStopInstance_Idempotent(t *testing.T) { - mgr := createTestManager() - defer mgr.Shutdown() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - inst, err := mgr.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - // Stop when already stopped - should be idempotent - stopped, err := mgr.StopInstance("test-instance") - if err != nil { - t.Fatalf("StopInstance failed: %v", err) - } - - if stopped.IsRunning() { - t.Error("Instance should not be running") - } - - if inst.GetStatus() != instance.Stopped { - t.Errorf("Expected Stopped status, got %v", inst.GetStatus()) - } -} - -func TestRestartInstance(t *testing.T) { - mgr := createTestManager() - defer mgr.Shutdown() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - _, err := mgr.CreateInstance("test-instance", options) - if err != nil { - t.Fatalf("CreateInstance failed: %v", err) - } - - _, err = mgr.StartInstance("test-instance") - if err != nil { - t.Fatalf("StartInstance failed: %v", err) - } - - // Restart the instance - restarted, err := mgr.RestartInstance("test-instance") - if err != nil { - t.Fatalf("RestartInstance failed: %v", err) - } - - if !restarted.IsRunning() { - t.Error("Instance should be running after restart") - } -} - func TestDeleteInstance_RunningInstanceFails(t *testing.T) { mgr := createTestManager() defer mgr.Shutdown() @@ -460,7 +168,7 @@ func TestDeleteInstance_RunningInstanceFails(t *testing.T) { } } -func TestUpdateInstance_OnRunningInstance(t *testing.T) { +func TestUpdateInstance(t *testing.T) { mgr := createTestManager() defer mgr.Shutdown() @@ -510,7 +218,7 @@ func TestUpdateInstance_OnRunningInstance(t *testing.T) { } } -func TestUpdateInstance_PortChange(t *testing.T) { +func TestUpdateInstance_ReleasesOldPort(t *testing.T) { mgr := createTestManager() defer mgr.Shutdown() From bac18b56269ed251be0d4fec286d58ab528c3238 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 22:37:10 +0200 Subject: [PATCH 11/22] Unexport factory functions --- pkg/manager/lifecycle.go | 4 ++-- pkg/manager/manager.go | 10 +++++----- pkg/manager/persistence.go | 4 ++-- pkg/manager/ports.go | 4 ++-- pkg/manager/registry.go | 4 ++-- pkg/manager/remote.go | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pkg/manager/lifecycle.go b/pkg/manager/lifecycle.go index 73cb073..cc147df 100644 --- a/pkg/manager/lifecycle.go +++ b/pkg/manager/lifecycle.go @@ -23,8 +23,8 @@ type lifecycleManager 
struct { shutdownOnce sync.Once } -// NewLifecycleManager creates a new lifecycle manager. -func NewLifecycleManager( +// newLifecycleManager creates a new lifecycle manager. +func newLifecycleManager( registry *instanceRegistry, manager InstanceManager, checkInterval time.Duration, diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 551b1dd..e2c39de 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -52,23 +52,23 @@ func New(backendsConfig config.BackendConfig, instancesConfig config.InstancesCo } // Initialize components - registry := NewInstanceRegistry() + registry := newInstanceRegistry() // Initialize port allocator portRange := instancesConfig.PortRange - ports, err := NewPortAllocator(portRange[0], portRange[1]) + ports, err := newPortAllocator(portRange[0], portRange[1]) if err != nil { log.Fatalf("Failed to create port allocator: %v", err) } // Initialize persistence - persistence, err := NewInstancePersister(instancesConfig.InstancesDir) + persistence, err := newInstancePersister(instancesConfig.InstancesDir) if err != nil { log.Fatalf("Failed to create instance persister: %v", err) } // Initialize remote manager - remote := NewRemoteManager(nodesConfig, 30*time.Second) + remote := newRemoteManager(nodesConfig, 30*time.Second) // Create manager instance im := &instanceManager{ @@ -83,7 +83,7 @@ func New(backendsConfig config.BackendConfig, instancesConfig config.InstancesCo // Initialize lifecycle manager (needs reference to manager for Stop/Evict operations) checkInterval := time.Duration(instancesConfig.TimeoutCheckInterval) * time.Minute - im.lifecycle = NewLifecycleManager(registry, im, checkInterval, true) + im.lifecycle = newLifecycleManager(registry, im, checkInterval, true) // Load existing instances from disk if err := im.loadInstances(); err != nil { diff --git a/pkg/manager/persistence.go b/pkg/manager/persistence.go index 21f4588..e9f2238 100644 --- a/pkg/manager/persistence.go +++ b/pkg/manager/persistence.go @@ -18,9 +18,9 @@ type instancePersister struct { enabled bool } -// NewInstancePersister creates a new instance persister. +// newInstancePersister creates a new instance persister. // If instancesDir is empty, persistence is disabled. -func NewInstancePersister(instancesDir string) (*instancePersister, error) { +func newInstancePersister(instancesDir string) (*instancePersister, error) { if instancesDir == "" { return &instancePersister{ enabled: false, diff --git a/pkg/manager/ports.go b/pkg/manager/ports.go index c5aac3e..18a3787 100644 --- a/pkg/manager/ports.go +++ b/pkg/manager/ports.go @@ -23,9 +23,9 @@ type portAllocator struct { rangeSize int } -// NewPortAllocator creates a new port allocator for the given port range. +// newPortAllocator creates a new port allocator for the given port range. // Returns an error if the port range is invalid. -func NewPortAllocator(minPort, maxPort int) (*portAllocator, error) { +func newPortAllocator(minPort, maxPort int) (*portAllocator, error) { if minPort <= 0 || maxPort <= 0 { return nil, fmt.Errorf("invalid port range: min=%d, max=%d (must be > 0)", minPort, maxPort) } diff --git a/pkg/manager/registry.go b/pkg/manager/registry.go index c8758f9..f0fd797 100644 --- a/pkg/manager/registry.go +++ b/pkg/manager/registry.go @@ -14,8 +14,8 @@ type instanceRegistry struct { running sync.Map // map[string]struct{} - lock-free for status checks } -// NewInstanceRegistry creates a new instance registry. 
-func NewInstanceRegistry() *instanceRegistry { +// newInstanceRegistry creates a new instance registry. +func newInstanceRegistry() *instanceRegistry { return &instanceRegistry{ instances: make(map[string]*instance.Instance), } diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go index 50d2463..f149fa9 100644 --- a/pkg/manager/remote.go +++ b/pkg/manager/remote.go @@ -23,8 +23,8 @@ type remoteManager struct { instanceToNode map[string]*config.NodeConfig // instance name -> node config } -// NewRemoteManager creates a new remote manager. -func NewRemoteManager(nodes map[string]config.NodeConfig, timeout time.Duration) *remoteManager { +// newRemoteManager creates a new remote manager. +func newRemoteManager(nodes map[string]config.NodeConfig, timeout time.Duration) *remoteManager { if timeout <= 0 { timeout = 30 * time.Second } From 9bb106a1ce6ffe5875c7a5bf8557eb8bb3546c2d Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 22:38:00 +0200 Subject: [PATCH 12/22] Remove deprecated operation mutex in instanceManager --- pkg/manager/manager.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index e2c39de..4cbbf10 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -40,8 +40,7 @@ type instanceManager struct { localNodeName string // Name of the local node // Synchronization - operationMu sync.Mutex // DEPRECATED: Use instanceLocks for per-instance operations - instanceLocks sync.Map // map[string]*sync.Mutex - per-instance locks for concurrent operations + instanceLocks sync.Map // map[string]*sync.Mutex - per-instance locks for concurrent operations shutdownOnce sync.Once } From c6ebe47511892817b34e6e6c6c94aba9b6d8ebbd Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 22:47:41 +0200 Subject: [PATCH 13/22] Fix path validation false positive --- pkg/manager/persistence.go | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pkg/manager/persistence.go b/pkg/manager/persistence.go index e9f2238..38c1c82 100644 --- a/pkg/manager/persistence.go +++ b/pkg/manager/persistence.go @@ -49,14 +49,15 @@ func (p *instancePersister) save(inst *instance.Instance) error { } // Validate instance name to prevent path traversal - if err := p.validateInstanceName(inst.Name); err != nil { + validatedName, err := p.validateInstanceName(inst.Name) + if err != nil { return err } p.mu.Lock() defer p.mu.Unlock() - instancePath := filepath.Join(p.instancesDir, inst.Name+".json") + instancePath := filepath.Join(p.instancesDir, validatedName+".json") tempPath := instancePath + ".tmp" // Serialize instance to JSON @@ -106,14 +107,15 @@ func (p *instancePersister) delete(name string) error { return nil } - if err := p.validateInstanceName(name); err != nil { + validatedName, err := p.validateInstanceName(name) + if err != nil { return err } p.mu.Lock() defer p.mu.Unlock() - instancePath := filepath.Join(p.instancesDir, name+".json") + instancePath := filepath.Join(p.instancesDir, validatedName+".json") if err := os.Remove(instancePath); err != nil { if os.IsNotExist(err) { @@ -199,17 +201,18 @@ func (p *instancePersister) loadInstanceFile(name, path string) (*instance.Insta } // validateInstanceName ensures the instance name is safe for filesystem operations. -func (p *instancePersister) validateInstanceName(name string) error { +// Returns the validated name if valid, or an error if invalid. 
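+// For example, "my-model" is returned unchanged, while "../my-model" or
+// "models/a" fail the separator check below (names shown for illustration).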
+func (p *instancePersister) validateInstanceName(name string) (string, error) { if name == "" { - return fmt.Errorf("instance name cannot be empty") + return "", fmt.Errorf("instance name cannot be empty") } cleaned := filepath.Clean(name) // After cleaning, name should not contain any path separators if cleaned != name || strings.Contains(cleaned, string(filepath.Separator)) { - return fmt.Errorf("invalid instance name: %s", name) + return "", fmt.Errorf("invalid instance name: %s", name) } - return nil + return cleaned, nil } From bc025bbe28d754bdc4fb359a78c5431c28b8dd17 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 22:57:23 +0200 Subject: [PATCH 14/22] Fix instance name validation --- pkg/manager/manager_test.go | 79 +++++++++++++++++++++++++++++++++++++ pkg/manager/persistence.go | 17 +++++--- 2 files changed, 90 insertions(+), 6 deletions(-) diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go index ed9cdcb..8a1b069 100644 --- a/pkg/manager/manager_test.go +++ b/pkg/manager/manager_test.go @@ -134,6 +134,85 @@ func TestConcurrentAccess(t *testing.T) { } } +// TestCreateInstance_RejectsPathTraversal tests that instance names with path traversal attempts are rejected +func TestCreateInstance_RejectsPathTraversal(t *testing.T) { + tempDir := t.TempDir() + cfg := createPersistenceConfig(tempDir) + backendConfig := createBackendConfig() + mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + Port: 8080, + }, + }, + } + + // Test cases for malicious instance names + maliciousNames := []string{ + "../../etc/passwd", // Classic path traversal + "../../../etc/shadow", // Multiple parent directory references + "/etc/passwd", // Absolute path + "foo/../bar", // Parent reference in middle + ".../.../", // Variation with multiple dots + ".hidden", // Hidden file + "foo/bar", // Forward slash + "foo\\bar", // Backslash (Windows-style) + "test..instance", // Double dots not at path boundary (should fail) + "normal-name/../escape", // Normal-looking name with traversal + } + + for _, name := range maliciousNames { + t.Run(name, func(t *testing.T) { + _, err := mgr.CreateInstance(name, options) + if err == nil { + t.Errorf("Expected error when creating instance with malicious name %q, but got none", name) + } + }) + } +} + +// TestCreateInstance_AcceptsValidNames tests that valid instance names are accepted +func TestCreateInstance_AcceptsValidNames(t *testing.T) { + tempDir := t.TempDir() + cfg := createPersistenceConfig(tempDir) + backendConfig := createBackendConfig() + mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") + defer mgr.Shutdown() + + options := &instance.Options{ + BackendOptions: backends.Options{ + BackendType: backends.BackendTypeLlamaCpp, + LlamaServerOptions: &backends.LlamaServerOptions{ + Model: "/path/to/model.gguf", + }, + }, + } + + // Valid instance names + validNames := []string{ + "test-instance", + "my_instance", + "instance123", + "test-name-with-dashes", + "name_with_underscores", + } + + for _, name := range validNames { + t.Run(name, func(t *testing.T) { + _, err := mgr.CreateInstance(name, options) + if err != nil { + t.Errorf("Expected instance with valid name %q to be created, but got error: %v", name, err) + } + // Clean up + mgr.DeleteInstance(name) + }) + } +} + // Helper 
functions for test configuration func createBackendConfig() config.BackendConfig { // Use 'sleep' as a test command instead of 'llama-server' diff --git a/pkg/manager/persistence.go b/pkg/manager/persistence.go index 38c1c82..01fe11f 100644 --- a/pkg/manager/persistence.go +++ b/pkg/manager/persistence.go @@ -207,12 +207,17 @@ func (p *instancePersister) validateInstanceName(name string) (string, error) { return "", fmt.Errorf("instance name cannot be empty") } - cleaned := filepath.Clean(name) - - // After cleaning, name should not contain any path separators - if cleaned != name || strings.Contains(cleaned, string(filepath.Separator)) { - return "", fmt.Errorf("invalid instance name: %s", name) + // Check for path separators and parent directory references + // This prevents path traversal attacks + if strings.Contains(name, "/") || strings.Contains(name, "\\") || strings.Contains(name, "..") { + return "", fmt.Errorf("invalid instance name: %s (cannot contain path separators or '..')", name) } - return cleaned, nil + // Additional check: ensure the name doesn't start with a dot (hidden files) + // or contain any other suspicious characters + if strings.HasPrefix(name, ".") { + return "", fmt.Errorf("invalid instance name: %s (cannot start with '.')", name) + } + + return name, nil } From e0289ff42f7dcc8594fc7bd2a4bab58da0e88e60 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Tue, 21 Oct 2025 23:16:20 +0200 Subject: [PATCH 15/22] Add instance name validation for URL safety and corresponding tests --- pkg/manager/remote.go | 90 ++++++++++++++++++++++++++++++++++---- pkg/manager/remote_test.go | 68 ++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 8 deletions(-) create mode 100644 pkg/manager/remote_test.go diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go index f149fa9..2dda681 100644 --- a/pkg/manager/remote.go +++ b/pkg/manager/remote.go @@ -9,6 +9,8 @@ import ( "llamactl/pkg/config" "llamactl/pkg/instance" "net/http" + "net/url" + "strings" "sync" "time" ) @@ -80,6 +82,38 @@ func (rm *remoteManager) removeInstance(instanceName string) { // --- HTTP request helpers --- +// validateInstanceNameForURL ensures the instance name is safe for use in URLs. 
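+// For example, "node-a1" passes every check below, while "foo/bar", "a?b=c"
+// and "test instance" are all rejected (names shown for illustration).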
+func validateInstanceNameForURL(name string) (string, error) { + if name == "" { + return "", fmt.Errorf("instance name cannot be empty") + } + + // Check for path separators and parent directory references + // This prevents path traversal and SSRF attacks + if strings.Contains(name, "/") || strings.Contains(name, "\\") || strings.Contains(name, "..") { + return "", fmt.Errorf("invalid instance name: %s (cannot contain path separators or '..')", name) + } + + // Check for URL-unsafe characters that could be used for injection + // Reject names with special URL characters that could allow URL manipulation + unsafeChars := []string{"?", "&", "#", "%", "=", "@", ":", " "} + for _, char := range unsafeChars { + if strings.Contains(name, char) { + return "", fmt.Errorf("invalid instance name: %s (cannot contain URL-unsafe characters)", name) + } + } + + // Additional validation: use url.PathEscape to ensure the name doesn't change + // when URL-encoded (indicating it contains characters that need encoding) + // This catches any other characters that could cause issues in URLs + escaped := url.PathEscape(name) + if escaped != name { + return "", fmt.Errorf("invalid instance name: %s (contains characters requiring URL encoding)", name) + } + + return name, nil +} + // makeRemoteRequest creates and executes an HTTP request to a remote node with context support. func (rm *remoteManager) makeRemoteRequest(ctx context.Context, nodeConfig *config.NodeConfig, method, path string, body any) (*http.Response, error) { var reqBody io.Reader @@ -139,7 +173,12 @@ func parseRemoteResponse(resp *http.Response, result any) error { // CreateInstance creates a new instance on a remote node. func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - path := fmt.Sprintf("%s%s/", apiBasePath, name) + validatedName, err := validateInstanceNameForURL(name) + if err != nil { + return nil, err + } + + path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, opts) if err != nil { @@ -156,7 +195,12 @@ func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeCo // GetInstance retrieves an instance by name from a remote node. func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("%s%s/", apiBasePath, name) + validatedName, err := validateInstanceNameForURL(name) + if err != nil { + return nil, err + } + + path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) if err != nil { return nil, err @@ -172,7 +216,12 @@ func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfi // UpdateInstance updates an existing instance on a remote node. func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - path := fmt.Sprintf("%s%s/", apiBasePath, name) + validatedName, err := validateInstanceNameForURL(name) + if err != nil { + return nil, err + } + + path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) resp, err := rm.makeRemoteRequest(ctx, node, "PUT", path, opts) if err != nil { @@ -189,7 +238,12 @@ func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeCo // DeleteInstance deletes an instance from a remote node. 
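 // The request issued below is DELETE {apiBasePath}{name}/ against the remote
 // node; apiBasePath is defined elsewhere in this package (shape shown for illustration).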
func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { - path := fmt.Sprintf("%s%s/", apiBasePath, name) + validatedName, err := validateInstanceNameForURL(name) + if err != nil { + return err + } + + path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil) if err != nil { return err @@ -200,7 +254,12 @@ func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeCo // StartInstance starts an instance on a remote node. func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("%s%s/start", apiBasePath, name) + validatedName, err := validateInstanceNameForURL(name) + if err != nil { + return nil, err + } + + path := fmt.Sprintf("%s%s/start", apiBasePath, validatedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -216,7 +275,12 @@ func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeCon // StopInstance stops an instance on a remote node. func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("%s%s/stop", apiBasePath, name) + validatedName, err := validateInstanceNameForURL(name) + if err != nil { + return nil, err + } + + path := fmt.Sprintf("%s%s/stop", apiBasePath, validatedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -232,7 +296,12 @@ func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConf // RestartInstance restarts an instance on a remote node. func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - path := fmt.Sprintf("%s%s/restart", apiBasePath, name) + validatedName, err := validateInstanceNameForURL(name) + if err != nil { + return nil, err + } + + path := fmt.Sprintf("%s%s/restart", apiBasePath, validatedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -248,7 +317,12 @@ func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeC // GetInstanceLogs retrieves logs for an instance from a remote node. 
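 // The requested line count is forwarded as a query parameter, e.g.
 // .../logs?lines=100 (the value 100 is only an example).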
func (rm *remoteManager) getInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) {
-	path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, name, numLines)
+	validatedName, err := validateInstanceNameForURL(name)
+	if err != nil {
+		return "", err
+	}
+
+	path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, validatedName, numLines)
 	resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil)
 	if err != nil {
 		return "", err
diff --git a/pkg/manager/remote_test.go b/pkg/manager/remote_test.go
new file mode 100644
index 0000000..f628921
--- /dev/null
+++ b/pkg/manager/remote_test.go
@@ -0,0 +1,68 @@
+package manager
+
+import (
+	"testing"
+)
+
+// TestValidateInstanceNameForURL tests the URL validation function
+func TestValidateInstanceNameForURL(t *testing.T) {
+	tests := []struct {
+		name        string
+		input       string
+		shouldError bool
+	}{
+		// Valid names
+		{"valid simple name", "test-instance", false},
+		{"valid with underscores", "my_instance", false},
+		{"valid with numbers", "instance123", false},
+		{"valid with dashes", "test-name-with-dashes", false},
+
+		// Invalid names - path traversal
+		{"path traversal with ..", "../../etc/passwd", true},
+		{"path traversal multiple", "../../../etc/shadow", true},
+		{"path traversal in middle", "foo/../bar", true},
+		{"double dots variation", ".../", true},
+
+		// Invalid names - path separators
+		{"forward slash", "foo/bar", true},
+		{"backslash", "foo\\bar", true},
+		{"absolute path", "/etc/passwd", true},
+
+		// Invalid names - URL-unsafe characters
+		{"question mark", "test?param=value", true},
+		{"ampersand", "test&param", true},
+		{"hash", "test#anchor", true},
+		{"percent", "test%20space", true},
+		{"equals", "test=value", true},
+		{"at sign", "test@example", true},
+		{"colon", "test:8080", true},
+		{"space", "test instance", true},
+
+		// Invalid names - empty
+		{"empty string", "", true},
+
+		// Invalid names - characters requiring encoding
+		{"unicode", "test\u00e9", true},
+		{"newline", "test\n", true},
+		{"tab", "test\t", true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			validatedName, err := validateInstanceNameForURL(tt.input)
+
+			if tt.shouldError {
+				if err == nil {
+					t.Errorf("Expected error for input %q, but got none", tt.input)
+				}
+			} else {
+				if err != nil {
+					t.Errorf("Expected no error for input %q, but got: %v", tt.input, err)
+				}
+				if validatedName != tt.input {
+					t.Errorf("Expected validated name to be %q, but got %q", tt.input, validatedName)
+				}
+			}
+		})
+	}
+}

From 7c2c02ab2f13c7c85f0c4dc29982a1266332a1b8 Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Tue, 21 Oct 2025 23:24:27 +0200
Subject: [PATCH 16/22] Use URL escaping instead of name validation

---
 pkg/manager/remote.go      | 98 ++++++++++----------------------------
 pkg/manager/remote_test.go | 68 --------------------------
 2 files changed, 25 insertions(+), 141 deletions(-)
 delete mode 100644 pkg/manager/remote_test.go

diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go
index 2dda681..acabcd0 100644
--- a/pkg/manager/remote.go
+++ b/pkg/manager/remote.go
@@ -10,7 +10,6 @@ import (
 	"llamactl/pkg/instance"
 	"net/http"
 	"net/url"
-	"strings"
 	"sync"
 	"time"
 )
@@ -82,38 +81,6 @@ func (rm *remoteManager) removeInstance(instanceName string) {
 
 // --- HTTP request helpers ---
 
-// validateInstanceNameForURL ensures the instance name is safe for use in URLs. 
-func validateInstanceNameForURL(name string) (string, error) { - if name == "" { - return "", fmt.Errorf("instance name cannot be empty") - } - - // Check for path separators and parent directory references - // This prevents path traversal and SSRF attacks - if strings.Contains(name, "/") || strings.Contains(name, "\\") || strings.Contains(name, "..") { - return "", fmt.Errorf("invalid instance name: %s (cannot contain path separators or '..')", name) - } - - // Check for URL-unsafe characters that could be used for injection - // Reject names with special URL characters that could allow URL manipulation - unsafeChars := []string{"?", "&", "#", "%", "=", "@", ":", " "} - for _, char := range unsafeChars { - if strings.Contains(name, char) { - return "", fmt.Errorf("invalid instance name: %s (cannot contain URL-unsafe characters)", name) - } - } - - // Additional validation: use url.PathEscape to ensure the name doesn't change - // when URL-encoded (indicating it contains characters that need encoding) - // This catches any other characters that could cause issues in URLs - escaped := url.PathEscape(name) - if escaped != name { - return "", fmt.Errorf("invalid instance name: %s (contains characters requiring URL encoding)", name) - } - - return name, nil -} - // makeRemoteRequest creates and executes an HTTP request to a remote node with context support. func (rm *remoteManager) makeRemoteRequest(ctx context.Context, nodeConfig *config.NodeConfig, method, path string, body any) (*http.Response, error) { var reqBody io.Reader @@ -173,12 +140,11 @@ func parseRemoteResponse(resp *http.Response, result any) error { // CreateInstance creates a new instance on a remote node. func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - validatedName, err := validateInstanceNameForURL(name) - if err != nil { - return nil, err - } + // URL-encode the instance name to safely include it in the URL path + // This prevents SSRF and URL injection attacks + escapedName := url.PathEscape(name) - path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) + path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, opts) if err != nil { @@ -195,12 +161,10 @@ func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeCo // GetInstance retrieves an instance by name from a remote node. func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - validatedName, err := validateInstanceNameForURL(name) - if err != nil { - return nil, err - } + // URL-encode the instance name to safely include it in the URL path + escapedName := url.PathEscape(name) - path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) + path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) if err != nil { return nil, err @@ -216,12 +180,10 @@ func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfi // UpdateInstance updates an existing instance on a remote node. 
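 // As in createInstance above, url.PathEscape confines the name to a single
 // path segment: for example, "a/b" becomes "a%2Fb" and "a b" becomes "a%20b".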
func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - validatedName, err := validateInstanceNameForURL(name) - if err != nil { - return nil, err - } + // URL-encode the instance name to safely include it in the URL path + escapedName := url.PathEscape(name) - path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) + path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "PUT", path, opts) if err != nil { @@ -238,12 +200,10 @@ func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeCo // DeleteInstance deletes an instance from a remote node. func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { - validatedName, err := validateInstanceNameForURL(name) - if err != nil { - return err - } + // URL-encode the instance name to safely include it in the URL path + escapedName := url.PathEscape(name) - path := fmt.Sprintf("%s%s/", apiBasePath, validatedName) + path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil) if err != nil { return err @@ -254,12 +214,10 @@ func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeCo // StartInstance starts an instance on a remote node. func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - validatedName, err := validateInstanceNameForURL(name) - if err != nil { - return nil, err - } + // URL-encode the instance name to safely include it in the URL path + escapedName := url.PathEscape(name) - path := fmt.Sprintf("%s%s/start", apiBasePath, validatedName) + path := fmt.Sprintf("%s%s/start", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -275,12 +233,10 @@ func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeCon // StopInstance stops an instance on a remote node. func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - validatedName, err := validateInstanceNameForURL(name) - if err != nil { - return nil, err - } + // URL-encode the instance name to safely include it in the URL path + escapedName := url.PathEscape(name) - path := fmt.Sprintf("%s%s/stop", apiBasePath, validatedName) + path := fmt.Sprintf("%s%s/stop", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -296,12 +252,10 @@ func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConf // RestartInstance restarts an instance on a remote node. func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - validatedName, err := validateInstanceNameForURL(name) - if err != nil { - return nil, err - } + // URL-encode the instance name to safely include it in the URL path + escapedName := url.PathEscape(name) - path := fmt.Sprintf("%s%s/restart", apiBasePath, validatedName) + path := fmt.Sprintf("%s%s/restart", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) if err != nil { return nil, err @@ -317,12 +271,10 @@ func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeC // GetInstanceLogs retrieves logs for an instance from a remote node. 
func (rm *remoteManager) getInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) {
-	validatedName, err := validateInstanceNameForURL(name)
-	if err != nil {
-		return "", err
-	}
+	// URL-encode the instance name to safely include it in the URL path
+	escapedName := url.PathEscape(name)
 
-	path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, validatedName, numLines)
+	path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, escapedName, numLines)
 	resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil)
 	if err != nil {
 		return "", err
diff --git a/pkg/manager/remote_test.go b/pkg/manager/remote_test.go
deleted file mode 100644
index f628921..0000000
--- a/pkg/manager/remote_test.go
+++ /dev/null
@@ -1,68 +0,0 @@
-package manager
-
-import (
-	"testing"
-)
-
-// TestValidateInstanceNameForURL tests the URL validation function
-func TestValidateInstanceNameForURL(t *testing.T) {
-	tests := []struct {
-		name        string
-		input       string
-		shouldError bool
-	}{
-		// Valid names
-		{"valid simple name", "test-instance", false},
-		{"valid with underscores", "my_instance", false},
-		{"valid with numbers", "instance123", false},
-		{"valid with dashes", "test-name-with-dashes", false},
-
-		// Invalid names - path traversal
-		{"path traversal with ..", "../../etc/passwd", true},
-		{"path traversal multiple", "../../../etc/shadow", true},
-		{"path traversal in middle", "foo/../bar", true},
-		{"double dots variation", ".../", true},
-
-		// Invalid names - path separators
-		{"forward slash", "foo/bar", true},
-		{"backslash", "foo\\bar", true},
-		{"absolute path", "/etc/passwd", true},
-
-		// Invalid names - URL-unsafe characters
-		{"question mark", "test?param=value", true},
-		{"ampersand", "test&param", true},
-		{"hash", "test#anchor", true},
-		{"percent", "test%20space", true},
-		{"equals", "test=value", true},
-		{"at sign", "test@example", true},
-		{"colon", "test:8080", true},
-		{"space", "test instance", true},
-
-		// Invalid names - empty
-		{"empty string", "", true},
-
-		// Invalid names - characters requiring encoding
-		{"unicode", "test\u00e9", true},
-		{"newline", "test\n", true},
-		{"tab", "test\t", true},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			validatedName, err := validateInstanceNameForURL(tt.input)
-
-			if tt.shouldError {
-				if err == nil {
-					t.Errorf("Expected error for input %q, but got none", tt.input)
-				}
-			} else {
-				if err != nil {
-					t.Errorf("Expected no error for input %q, but got: %v", tt.input, err)
-				}
-				if validatedName != tt.input {
-					t.Errorf("Expected validated name to be %q, but got %q", tt.input, validatedName)
-				}
-			}
-		})
-	}
-}

From 13f3bed5fe521087d267974872ef57baaf3e138c Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Tue, 21 Oct 2025 23:36:26 +0200
Subject: [PATCH 17/22] Add URL encoding for instance name in API calls in
 webui

---
 webui/src/lib/api.ts | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/webui/src/lib/api.ts b/webui/src/lib/api.ts
index 8629c1f..ef03408 100644
--- a/webui/src/lib/api.ts
+++ b/webui/src/lib/api.ts
@@ -116,7 +116,7 @@ export const nodesApi = {
   list: () => apiCall("/nodes"),
 
   // GET /nodes/{name}
-  get: (name: string) => apiCall(`/nodes/${name}`),
+  get: (name: string) => apiCall(`/nodes/${encodeURIComponent(name)}`),
 };
 
 // Instance API functions
 export const instancesApi = {
   // GET /instances
   list: () => apiCall("/instances"),
 
   // GET /instances/{name}
-  get: (name: string) => apiCall(`/instances/${name}`),
+  get: (name:
string) => apiCall(`/instances/${encodeURIComponent(name)}`), // POST /instances/{name} create: (name: string, options: CreateInstanceOptions) => - apiCall(`/instances/${name}`, { + apiCall(`/instances/${encodeURIComponent(name)}`, { method: "POST", body: JSON.stringify(options), }), // PUT /instances/{name} update: (name: string, options: CreateInstanceOptions) => - apiCall(`/instances/${name}`, { + apiCall(`/instances/${encodeURIComponent(name)}`, { method: "PUT", body: JSON.stringify(options), }), // DELETE /instances/{name} delete: (name: string) => - apiCall(`/instances/${name}`, { + apiCall(`/instances/${encodeURIComponent(name)}`, { method: "DELETE", }), // POST /instances/{name}/start start: (name: string) => - apiCall(`/instances/${name}/start`, { + apiCall(`/instances/${encodeURIComponent(name)}/start`, { method: "POST", }), // POST /instances/{name}/stop stop: (name: string) => - apiCall(`/instances/${name}/stop`, { + apiCall(`/instances/${encodeURIComponent(name)}/stop`, { method: "POST", }), // POST /instances/{name}/restart restart: (name: string) => - apiCall(`/instances/${name}/restart`, { + apiCall(`/instances/${encodeURIComponent(name)}/restart`, { method: "POST", }), // GET /instances/{name}/logs getLogs: (name: string, lines?: number) => { const params = lines ? `?lines=${lines}` : ""; - return apiCall(`/instances/${name}/logs${params}`, {}, "text"); + return apiCall(`/instances/${encodeURIComponent(name)}/logs${params}`, {}, "text"); }, // GET /instances/{name}/proxy/health - getHealth: (name: string) => apiCall>(`/instances/${name}/proxy/health`), + getHealth: (name: string) => apiCall>(`/instances/${encodeURIComponent(name)}/proxy/health`), }; From 0f2c14d3ed2e65fd82adf19487683011fc5c3fc6 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Wed, 22 Oct 2025 00:02:23 +0200 Subject: [PATCH 18/22] Validate instance names to prevent injection attacks --- pkg/manager/remote.go | 82 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 17 deletions(-) diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go index acabcd0..ab021f2 100644 --- a/pkg/manager/remote.go +++ b/pkg/manager/remote.go @@ -8,6 +8,7 @@ import ( "io" "llamactl/pkg/config" "llamactl/pkg/instance" + "llamactl/pkg/validation" "net/http" "net/url" "sync" @@ -140,9 +141,14 @@ func parseRemoteResponse(resp *http.Response, result any) error { // CreateInstance creates a new instance on a remote node. func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - // URL-encode the instance name to safely include it in the URL path - // This prevents SSRF and URL injection attacks - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return nil, err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) @@ -161,8 +167,14 @@ func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeCo // GetInstance retrieves an instance by name from a remote node. 
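 // As in createInstance above, the name is validated before it is escaped,
 // so a malformed name fails fast without a request ever reaching the remote node.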
func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // URL-encode the instance name to safely include it in the URL path - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return nil, err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) @@ -180,8 +192,14 @@ func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfi // UpdateInstance updates an existing instance on a remote node. func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - // URL-encode the instance name to safely include it in the URL path - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return nil, err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) @@ -200,8 +218,14 @@ func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeCo // DeleteInstance deletes an instance from a remote node. func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { - // URL-encode the instance name to safely include it in the URL path - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil) @@ -214,8 +238,14 @@ func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeCo // StartInstance starts an instance on a remote node. func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // URL-encode the instance name to safely include it in the URL path - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return nil, err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/start", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) @@ -233,8 +263,14 @@ func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeCon // StopInstance stops an instance on a remote node. 
func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // URL-encode the instance name to safely include it in the URL path - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return nil, err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/stop", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) @@ -252,8 +288,14 @@ func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConf // RestartInstance restarts an instance on a remote node. func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // URL-encode the instance name to safely include it in the URL path - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return nil, err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/restart", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) @@ -271,8 +313,14 @@ func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeC // GetInstanceLogs retrieves logs for an instance from a remote node. func (rm *remoteManager) getInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) { - // URL-encode the instance name to safely include it in the URL path - escapedName := url.PathEscape(name) + // Validate instance name to prevent injection attacks + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + return "", err + } + + // URL-encode the validated instance name to safely include it in the URL path + escapedName := url.PathEscape(validatedName) path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, escapedName, numLines) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) From c794e4f98ba3a27fbcda240bb51f32f36267fb3d Mon Sep 17 00:00:00 2001 From: LordMathis Date: Wed, 22 Oct 2025 18:40:39 +0200 Subject: [PATCH 19/22] Move instance name validation to handlers --- pkg/manager/operations.go | 8 +-- pkg/manager/remote.go | 83 ++++++-------------------------- pkg/server/handlers_instances.go | 77 ++++++++++++++++++----------- 3 files changed, 65 insertions(+), 103 deletions(-) diff --git a/pkg/manager/operations.go b/pkg/manager/operations.go index 09fe06b..55fb24b 100644 --- a/pkg/manager/operations.go +++ b/pkg/manager/operations.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "llamactl/pkg/instance" - "llamactl/pkg/validation" "log" ) @@ -58,12 +57,7 @@ func (im *instanceManager) CreateInstance(name string, options *instance.Options return nil, fmt.Errorf("instance options cannot be nil") } - name, err := validation.ValidateInstanceName(name) - if err != nil { - return nil, err - } - - err = options.BackendOptions.ValidateInstanceOptions() + err := options.BackendOptions.ValidateInstanceOptions() if err != nil { return nil, err } diff --git a/pkg/manager/remote.go b/pkg/manager/remote.go index ab021f2..292b39f 100644 --- a/pkg/manager/remote.go +++ b/pkg/manager/remote.go @@ -8,7 +8,6 @@ import ( "io" 
"llamactl/pkg/config" "llamactl/pkg/instance" - "llamactl/pkg/validation" "net/http" "net/url" "sync" @@ -139,16 +138,9 @@ func parseRemoteResponse(resp *http.Response, result any) error { // --- Remote CRUD operations --- -// CreateInstance creates a new instance on a remote node. +// createInstance creates a new instance on a remote node. func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return nil, err - } - - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) @@ -165,16 +157,10 @@ func (rm *remoteManager) createInstance(ctx context.Context, node *config.NodeCo return &inst, nil } -// GetInstance retrieves an instance by name from a remote node. +// getInstance retrieves an instance by name from a remote node. func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return nil, err - } - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) @@ -190,16 +176,10 @@ func (rm *remoteManager) getInstance(ctx context.Context, node *config.NodeConfi return &inst, nil } -// UpdateInstance updates an existing instance on a remote node. +// updateInstance updates an existing instance on a remote node. func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeConfig, name string, opts *instance.Options) (*instance.Instance, error) { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return nil, err - } - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) @@ -216,16 +196,10 @@ func (rm *remoteManager) updateInstance(ctx context.Context, node *config.NodeCo return &inst, nil } -// DeleteInstance deletes an instance from a remote node. +// deleteInstance deletes an instance from a remote node. func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeConfig, name string) error { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return err - } - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "DELETE", path, nil) @@ -236,16 +210,10 @@ func (rm *remoteManager) deleteInstance(ctx context.Context, node *config.NodeCo return parseRemoteResponse(resp, nil) } -// StartInstance starts an instance on a remote node. +// startInstance starts an instance on a remote node. 
func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return nil, err - } - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/start", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) @@ -261,16 +229,10 @@ func (rm *remoteManager) startInstance(ctx context.Context, node *config.NodeCon return &inst, nil } -// StopInstance stops an instance on a remote node. +// stopInstance stops an instance on a remote node. func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return nil, err - } - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/stop", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) @@ -286,16 +248,9 @@ func (rm *remoteManager) stopInstance(ctx context.Context, node *config.NodeConf return &inst, nil } -// RestartInstance restarts an instance on a remote node. +// restartInstance restarts an instance on a remote node. func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeConfig, name string) (*instance.Instance, error) { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return nil, err - } - - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/restart", apiBasePath, escapedName) resp, err := rm.makeRemoteRequest(ctx, node, "POST", path, nil) @@ -311,16 +266,10 @@ func (rm *remoteManager) restartInstance(ctx context.Context, node *config.NodeC return &inst, nil } -// GetInstanceLogs retrieves logs for an instance from a remote node. +// getInstanceLogs retrieves logs for an instance from a remote node. 
func (rm *remoteManager) getInstanceLogs(ctx context.Context, node *config.NodeConfig, name string, numLines int) (string, error) { - // Validate instance name to prevent injection attacks - validatedName, err := validation.ValidateInstanceName(name) - if err != nil { - return "", err - } - // URL-encode the validated instance name to safely include it in the URL path - escapedName := url.PathEscape(validatedName) + escapedName := url.PathEscape(name) path := fmt.Sprintf("%s%s/logs?lines=%d", apiBasePath, escapedName, numLines) resp, err := rm.makeRemoteRequest(ctx, node, "GET", path, nil) diff --git a/pkg/server/handlers_instances.go b/pkg/server/handlers_instances.go index 9e41190..dfe9971 100644 --- a/pkg/server/handlers_instances.go +++ b/pkg/server/handlers_instances.go @@ -5,6 +5,7 @@ import ( "fmt" "llamactl/pkg/instance" "llamactl/pkg/manager" + "llamactl/pkg/validation" "net/http" "net/http/httputil" "net/url" @@ -55,8 +56,10 @@ func (h *Handler) ListInstances() http.HandlerFunc { func (h *Handler) CreateInstance() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } @@ -66,7 +69,7 @@ func (h *Handler) CreateInstance() http.HandlerFunc { return } - inst, err := h.InstanceManager.CreateInstance(name, &options) + inst, err := h.InstanceManager.CreateInstance(validatedName, &options) if err != nil { http.Error(w, "Failed to create instance: "+err.Error(), http.StatusInternalServerError) return @@ -95,12 +98,14 @@ func (h *Handler) CreateInstance() http.HandlerFunc { func (h *Handler) GetInstance() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } - inst, err := h.InstanceManager.GetInstance(name) + inst, err := h.InstanceManager.GetInstance(validatedName) if err != nil { http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest) return @@ -130,8 +135,10 @@ func (h *Handler) GetInstance() http.HandlerFunc { func (h *Handler) UpdateInstance() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } @@ -141,7 +148,7 @@ func (h *Handler) UpdateInstance() http.HandlerFunc { return } - inst, err := h.InstanceManager.UpdateInstance(name, &options) + inst, err := h.InstanceManager.UpdateInstance(validatedName, &options) if err != nil { http.Error(w, "Failed to update instance: "+err.Error(), http.StatusInternalServerError) return @@ -169,12 +176,14 @@ func (h *Handler) UpdateInstance() http.HandlerFunc { func (h *Handler) StartInstance() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := 
validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } - inst, err := h.InstanceManager.StartInstance(name) + inst, err := h.InstanceManager.StartInstance(validatedName) if err != nil { // Check if error is due to maximum running instances limit if _, ok := err.(manager.MaxRunningInstancesError); ok { @@ -208,12 +217,14 @@ func (h *Handler) StartInstance() http.HandlerFunc { func (h *Handler) StopInstance() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } - inst, err := h.InstanceManager.StopInstance(name) + inst, err := h.InstanceManager.StopInstance(validatedName) if err != nil { http.Error(w, "Failed to stop instance: "+err.Error(), http.StatusInternalServerError) return @@ -241,12 +252,14 @@ func (h *Handler) StopInstance() http.HandlerFunc { func (h *Handler) RestartInstance() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } - inst, err := h.InstanceManager.RestartInstance(name) + inst, err := h.InstanceManager.RestartInstance(validatedName) if err != nil { http.Error(w, "Failed to restart instance: "+err.Error(), http.StatusInternalServerError) return @@ -273,12 +286,14 @@ func (h *Handler) RestartInstance() http.HandlerFunc { func (h *Handler) DeleteInstance() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } - if err := h.InstanceManager.DeleteInstance(name); err != nil { + if err := h.InstanceManager.DeleteInstance(validatedName); err != nil { http.Error(w, "Failed to delete instance: "+err.Error(), http.StatusInternalServerError) return } @@ -302,8 +317,10 @@ func (h *Handler) DeleteInstance() http.HandlerFunc { func (h *Handler) GetInstanceLogs() http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } @@ -319,7 +336,7 @@ func (h *Handler) GetInstanceLogs() http.HandlerFunc { } // Use the instance manager which handles both local and remote instances - logs, err := h.InstanceManager.GetInstanceLogs(name, numLines) + logs, err := h.InstanceManager.GetInstanceLogs(validatedName, numLines) if err != nil { http.Error(w, "Failed to get logs: "+err.Error(), http.StatusInternalServerError) return @@ -345,12 +362,14 @@ func (h *Handler) GetInstanceLogs() http.HandlerFunc { func (h *Handler) ProxyToInstance() http.HandlerFunc { return func(w http.ResponseWriter, r 
*http.Request) { name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } - inst, err := h.InstanceManager.GetInstance(name) + inst, err := h.InstanceManager.GetInstance(validatedName) if err != nil { http.Error(w, "Failed to get instance: "+err.Error(), http.StatusInternalServerError) return @@ -358,7 +377,7 @@ func (h *Handler) ProxyToInstance() http.HandlerFunc { // Check if this is a remote instance if inst.IsRemote() { - h.RemoteInstanceProxy(w, r, name, inst) + h.RemoteInstanceProxy(w, r, validatedName, inst) return } @@ -375,7 +394,7 @@ func (h *Handler) ProxyToInstance() http.HandlerFunc { } // Strip the "/api/v1/instances//proxy" prefix from the request URL - prefix := fmt.Sprintf("/api/v1/instances/%s/proxy", name) + prefix := fmt.Sprintf("/api/v1/instances/%s/proxy", validatedName) r.URL.Path = strings.TrimPrefix(r.URL.Path, prefix) // Update the last request time for the instance From c6053f6afd3bf74c7066eea3ec6847e941db1817 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Wed, 22 Oct 2025 18:50:38 +0200 Subject: [PATCH 20/22] Remove old validation tests --- pkg/manager/manager_test.go | 79 ------------------------------------- 1 file changed, 79 deletions(-) diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go index 8a1b069..ed9cdcb 100644 --- a/pkg/manager/manager_test.go +++ b/pkg/manager/manager_test.go @@ -134,85 +134,6 @@ func TestConcurrentAccess(t *testing.T) { } } -// TestCreateInstance_RejectsPathTraversal tests that instance names with path traversal attempts are rejected -func TestCreateInstance_RejectsPathTraversal(t *testing.T) { - tempDir := t.TempDir() - cfg := createPersistenceConfig(tempDir) - backendConfig := createBackendConfig() - mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: &backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - Port: 8080, - }, - }, - } - - // Test cases for malicious instance names - maliciousNames := []string{ - "../../etc/passwd", // Classic path traversal - "../../../etc/shadow", // Multiple parent directory references - "/etc/passwd", // Absolute path - "foo/../bar", // Parent reference in middle - ".../.../", // Variation with multiple dots - ".hidden", // Hidden file - "foo/bar", // Forward slash - "foo\\bar", // Backslash (Windows-style) - "test..instance", // Double dots not at path boundary (should fail) - "normal-name/../escape", // Normal-looking name with traversal - } - - for _, name := range maliciousNames { - t.Run(name, func(t *testing.T) { - _, err := mgr.CreateInstance(name, options) - if err == nil { - t.Errorf("Expected error when creating instance with malicious name %q, but got none", name) - } - }) - } -} - -// TestCreateInstance_AcceptsValidNames tests that valid instance names are accepted -func TestCreateInstance_AcceptsValidNames(t *testing.T) { - tempDir := t.TempDir() - cfg := createPersistenceConfig(tempDir) - backendConfig := createBackendConfig() - mgr := manager.New(backendConfig, cfg, map[string]config.NodeConfig{}, "main") - defer mgr.Shutdown() - - options := &instance.Options{ - BackendOptions: backends.Options{ - BackendType: backends.BackendTypeLlamaCpp, - LlamaServerOptions: 
&backends.LlamaServerOptions{ - Model: "/path/to/model.gguf", - }, - }, - } - - // Valid instance names - validNames := []string{ - "test-instance", - "my_instance", - "instance123", - "test-name-with-dashes", - "name_with_underscores", - } - - for _, name := range validNames { - t.Run(name, func(t *testing.T) { - _, err := mgr.CreateInstance(name, options) - if err != nil { - t.Errorf("Expected instance with valid name %q to be created, but got error: %v", name, err) - } - // Clean up - mgr.DeleteInstance(name) - }) - } -} - // Helper functions for test configuration func createBackendConfig() config.BackendConfig { // Use 'sleep' as a test command instead of 'llama-server' From 3b8bc658e398c1cbb5d4975c3db8b011a71bb3eb Mon Sep 17 00:00:00 2001 From: LordMathis Date: Wed, 22 Oct 2025 18:50:51 +0200 Subject: [PATCH 21/22] Add name validation to backend handlers --- pkg/server/handlers_backends.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pkg/server/handlers_backends.go b/pkg/server/handlers_backends.go index 1ae2835..43d01ad 100644 --- a/pkg/server/handlers_backends.go +++ b/pkg/server/handlers_backends.go @@ -5,6 +5,7 @@ import ( "fmt" "llamactl/pkg/backends" "llamactl/pkg/instance" + "llamactl/pkg/validation" "net/http" "os/exec" "strings" @@ -22,13 +23,16 @@ func (h *Handler) LlamaCppProxy(onDemandStart bool) http.HandlerFunc { // Get the instance name from the URL parameter name := chi.URLParam(r, "name") - if name == "" { - http.Error(w, "Instance name cannot be empty", http.StatusBadRequest) + + // Validate instance name at the entry point + validatedName, err := validation.ValidateInstanceName(name) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) return } // Route to the appropriate inst based on instance name - inst, err := h.InstanceManager.GetInstance(name) + inst, err := h.InstanceManager.GetInstance(validatedName) if err != nil { http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest) return @@ -66,7 +70,7 @@ func (h *Handler) LlamaCppProxy(onDemandStart bool) http.HandlerFunc { } // If on-demand start is enabled, start the instance - if _, err := h.InstanceManager.StartInstance(name); err != nil { + if _, err := h.InstanceManager.StartInstance(validatedName); err != nil { http.Error(w, "Failed to start instance: "+err.Error(), http.StatusInternalServerError) return } @@ -85,7 +89,7 @@ func (h *Handler) LlamaCppProxy(onDemandStart bool) http.HandlerFunc { } // Strip the "/llama-cpp/" prefix from the request URL - prefix := fmt.Sprintf("/llama-cpp/%s", name) + prefix := fmt.Sprintf("/llama-cpp/%s", validatedName) r.URL.Path = strings.TrimPrefix(r.URL.Path, prefix) // Update the last request time for the instance From a9fb0d613d61b1aee2638145f13fae9560862a01 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Wed, 22 Oct 2025 18:55:57 +0200 Subject: [PATCH 22/22] Validate instance name in openai proxy --- pkg/server/handlers_openai.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pkg/server/handlers_openai.go b/pkg/server/handlers_openai.go index 075f651..bddabf3 100644 --- a/pkg/server/handlers_openai.go +++ b/pkg/server/handlers_openai.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "llamactl/pkg/instance" + "llamactl/pkg/validation" "net/http" "net/http/httputil" "net/url" @@ -85,8 +86,15 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc { return } + // Validate instance name at the entry point + validatedName, err := 
validation.ValidateInstanceName(modelName) + if err != nil { + http.Error(w, "Invalid instance name: "+err.Error(), http.StatusBadRequest) + return + } + // Route to the appropriate inst based on instance name - inst, err := h.InstanceManager.GetInstance(modelName) + inst, err := h.InstanceManager.GetInstance(validatedName) if err != nil { http.Error(w, "Invalid instance: "+err.Error(), http.StatusBadRequest) return @@ -96,7 +104,7 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc { if inst.IsRemote() { // Restore the body for the remote proxy r.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - h.RemoteOpenAIProxy(w, r, modelName, inst) + h.RemoteOpenAIProxy(w, r, validatedName, inst) return } @@ -122,7 +130,7 @@ func (h *Handler) OpenAIProxy() http.HandlerFunc { } // If on-demand start is enabled, start the instance - if _, err := h.InstanceManager.StartInstance(modelName); err != nil { + if _, err := h.InstanceManager.StartInstance(validatedName); err != nil { http.Error(w, "Failed to start instance: "+err.Error(), http.StatusInternalServerError) return }
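
Note on validation.ValidateInstanceName: the function itself is not touched by this series, so the exact rules it enforces are an assumption here. A minimal sketch consistent with the behavior exercised by the manager tests removed in PATCH 20/22 (names limited to letters, digits, dashes, and underscores; slashes, backslashes, and ".." sequences rejected) could look like the following — the length cap is hypothetical and the real implementation may differ:

package validation

import (
	"fmt"
	"regexp"
)

// validName matches names built only from letters, digits, dashes, and
// underscores, which rules out "/", "\", "..", and similar path tricks.
var validName = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)

// ValidateInstanceName returns the name unchanged when it is safe to embed
// in URL paths and filenames, or an error when it is not.
func ValidateInstanceName(name string) (string, error) {
	if name == "" {
		return "", fmt.Errorf("instance name cannot be empty")
	}
	if len(name) > 64 { // hypothetical cap; not confirmed by these patches
		return "", fmt.Errorf("instance name too long: %d characters", len(name))
	}
	if !validName.MatchString(name) {
		return "", fmt.Errorf("instance name contains invalid characters: %q", name)
	}
	return name, nil
}

Returning the validated name rather than only an error matches how the handlers above thread validatedName into manager calls and into URL construction via url.PathEscape, so validation happens exactly once at the HTTP entry point.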