From b13f8c471d3997f9c6f10af39a0f9e6ee7bed156 Mon Sep 17 00:00:00 2001
From: LordMathis
Date: Sat, 18 Oct 2025 10:28:15 +0200
Subject: [PATCH] Split off process struct

---
 pkg/instance/instance.go  |  75 +++++--
 pkg/instance/lifecycle.go | 406 ----------------------------------
 pkg/instance/process.go   | 446 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 497 insertions(+), 430 deletions(-)
 delete mode 100644 pkg/instance/lifecycle.go
 create mode 100644 pkg/instance/process.go

diff --git a/pkg/instance/instance.go b/pkg/instance/instance.go
index 5b29737..5cc0d6e 100644
--- a/pkg/instance/instance.go
+++ b/pkg/instance/instance.go
@@ -1,16 +1,12 @@
 package instance
 
 import (
-    "context"
     "encoding/json"
     "fmt"
-    "io"
     "llamactl/pkg/backends"
     "llamactl/pkg/config"
     "log"
     "net/http/httputil"
-    "os/exec"
-    "sync"
     "time"
 )
 
@@ -30,21 +26,9 @@ type Instance struct {
     localNodeName string `json:"-"` // Name of the local node for remote detection
 
     // Components (can be nil for remote instances or when stopped)
-    logger *logger `json:"-"` // nil for remote instances
-    proxy  *proxy  `json:"-"` // nil for remote instances
-
-    // internal
-    cmd           *exec.Cmd          `json:"-"` // Command to run the instance
-    ctx           context.Context    `json:"-"` // Context for managing the instance lifecycle
-    cancel        context.CancelFunc `json:"-"` // Function to cancel the context
-    stdout        io.ReadCloser      `json:"-"` // Standard output stream
-    stderr        io.ReadCloser      `json:"-"` // Standard error stream
-    mu            sync.RWMutex       `json:"-"` // RWMutex for better read/write separation
-    restarts      int                `json:"-"` // Number of restarts
-
-    // Restart control
-    restartCancel context.CancelFunc `json:"-"` // Cancel function for pending restarts
-    monitorDone   chan struct{}      `json:"-"` // Channel to signal monitor goroutine completion
+    process *process `json:"-"` // nil for remote instances, nil when stopped
+    proxy   *proxy   `json:"-"` // nil for remote instances, created on demand
+    logger  *logger  `json:"-"` // nil for remote instances
 }
 
 // New creates a new instance with the given name, log path, and options
@@ -76,6 +60,9 @@ func New(name string, globalBackendSettings *config.BackendConfig, globalInstanc
 
     // Create Proxy component
     instance.proxy = NewProxy(instance)
 
+    // Create Process component (will be initialized on first Start)
+    instance.process = newProcess(instance)
+
     return instance
 }
@@ -204,10 +191,6 @@ func (i *Instance) GetProxy() (*httputil.ReverseProxy, error) {
 
 // MarshalJSON implements json.Marshaler for Instance
 func (i *Instance) MarshalJSON() ([]byte, error) {
-    // Use read lock since we're only reading data
-    i.mu.RLock()
-    defer i.mu.RUnlock()
-
     // Get options
     opts := i.GetOptions()
 
@@ -280,7 +263,7 @@ func (i *Instance) UnmarshalJSON(data []byte) error {
         i.options = newOptions(&Options{})
     }
 
-    // Only create logger and proxy for non-remote instances
+    // Only create logger, proxy, and process for non-remote instances
     // Remote instances are metadata only (no logger, proxy, or process)
     if !i.IsRemote() {
         if i.logger == nil && i.globalInstanceSettings != nil {
@@ -289,6 +272,9 @@ func (i *Instance) UnmarshalJSON(data []byte) error {
         if i.proxy == nil {
             i.proxy = NewProxy(i)
         }
+        if i.process == nil {
+            i.process = newProcess(i)
+        }
     }
 
     return nil
@@ -321,6 +307,47 @@ func (i *Instance) GetLogs(num_lines int) (string, error) {
     return i.logger.GetLogs(num_lines)
 }
 
+// Start starts the instance, delegating to process component
+func (i *Instance) Start() error {
+    if i.process == nil {
+        return fmt.Errorf("instance %s has no process component (remote instances cannot be started locally)", i.Name)
+    }
+    return i.process.Start()
+}
+
+// Stop stops the instance, delegating to process component
+func (i *Instance) Stop() error {
+    if i.process == nil {
+        return fmt.Errorf("instance %s has no process component (remote instances cannot be stopped locally)", i.Name)
+    }
+    return i.process.Stop()
+}
+
+// Restart restarts the instance, delegating to process component
+func (i *Instance) Restart() error {
+    if i.process == nil {
+        return fmt.Errorf("instance %s has no process component (remote instances cannot be restarted locally)", i.Name)
+    }
+    return i.process.Restart()
+}
+
+// WaitForHealthy waits for the instance to become healthy, delegating to process component
+func (i *Instance) WaitForHealthy(timeout int) error {
+    if i.process == nil {
+        return fmt.Errorf("instance %s has no process component (remote instances cannot be health checked locally)", i.Name)
+    }
+    return i.process.WaitForHealthy(timeout)
+}
+
+// LastRequestTime returns the last request time as a Unix timestamp
+// Delegates to the Proxy component
+func (i *Instance) LastRequestTime() int64 {
+    if i.proxy == nil {
+        return 0
+    }
+    return i.proxy.LastRequestTime()
+}
+
 // getBackendHostPort extracts the host and port from instance options
 // Returns the configured host and port for the backend
 func (i *Instance) getBackendHostPort() (string, int) {
diff --git a/pkg/instance/lifecycle.go b/pkg/instance/lifecycle.go
deleted file mode 100644
index 86f73db..0000000
--- a/pkg/instance/lifecycle.go
+++ /dev/null
@@ -1,406 +0,0 @@
-package instance
-
-import (
-    "context"
-    "fmt"
-    "log"
-    "net/http"
-    "os"
-    "os/exec"
-    "runtime"
-    "syscall"
-    "time"
-
-    "llamactl/pkg/backends"
-    "llamactl/pkg/config"
-)
-
-// Start starts the llama server instance and returns an error if it fails.
-func (i *Instance) Start() error {
-    i.mu.Lock()
-    defer i.mu.Unlock()
-
-    if i.IsRunning() {
-        return fmt.Errorf("instance %s is already running", i.Name)
-    }
-
-    // Safety check: ensure options are valid
-    if i.options == nil {
-        return fmt.Errorf("instance %s has no options set", i.Name)
-    }
-
-    // Reset restart counter when manually starting (not during auto-restart)
-    // We can detect auto-restart by checking if restartCancel is set
-    if i.restartCancel == nil {
-        i.restarts = 0
-    }
-
-    // Initialize last request time to current time when starting
-    if i.proxy != nil {
-        i.proxy.UpdateLastRequestTime()
-    }
-
-    // Create context before building command (needed for CommandContext)
-    i.ctx, i.cancel = context.WithCancel(context.Background())
-
-    // Create log files
-    if err := i.logger.Create(); err != nil {
-        return fmt.Errorf("failed to create log files: %w", err)
-    }
-
-    // Build command using backend-specific methods
-    cmd, cmdErr := i.buildCommand()
-    if cmdErr != nil {
-        return fmt.Errorf("failed to build command: %w", cmdErr)
-    }
-    i.cmd = cmd
-
-    if runtime.GOOS != "windows" {
-        setProcAttrs(i.cmd)
-    }
-
-    var err error
-    i.stdout, err = i.cmd.StdoutPipe()
-    if err != nil {
-        i.logger.Close()
-        return fmt.Errorf("failed to get stdout pipe: %w", err)
-    }
-    i.stderr, err = i.cmd.StderrPipe()
-    if err != nil {
-        i.stdout.Close()
-        i.logger.Close()
-        return fmt.Errorf("failed to get stderr pipe: %w", err)
-    }
-
-    if err := i.cmd.Start(); err != nil {
-        return fmt.Errorf("failed to start instance %s: %w", i.Name, err)
-    }
-
-    i.SetStatus(Running)
-
-    // Create channel for monitor completion signaling
-    i.monitorDone = make(chan struct{})
-
-    go i.logger.readOutput(i.stdout)
-    go i.logger.readOutput(i.stderr)
-
-    go i.monitorProcess()
-
-    return nil
-}
-
-// Stop terminates the subprocess
-func (i *Instance) Stop() error {
-    i.mu.Lock()
-
-    if !i.IsRunning() {
-        // Even if not running, cancel any pending restart
-        if i.restartCancel != nil {
-            i.restartCancel()
-            i.restartCancel = nil
-            log.Printf("Cancelled pending restart for instance %s", i.Name)
-        }
-        i.mu.Unlock()
-        return fmt.Errorf("instance %s is not running", i.Name)
-    }
-
-    // Cancel any pending restart
-    if i.restartCancel != nil {
-        i.restartCancel()
-        i.restartCancel = nil
-    }
-
-    // Set status to stopped first to signal intentional stop
-    i.SetStatus(Stopped)
-
-    // Get the monitor done channel before releasing the lock
-    monitorDone := i.monitorDone
-
-    i.mu.Unlock()
-
-    // Stop the process with SIGINT if cmd exists
-    if i.cmd != nil && i.cmd.Process != nil {
-        if err := i.cmd.Process.Signal(syscall.SIGINT); err != nil {
-            log.Printf("Failed to send SIGINT to instance %s: %v", i.Name, err)
-        }
-    }
-
-    // If no process exists, we can return immediately
-    if i.cmd == nil || monitorDone == nil {
-        i.logger.Close()
-        return nil
-    }
-
-    select {
-    case <-monitorDone:
-        // Process exited normally
-    case <-time.After(30 * time.Second):
-        // Force kill if it doesn't exit within 30 seconds
-        if i.cmd != nil && i.cmd.Process != nil {
-            killErr := i.cmd.Process.Kill()
-            if killErr != nil {
-                log.Printf("Failed to force kill instance %s: %v", i.Name, killErr)
-            }
-            log.Printf("Instance %s did not stop in time, force killed", i.Name)
-
-            // Wait a bit more for the monitor to finish after force kill
-            select {
-            case <-monitorDone:
-                // Monitor completed after force kill
-            case <-time.After(2 * time.Second):
-                log.Printf("Warning: Monitor goroutine did not complete after force kill for instance %s", i.Name)
-            }
-        }
-    }
-
-    i.logger.Close()
-
-    return nil
-}
-
-// LastRequestTime returns the last request time as a Unix timestamp
-// Delegates to the Proxy component
-func (i *Instance) LastRequestTime() int64 {
-    if i.proxy == nil {
-        return 0
-    }
-    return i.proxy.LastRequestTime()
-}
-
-func (i *Instance) WaitForHealthy(timeout int) error {
-    if !i.IsRunning() {
-        return fmt.Errorf("instance %s is not running", i.Name)
-    }
-
-    if timeout <= 0 {
-        timeout = 30 // Default to 30 seconds if no timeout is specified
-    }
-
-    ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
-    defer cancel()
-
-    // Get host/port from process
-    host, port := i.getBackendHostPort()
-    healthURL := fmt.Sprintf("http://%s:%d/health", host, port)
-
-    // Create a dedicated HTTP client for health checks
-    client := &http.Client{
-        Timeout: 5 * time.Second, // 5 second timeout per request
-    }
-
-    // Helper function to check health directly
-    checkHealth := func() bool {
-        req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
-        if err != nil {
-            return false
-        }
-
-        resp, err := client.Do(req)
-        if err != nil {
-            return false
-        }
-        defer resp.Body.Close()
-
-        return resp.StatusCode == http.StatusOK
-    }
-
-    // Try immediate check first
-    if checkHealth() {
-        return nil // Instance is healthy
-    }
-
-    // If immediate check failed, start polling
-    ticker := time.NewTicker(1 * time.Second)
-    defer ticker.Stop()
-
-    for {
-        select {
-        case <-ctx.Done():
-            return fmt.Errorf("timeout waiting for instance %s to become healthy after %d seconds", i.Name, timeout)
-        case <-ticker.C:
-            if checkHealth() {
-                return nil // Instance is healthy
-            }
-            // Continue polling
-        }
-    }
-}
-
-func (i *Instance) monitorProcess() {
-    defer func() {
-        i.mu.Lock()
-        if i.monitorDone != nil {
-            close(i.monitorDone)
-            i.monitorDone = nil
-        }
-        i.mu.Unlock()
-    }()
-
-    err := i.cmd.Wait()
-
-    i.mu.Lock()
-
-    // Check if the instance was intentionally stopped
-    if !i.IsRunning() {
-        i.mu.Unlock()
-        return
-    }
-
-    i.SetStatus(Stopped)
-    i.logger.Close()
-
-    // Cancel any existing restart context since we're handling a new exit
-    if i.restartCancel != nil {
-        i.restartCancel()
-        i.restartCancel = nil
-    }
-
-    // Log the exit
-    if err != nil {
-        log.Printf("Instance %s crashed with error: %v", i.Name, err)
-        // Handle restart while holding the lock, then release it
-        i.handleRestart()
-    } else {
-        log.Printf("Instance %s exited cleanly", i.Name)
-        i.mu.Unlock()
-    }
-}
-
-// handleRestart manages the restart process while holding the lock
-func (i *Instance) handleRestart() {
-    // Validate restart conditions and get safe parameters
-    shouldRestart, maxRestarts, restartDelay := i.validateRestartConditions()
-    if !shouldRestart {
-        i.SetStatus(Failed)
-        i.mu.Unlock()
-        return
-    }
-
-    i.restarts++
-    log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
-        i.Name, i.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)
-
-    // Create a cancellable context for the restart delay
-    restartCtx, cancel := context.WithCancel(context.Background())
-    i.restartCancel = cancel
-
-    // Release the lock before sleeping
-    i.mu.Unlock()
-
-    // Use context-aware sleep so it can be cancelled
-    select {
-    case <-time.After(time.Duration(restartDelay) * time.Second):
-        // Sleep completed normally, continue with restart
-    case <-restartCtx.Done():
-        // Restart was cancelled
-        log.Printf("Restart cancelled for instance %s", i.Name)
-        return
-    }
-
-    // Restart the instance
-    if err := i.Start(); err != nil {
-        log.Printf("Failed to restart instance %s: %v", i.Name, err)
-    } else {
-        log.Printf("Successfully restarted instance %s", i.Name)
-        // Clear the cancel function
-        i.mu.Lock()
-        i.restartCancel = nil
-        i.mu.Unlock()
-    }
-}
-
-// validateRestartConditions checks if the instance should be restarted and returns the parameters
-func (i *Instance) validateRestartConditions() (shouldRestart bool, maxRestarts int, restartDelay int) {
-    opts := i.GetOptions()
-    if opts == nil {
-        log.Printf("Instance %s not restarting: options are nil", i.Name)
-        return false, 0, 0
-    }
-
-    if opts.AutoRestart == nil || !*opts.AutoRestart {
-        log.Printf("Instance %s not restarting: AutoRestart is disabled", i.Name)
-        return false, 0, 0
-    }
-
-    if opts.MaxRestarts == nil {
-        log.Printf("Instance %s not restarting: MaxRestarts is nil", i.Name)
-        return false, 0, 0
-    }
-
-    if opts.RestartDelay == nil {
-        log.Printf("Instance %s not restarting: RestartDelay is nil", i.Name)
-        return false, 0, 0
-    }
-
-    // Values are already validated during unmarshaling/SetOptions
-    maxRestarts = *opts.MaxRestarts
-    restartDelay = *opts.RestartDelay
-
-    if i.restarts >= maxRestarts {
-        log.Printf("Instance %s exceeded max restart attempts (%d)", i.Name, maxRestarts)
-        return false, 0, 0
-    }
-
-    return true, maxRestarts, restartDelay
-}
-
-// buildCommand builds the command to execute using backend-specific logic
-func (i *Instance) buildCommand() (*exec.Cmd, error) {
-    // Get options
-    opts := i.GetOptions()
-    if opts == nil {
-        return nil, fmt.Errorf("instance options are nil")
-    }
-
-    // Get backend configuration
-    backendConfig, err := i.getBackendConfig()
-    if err != nil {
-        return nil, err
-    }
-
-    // Build the environment variables
-    env := opts.BuildEnvironment(backendConfig)
-
-    // Get the command to execute
-    command := opts.GetCommand(backendConfig)
-
-    // Build command arguments
-    args := opts.BuildCommandArgs(backendConfig)
-
-    // Create the exec.Cmd
-    cmd := exec.CommandContext(i.ctx, command, args...)
-
-    // Start with host environment variables
-    cmd.Env = os.Environ()
-
-    // Add/override with backend-specific environment variables
-    for k, v := range env {
-        cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
-    }
-
-    return cmd, nil
-}
-
-// getBackendConfig resolves the backend configuration for the current instance
-func (i *Instance) getBackendConfig() (*config.BackendSettings, error) {
-    opts := i.GetOptions()
-    if opts == nil {
-        return nil, fmt.Errorf("instance options are nil")
-    }
-
-    var backendTypeStr string
-
-    switch opts.BackendType {
-    case backends.BackendTypeLlamaCpp:
-        backendTypeStr = "llama-cpp"
-    case backends.BackendTypeMlxLm:
-        backendTypeStr = "mlx"
-    case backends.BackendTypeVllm:
-        backendTypeStr = "vllm"
-    default:
-        return nil, fmt.Errorf("unsupported backend type: %s", opts.BackendType)
-    }
-
-    settings := i.globalBackendSettings.GetBackendSettings(backendTypeStr)
-    return &settings, nil
-}
diff --git a/pkg/instance/process.go b/pkg/instance/process.go
new file mode 100644
index 0000000..afbe0e8
--- /dev/null
+++ b/pkg/instance/process.go
@@ -0,0 +1,446 @@
+package instance
+
+import (
+    "context"
+    "fmt"
+    "io"
+    "log"
+    "net/http"
+    "os"
+    "os/exec"
+    "runtime"
+    "sync"
+    "syscall"
+    "time"
+
+    "llamactl/pkg/backends"
+    "llamactl/pkg/config"
+)
+
+// process manages the OS process lifecycle for a local instance (unexported).
+// process owns its complete lifecycle including auto-restart logic.
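+// A process is created only for local instances; remote instances have no process component.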
+type process struct {
+    instance *Instance // Back-reference for SetStatus, GetOptions
+
+    mu            sync.RWMutex
+    cmd           *exec.Cmd
+    ctx           context.Context
+    cancel        context.CancelFunc
+    stdout        io.ReadCloser
+    stderr        io.ReadCloser
+    restarts      int // process owns restart counter
+    restartCancel context.CancelFunc
+    monitorDone   chan struct{}
+}
+
+// newProcess creates a new process component for the given instance
+func newProcess(instance *Instance) *process {
+    return &process{
+        instance: instance,
+    }
+}
+
+// Start starts the OS process and returns an error if it fails.
+func (p *process) Start() error {
+    p.mu.Lock()
+    defer p.mu.Unlock()
+
+    if p.instance.IsRunning() {
+        return fmt.Errorf("instance %s is already running", p.instance.Name)
+    }
+
+    // Safety check: ensure options are valid
+    if p.instance.options == nil {
+        return fmt.Errorf("instance %s has no options set", p.instance.Name)
+    }
+
+    // Reset restart counter when manually starting (not during auto-restart)
+    // We can detect auto-restart by checking if restartCancel is set
+    if p.restartCancel == nil {
+        p.restarts = 0
+    }
+
+    // Initialize last request time to current time when starting
+    if p.instance.proxy != nil {
+        p.instance.proxy.UpdateLastRequestTime()
+    }
+
+    // Create context before building command (needed for CommandContext)
+    p.ctx, p.cancel = context.WithCancel(context.Background())
+
+    // Create log files
+    if err := p.instance.logger.Create(); err != nil {
+        return fmt.Errorf("failed to create log files: %w", err)
+    }
+
+    // Build command using backend-specific methods
+    cmd, cmdErr := p.buildCommand()
+    if cmdErr != nil {
+        return fmt.Errorf("failed to build command: %w", cmdErr)
+    }
+    p.cmd = cmd
+
+    if runtime.GOOS != "windows" {
+        setProcAttrs(p.cmd)
+    }
+
+    var err error
+    p.stdout, err = p.cmd.StdoutPipe()
+    if err != nil {
+        p.instance.logger.Close()
+        return fmt.Errorf("failed to get stdout pipe: %w", err)
+    }
+    p.stderr, err = p.cmd.StderrPipe()
+    if err != nil {
+        p.stdout.Close()
+        p.instance.logger.Close()
+        return fmt.Errorf("failed to get stderr pipe: %w", err)
+    }
+
+    if err := p.cmd.Start(); err != nil {
+        return fmt.Errorf("failed to start instance %s: %w", p.instance.Name, err)
+    }
+
+    p.instance.SetStatus(Running)
+
+    // Create channel for monitor completion signaling
+    p.monitorDone = make(chan struct{})
+
+    go p.instance.logger.readOutput(p.stdout)
+    go p.instance.logger.readOutput(p.stderr)
+
+    go p.monitorProcess()
+
+    return nil
+}
+
+// Stop terminates the subprocess without restarting
+func (p *process) Stop() error {
+    p.mu.Lock()
+
+    if !p.instance.IsRunning() {
+        // Even if not running, cancel any pending restart
+        if p.restartCancel != nil {
+            p.restartCancel()
+            p.restartCancel = nil
+            log.Printf("Cancelled pending restart for instance %s", p.instance.Name)
+        }
+        p.mu.Unlock()
+        return fmt.Errorf("instance %s is not running", p.instance.Name)
+    }
+
+    // Cancel any pending restart
+    if p.restartCancel != nil {
+        p.restartCancel()
+        p.restartCancel = nil
+    }
+
+    // Set status to stopped first to signal intentional stop
+    p.instance.SetStatus(Stopped)
+
+    // Get the monitor done channel before releasing the lock
+    monitorDone := p.monitorDone
+
+    p.mu.Unlock()
+
+    // Stop the process with SIGINT if cmd exists
+    if p.cmd != nil && p.cmd.Process != nil {
+        if err := p.cmd.Process.Signal(syscall.SIGINT); err != nil {
+            log.Printf("Failed to send SIGINT to instance %s: %v", p.instance.Name, err)
+        }
+    }
+
+    // If no process exists, we can return immediately
+    if p.cmd == nil || monitorDone == nil {
+        p.instance.logger.Close()
+        return nil
+    }
+
+    select {
+    case <-monitorDone:
+        // Process exited normally
+    case <-time.After(30 * time.Second):
+        // Force kill if it doesn't exit within 30 seconds
+        if p.cmd != nil && p.cmd.Process != nil {
+            killErr := p.cmd.Process.Kill()
+            if killErr != nil {
+                log.Printf("Failed to force kill instance %s: %v", p.instance.Name, killErr)
+            }
+            log.Printf("Instance %s did not stop in time, force killed", p.instance.Name)
+
+            // Wait a bit more for the monitor to finish after force kill
+            select {
+            case <-monitorDone:
+                // Monitor completed after force kill
+            case <-time.After(2 * time.Second):
+                log.Printf("Warning: Monitor goroutine did not complete after force kill for instance %s", p.instance.Name)
+            }
+        }
+    }
+
+    p.instance.logger.Close()
+
+    return nil
+}
+
+// Restart manually restarts the process (resets restart counter)
+func (p *process) Restart() error {
+    // Stop the process first
+    if err := p.Stop(); err != nil {
+        // If it's not running, that's ok - we'll just start it
+        if err.Error() != fmt.Sprintf("instance %s is not running", p.instance.Name) {
+            return fmt.Errorf("failed to stop instance during restart: %w", err)
+        }
+    }
+
+    // Reset restart counter for manual restart
+    p.mu.Lock()
+    p.restarts = 0
+    p.mu.Unlock()
+
+    // Start the process
+    return p.Start()
+}
+
+// WaitForHealthy waits for the process to become healthy
+func (p *process) WaitForHealthy(timeout int) error {
+    if !p.instance.IsRunning() {
+        return fmt.Errorf("instance %s is not running", p.instance.Name)
+    }
+
+    if timeout <= 0 {
+        timeout = 30 // Default to 30 seconds if no timeout is specified
+    }
+
+    ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
+    defer cancel()
+
+    // Get host/port from instance
+    host, port := p.instance.getBackendHostPort()
+    healthURL := fmt.Sprintf("http://%s:%d/health", host, port)
+
+    // Create a dedicated HTTP client for health checks
+    client := &http.Client{
+        Timeout: 5 * time.Second, // 5 second timeout per request
+    }
+
+    // Helper function to check health directly
+    checkHealth := func() bool {
+        req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
+        if err != nil {
+            return false
+        }
+
+        resp, err := client.Do(req)
+        if err != nil {
+            return false
+        }
+        defer resp.Body.Close()
+
+        return resp.StatusCode == http.StatusOK
+    }
+
+    // Try immediate check first
+    if checkHealth() {
+        return nil // Instance is healthy
+    }
+
+    // If immediate check failed, start polling
+    ticker := time.NewTicker(1 * time.Second)
+    defer ticker.Stop()
+
+    for {
+        select {
+        case <-ctx.Done():
+            return fmt.Errorf("timeout waiting for instance %s to become healthy after %d seconds", p.instance.Name, timeout)
+        case <-ticker.C:
+            if checkHealth() {
+                return nil // Instance is healthy
+            }
+            // Continue polling
+        }
+    }
+}
+
+// monitorProcess monitors the OS process and handles crashes/exits
+func (p *process) monitorProcess() {
+    defer func() {
+        p.mu.Lock()
+        if p.monitorDone != nil {
+            close(p.monitorDone)
+            p.monitorDone = nil
+        }
+        p.mu.Unlock()
+    }()
+
+    err := p.cmd.Wait()
+
+    p.mu.Lock()
+
+    // Check if the instance was intentionally stopped
+    if !p.instance.IsRunning() {
+        p.mu.Unlock()
+        return
+    }
+
+    p.instance.SetStatus(Stopped)
+    p.instance.logger.Close()
+
+    // Cancel any existing restart context since we're handling a new exit
+    if p.restartCancel != nil {
+        p.restartCancel()
+        p.restartCancel = nil
+    }
+
+    // Log the exit
+    if err != nil {
+        log.Printf("Instance %s crashed with error: %v", p.instance.Name, err)
+        // Handle auto-restart logic
+        p.handleAutoRestart(err)
+    } else {
+        log.Printf("Instance %s exited cleanly", p.instance.Name)
+        p.mu.Unlock()
+    }
+}
+
+// shouldAutoRestart checks if the process should auto-restart
+func (p *process) shouldAutoRestart() bool {
+    opts := p.instance.GetOptions()
+    if opts == nil {
+        log.Printf("Instance %s not restarting: options are nil", p.instance.Name)
+        return false
+    }
+
+    if opts.AutoRestart == nil || !*opts.AutoRestart {
+        log.Printf("Instance %s not restarting: AutoRestart is disabled", p.instance.Name)
+        return false
+    }
+
+    if opts.MaxRestarts == nil {
+        log.Printf("Instance %s not restarting: MaxRestarts is nil", p.instance.Name)
+        return false
+    }
+
+    maxRestarts := *opts.MaxRestarts
+    if p.restarts >= maxRestarts {
+        log.Printf("Instance %s exceeded max restart attempts (%d)", p.instance.Name, maxRestarts)
+        return false
+    }
+
+    return true
+}
+
+// handleAutoRestart manages the auto-restart process
+func (p *process) handleAutoRestart(err error) {
+    // Check if should restart
+    if !p.shouldAutoRestart() {
+        p.instance.SetStatus(Failed)
+        p.mu.Unlock()
+        return
+    }
+
+    // Get restart parameters
+    opts := p.instance.GetOptions()
+    if opts.RestartDelay == nil {
+        log.Printf("Instance %s not restarting: RestartDelay is nil", p.instance.Name)
+        p.instance.SetStatus(Failed)
+        p.mu.Unlock()
+        return
+    }
+
+    restartDelay := *opts.RestartDelay
+    maxRestarts := *opts.MaxRestarts
+
+    p.restarts++
+    log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
+        p.instance.Name, p.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)
+
+    // Create a cancellable context for the restart delay
+    restartCtx, cancel := context.WithCancel(context.Background())
+    p.restartCancel = cancel
+
+    // Release the lock before sleeping
+    p.mu.Unlock()
+
+    // Use context-aware sleep so it can be cancelled
+    select {
+    case <-time.After(time.Duration(restartDelay) * time.Second):
+        // Sleep completed normally, continue with restart
+    case <-restartCtx.Done():
+        // Restart was cancelled
+        log.Printf("Restart cancelled for instance %s", p.instance.Name)
+        return
+    }
+
+    // Restart the instance
+    if err := p.Start(); err != nil {
+        log.Printf("Failed to restart instance %s: %v", p.instance.Name, err)
+    } else {
+        log.Printf("Successfully restarted instance %s", p.instance.Name)
+        // Clear the cancel function
+        p.mu.Lock()
+        p.restartCancel = nil
+        p.mu.Unlock()
+    }
+}
+
+// buildCommand builds the command to execute using backend-specific logic
+func (p *process) buildCommand() (*exec.Cmd, error) {
+    // Get options
+    opts := p.instance.GetOptions()
+    if opts == nil {
+        return nil, fmt.Errorf("instance options are nil")
+    }
+
+    // Get backend configuration
+    backendConfig, err := p.getBackendConfig()
+    if err != nil {
+        return nil, err
+    }
+
+    // Build the environment variables
+    env := opts.BuildEnvironment(backendConfig)
+
+    // Get the command to execute
+    command := opts.GetCommand(backendConfig)
+
+    // Build command arguments
+    args := opts.BuildCommandArgs(backendConfig)
+
+    // Create the exec.Cmd
+    cmd := exec.CommandContext(p.ctx, command, args...)
+
+    // Start with host environment variables
+    cmd.Env = os.Environ()
+
+    // Add/override with backend-specific environment variables
+    for k, v := range env {
+        cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
+    }
+
+    return cmd, nil
+}
+
+// getBackendConfig resolves the backend configuration for the current instance
+func (p *process) getBackendConfig() (*config.BackendSettings, error) {
+    opts := p.instance.GetOptions()
+    if opts == nil {
+        return nil, fmt.Errorf("instance options are nil")
+    }
+
+    var backendTypeStr string
+
+    switch opts.BackendType {
+    case backends.BackendTypeLlamaCpp:
+        backendTypeStr = "llama-cpp"
+    case backends.BackendTypeMlxLm:
+        backendTypeStr = "mlx"
+    case backends.BackendTypeVllm:
+        backendTypeStr = "vllm"
+    default:
+        return nil, fmt.Errorf("unsupported backend type: %s", opts.BackendType)
+    }
+
+    settings := p.instance.globalBackendSettings.GetBackendSettings(backendTypeStr)
+    return &settings, nil
+}
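
Note (reviewer addition, not part of the patch): the sketch below is a minimal, standalone
illustration of the cancellable restart-delay pattern that handleAutoRestart relies on - the
delay is a select between time.After and a context that Stop can cancel. The names used here
(waitRestartDelay, delaySeconds) are hypothetical and for illustration only.

package main

import (
    "context"
    "fmt"
    "time"
)

// waitRestartDelay blocks for delaySeconds unless ctx is cancelled first.
// It reports whether the full delay elapsed (true) or the wait was cancelled (false).
func waitRestartDelay(ctx context.Context, delaySeconds int) bool {
    select {
    case <-time.After(time.Duration(delaySeconds) * time.Second):
        return true // delay elapsed; the caller may proceed with the restart
    case <-ctx.Done():
        return false // a concurrent Stop cancelled the pending restart
    }
}

func main() {
    ctx, cancel := context.WithCancel(context.Background())

    // Simulate Stop cancelling a pending restart after one second.
    go func() {
        time.Sleep(1 * time.Second)
        cancel()
    }()

    if waitRestartDelay(ctx, 5) {
        fmt.Println("delay elapsed, restarting")
    } else {
        fmt.Println("restart cancelled")
    }
}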