Split off process struct

2025-12-23 01:24:24 +00:00 · 2025-10-18 10:28:15 +02:00
parent 3f834004a8
commit b13f8c471d
3 changed files with 497 additions and 430 deletions
--- a/pkg/instance/instance.go
+++ b/pkg/instance/instance.go
@@ -1,16 +1,12 @@
 package instance
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"io"
 	"llamactl/pkg/backends"
 	"llamactl/pkg/config"
 	"log"
 	"net/http/httputil"
 	"os/exec"
 	"sync"
 	"time"
 )
@@ -30,21 +26,9 @@ type Instance struct {
 	localNodeName          string `json:"-"` // Name of the local node for remote detection
 	// Components (can be nil for remote instances or when stopped)
 	process *process `json:"-"` // nil for remote instances, nil when stopped
 	proxy   *proxy   `json:"-"` // nil for remote instances, created on demand
 	logger  *logger  `json:"-"` // nil for remote instances
 	proxy  *proxy  `json:"-"` // nil for remote instances
 	// internal
 	cmd      *exec.Cmd          `json:"-"` // Command to run the instance
 	ctx      context.Context    `json:"-"` // Context for managing the instance lifecycle
 	cancel   context.CancelFunc `json:"-"` // Function to cancel the context
 	stdout   io.ReadCloser      `json:"-"` // Standard output stream
 	stderr   io.ReadCloser      `json:"-"` // Standard error stream
 	mu       sync.RWMutex       `json:"-"` // RWMutex for better read/write separation
 	restarts int                `json:"-"` // Number of restarts
 	// Restart control
 	restartCancel context.CancelFunc `json:"-"` // Cancel function for pending restarts
 	monitorDone   chan struct{}      `json:"-"` // Channel to signal monitor goroutine completion
 }
 // New creates a new instance with the given name, log path, and options
@@ -76,6 +60,9 @@ func New(name string, globalBackendSettings *config.BackendConfig, globalInstanc
 	// Create Proxy component
 	instance.proxy = NewProxy(instance)
 	// Create Process component (will be initialized on first Start)
 	instance.process = newProcess(instance)
 	return instance
 }
@@ -204,10 +191,6 @@ func (i *Instance) GetProxy() (*httputil.ReverseProxy, error) {
 // MarshalJSON implements json.Marshaler for Instance
 func (i *Instance) MarshalJSON() ([]byte, error) {
 	// Use read lock since we're only reading data
 	i.mu.RLock()
 	defer i.mu.RUnlock()
 	// Get options
 	opts := i.GetOptions()
@@ -280,7 +263,7 @@ func (i *Instance) UnmarshalJSON(data []byte) error {
 		i.options = newOptions(&Options{})
 	}
-	// Only create logger and proxy for non-remote instances
+	// Only create logger, proxy, and process for non-remote instances
 	// Remote instances are metadata only (no logger, proxy, or process)
 	if !i.IsRemote() {
 		if i.logger == nil && i.globalInstanceSettings != nil {
@@ -289,6 +272,9 @@ func (i *Instance) UnmarshalJSON(data []byte) error {
 		if i.proxy == nil {
 			i.proxy = NewProxy(i)
 		}
 		if i.process == nil {
 			i.process = newProcess(i)
 		}
 	}
 	return nil
@@ -321,6 +307,47 @@ func (i *Instance) GetLogs(num_lines int) (string, error) {
 	return i.logger.GetLogs(num_lines)
 }
 // Start starts the instance, delegating to process component
 func (i *Instance) Start() error {
 	if i.process == nil {
 		return fmt.Errorf("instance %s has no process component (remote instances cannot be started locally)", i.Name)
 	}
 	return i.process.Start()
 }
 // Stop stops the instance, delegating to process component
 func (i *Instance) Stop() error {
 	if i.process == nil {
 		return fmt.Errorf("instance %s has no process component (remote instances cannot be stopped locally)", i.Name)
 	}
 	return i.process.Stop()
 }
 // Restart restarts the instance, delegating to process component
 func (i *Instance) Restart() error {
 	if i.process == nil {
 		return fmt.Errorf("instance %s has no process component (remote instances cannot be restarted locally)", i.Name)
 	}
 	return i.process.Restart()
 }
 // WaitForHealthy waits for the instance to become healthy, delegating to process component
 func (i *Instance) WaitForHealthy(timeout int) error {
 	if i.process == nil {
 		return fmt.Errorf("instance %s has no process component (remote instances cannot be health checked locally)", i.Name)
 	}
 	return i.process.WaitForHealthy(timeout)
 }
 // LastRequestTime returns the last request time as a Unix timestamp
 // Delegates to the Proxy component
 func (i *Instance) LastRequestTime() int64 {
 	if i.proxy == nil {
 		return 0
 	}
 	return i.proxy.LastRequestTime()
 }
 // getBackendHostPort extracts the host and port from instance options
 // Returns the configured host and port for the backend
 func (i *Instance) getBackendHostPort() (string, int) {
--- a/pkg/instance/lifecycle.go
+++ b/pkg/instance/lifecycle.go
@@ -1,406 +0,0 @@
 package instance
 import (
 	"context"
 	"fmt"
 	"log"
 	"net/http"
 	"os"
 	"os/exec"
 	"runtime"
 	"syscall"
 	"time"
 	"llamactl/pkg/backends"
 	"llamactl/pkg/config"
 )
 // Start starts the llama server instance and returns an error if it fails.
 func (i *Instance) Start() error {
 	i.mu.Lock()
 	defer i.mu.Unlock()
 	if i.IsRunning() {
 		return fmt.Errorf("instance %s is already running", i.Name)
 	}
 	// Safety check: ensure options are valid
 	if i.options == nil {
 		return fmt.Errorf("instance %s has no options set", i.Name)
 	}
 	// Reset restart counter when manually starting (not during auto-restart)
 	// We can detect auto-restart by checking if restartCancel is set
 	if i.restartCancel == nil {
 		i.restarts = 0
 	}
 	// Initialize last request time to current time when starting
 	if i.proxy != nil {
 		i.proxy.UpdateLastRequestTime()
 	}
 	// Create context before building command (needed for CommandContext)
 	i.ctx, i.cancel = context.WithCancel(context.Background())
 	// Create log files
 	if err := i.logger.Create(); err != nil {
 		return fmt.Errorf("failed to create log files: %w", err)
 	}
 	// Build command using backend-specific methods
 	cmd, cmdErr := i.buildCommand()
 	if cmdErr != nil {
 		return fmt.Errorf("failed to build command: %w", cmdErr)
 	}
 	i.cmd = cmd
 	if runtime.GOOS != "windows" {
 		setProcAttrs(i.cmd)
 	}
 	var err error
 	i.stdout, err = i.cmd.StdoutPipe()
 	if err != nil {
 		i.logger.Close()
 		return fmt.Errorf("failed to get stdout pipe: %w", err)
 	}
 	i.stderr, err = i.cmd.StderrPipe()
 	if err != nil {
 		i.stdout.Close()
 		i.logger.Close()
 		return fmt.Errorf("failed to get stderr pipe: %w", err)
 	}
 	if err := i.cmd.Start(); err != nil {
 		return fmt.Errorf("failed to start instance %s: %w", i.Name, err)
 	}
 	i.SetStatus(Running)
 	// Create channel for monitor completion signaling
 	i.monitorDone = make(chan struct{})
 	go i.logger.readOutput(i.stdout)
 	go i.logger.readOutput(i.stderr)
 	go i.monitorProcess()
 	return nil
 }
 // Stop terminates the subprocess
 func (i *Instance) Stop() error {
 	i.mu.Lock()
 	if !i.IsRunning() {
 		// Even if not running, cancel any pending restart
 		if i.restartCancel != nil {
 			i.restartCancel()
 			i.restartCancel = nil
 			log.Printf("Cancelled pending restart for instance %s", i.Name)
 		}
 		i.mu.Unlock()
 		return fmt.Errorf("instance %s is not running", i.Name)
 	}
 	// Cancel any pending restart
 	if i.restartCancel != nil {
 		i.restartCancel()
 		i.restartCancel = nil
 	}
 	// Set status to stopped first to signal intentional stop
 	i.SetStatus(Stopped)
 	// Get the monitor done channel before releasing the lock
 	monitorDone := i.monitorDone
 	i.mu.Unlock()
 	// Stop the process with SIGINT if cmd exists
 	if i.cmd != nil && i.cmd.Process != nil {
 		if err := i.cmd.Process.Signal(syscall.SIGINT); err != nil {
 			log.Printf("Failed to send SIGINT to instance %s: %v", i.Name, err)
 		}
 	}
 	// If no process exists, we can return immediately
 	if i.cmd == nil || monitorDone == nil {
 		i.logger.Close()
 		return nil
 	}
 	select {
 	case <-monitorDone:
 		// Process exited normally
 	case <-time.After(30 * time.Second):
 		// Force kill if it doesn't exit within 30 seconds
 		if i.cmd != nil && i.cmd.Process != nil {
 			killErr := i.cmd.Process.Kill()
 			if killErr != nil {
 				log.Printf("Failed to force kill instance %s: %v", i.Name, killErr)
 			}
 			log.Printf("Instance %s did not stop in time, force killed", i.Name)
 			// Wait a bit more for the monitor to finish after force kill
 			select {
 			case <-monitorDone:
 				// Monitor completed after force kill
 			case <-time.After(2 * time.Second):
 				log.Printf("Warning: Monitor goroutine did not complete after force kill for instance %s", i.Name)
 			}
 		}
 	}
 	i.logger.Close()
 	return nil
 }
 // LastRequestTime returns the last request time as a Unix timestamp
 // Delegates to the Proxy component
 func (i *Instance) LastRequestTime() int64 {
 	if i.proxy == nil {
 		return 0
 	}
 	return i.proxy.LastRequestTime()
 }
 func (i *Instance) WaitForHealthy(timeout int) error {
 	if !i.IsRunning() {
 		return fmt.Errorf("instance %s is not running", i.Name)
 	}
 	if timeout <= 0 {
 		timeout = 30 // Default to 30 seconds if no timeout is specified
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
 	defer cancel()
 	// Get host/port from process
 	host, port := i.getBackendHostPort()
 	healthURL := fmt.Sprintf("http://%s:%d/health", host, port)
 	// Create a dedicated HTTP client for health checks
 	client := &http.Client{
 		Timeout: 5 * time.Second, // 5 second timeout per request
 	}
 	// Helper function to check health directly
 	checkHealth := func() bool {
 		req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
 		if err != nil {
 			return false
 		}
 		resp, err := client.Do(req)
 		if err != nil {
 			return false
 		}
 		defer resp.Body.Close()
 		return resp.StatusCode == http.StatusOK
 	}
 	// Try immediate check first
 	if checkHealth() {
 		return nil // Instance is healthy
 	}
 	// If immediate check failed, start polling
 	ticker := time.NewTicker(1 * time.Second)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-ctx.Done():
 			return fmt.Errorf("timeout waiting for instance %s to become healthy after %d seconds", i.Name, timeout)
 		case <-ticker.C:
 			if checkHealth() {
 				return nil // Instance is healthy
 			}
 			// Continue polling
 		}
 	}
 }
 func (i *Instance) monitorProcess() {
 	defer func() {
 		i.mu.Lock()
 		if i.monitorDone != nil {
 			close(i.monitorDone)
 			i.monitorDone = nil
 		}
 		i.mu.Unlock()
 	}()
 	err := i.cmd.Wait()
 	i.mu.Lock()
 	// Check if the instance was intentionally stopped
 	if !i.IsRunning() {
 		i.mu.Unlock()
 		return
 	}
 	i.SetStatus(Stopped)
 	i.logger.Close()
 	// Cancel any existing restart context since we're handling a new exit
 	if i.restartCancel != nil {
 		i.restartCancel()
 		i.restartCancel = nil
 	}
 	// Log the exit
 	if err != nil {
 		log.Printf("Instance %s crashed with error: %v", i.Name, err)
 		// Handle restart while holding the lock, then release it
 		i.handleRestart()
 	} else {
 		log.Printf("Instance %s exited cleanly", i.Name)
 		i.mu.Unlock()
 	}
 }
 // handleRestart manages the restart process while holding the lock
 func (i *Instance) handleRestart() {
 	// Validate restart conditions and get safe parameters
 	shouldRestart, maxRestarts, restartDelay := i.validateRestartConditions()
 	if !shouldRestart {
 		i.SetStatus(Failed)
 		i.mu.Unlock()
 		return
 	}
 	i.restarts++
 	log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
 		i.Name, i.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)
 	// Create a cancellable context for the restart delay
 	restartCtx, cancel := context.WithCancel(context.Background())
 	i.restartCancel = cancel
 	// Release the lock before sleeping
 	i.mu.Unlock()
 	// Use context-aware sleep so it can be cancelled
 	select {
 	case <-time.After(time.Duration(restartDelay) * time.Second):
 		// Sleep completed normally, continue with restart
 	case <-restartCtx.Done():
 		// Restart was cancelled
 		log.Printf("Restart cancelled for instance %s", i.Name)
 		return
 	}
 	// Restart the instance
 	if err := i.Start(); err != nil {
 		log.Printf("Failed to restart instance %s: %v", i.Name, err)
 	} else {
 		log.Printf("Successfully restarted instance %s", i.Name)
 		// Clear the cancel function
 		i.mu.Lock()
 		i.restartCancel = nil
 		i.mu.Unlock()
 	}
 }
 // validateRestartConditions checks if the instance should be restarted and returns the parameters
 func (i *Instance) validateRestartConditions() (shouldRestart bool, maxRestarts int, restartDelay int) {
 	opts := i.GetOptions()
 	if opts == nil {
 		log.Printf("Instance %s not restarting: options are nil", i.Name)
 		return false, 0, 0
 	}
 	if opts.AutoRestart == nil || !*opts.AutoRestart {
 		log.Printf("Instance %s not restarting: AutoRestart is disabled", i.Name)
 		return false, 0, 0
 	}
 	if opts.MaxRestarts == nil {
 		log.Printf("Instance %s not restarting: MaxRestarts is nil", i.Name)
 		return false, 0, 0
 	}
 	if opts.RestartDelay == nil {
 		log.Printf("Instance %s not restarting: RestartDelay is nil", i.Name)
 		return false, 0, 0
 	}
 	// Values are already validated during unmarshaling/SetOptions
 	maxRestarts = *opts.MaxRestarts
 	restartDelay = *opts.RestartDelay
 	if i.restarts >= maxRestarts {
 		log.Printf("Instance %s exceeded max restart attempts (%d)", i.Name, maxRestarts)
 		return false, 0, 0
 	}
 	return true, maxRestarts, restartDelay
 }
 // buildCommand builds the command to execute using backend-specific logic
 func (i *Instance) buildCommand() (*exec.Cmd, error) {
 	// Get options
 	opts := i.GetOptions()
 	if opts == nil {
 		return nil, fmt.Errorf("instance options are nil")
 	}
 	// Get backend configuration
 	backendConfig, err := i.getBackendConfig()
 	if err != nil {
 		return nil, err
 	}
 	// Build the environment variables
 	env := opts.BuildEnvironment(backendConfig)
 	// Get the command to execute
 	command := opts.GetCommand(backendConfig)
 	// Build command arguments
 	args := opts.BuildCommandArgs(backendConfig)
 	// Create the exec.Cmd
 	cmd := exec.CommandContext(i.ctx, command, args...)
 	// Start with host environment variables
 	cmd.Env = os.Environ()
 	// Add/override with backend-specific environment variables
 	for k, v := range env {
 		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
 	}
 	return cmd, nil
 }
 // getBackendConfig resolves the backend configuration for the current instance
 func (i *Instance) getBackendConfig() (*config.BackendSettings, error) {
 	opts := i.GetOptions()
 	if opts == nil {
 		return nil, fmt.Errorf("instance options are nil")
 	}
 	var backendTypeStr string
 	switch opts.BackendType {
 	case backends.BackendTypeLlamaCpp:
 		backendTypeStr = "llama-cpp"
 	case backends.BackendTypeMlxLm:
 		backendTypeStr = "mlx"
 	case backends.BackendTypeVllm:
 		backendTypeStr = "vllm"
 	default:
 		return nil, fmt.Errorf("unsupported backend type: %s", opts.BackendType)
 	}
 	settings := i.globalBackendSettings.GetBackendSettings(backendTypeStr)
 	return &settings, nil
 }
--- a/pkg/instance/process.go
+++ b/pkg/instance/process.go
@@ -0,0 +1,446 @@
 package instance
 import (
 	"context"
 	"fmt"
 	"io"
 	"log"
 	"net/http"
 	"os"
 	"os/exec"
 	"runtime"
 	"sync"
 	"syscall"
 	"time"
 	"llamactl/pkg/backends"
 	"llamactl/pkg/config"
 )
 // process manages the OS process lifecycle for a local instance (unexported).
 // process owns its complete lifecycle including auto-restart logic.
 type process struct {
 	instance *Instance // Back-reference for SetStatus, GetOptions
 	mu            sync.RWMutex
 	cmd           *exec.Cmd
 	ctx           context.Context
 	cancel        context.CancelFunc
 	stdout        io.ReadCloser
 	stderr        io.ReadCloser
 	restarts      int               // process owns restart counter
 	restartCancel context.CancelFunc
 	monitorDone   chan struct{}
 }
 // newProcess creates a new process component for the given instance
 func newProcess(instance *Instance) *process {
 	return &process{
 		instance: instance,
 	}
 }
 // Start starts the OS process and returns an error if it fails.
 func (p *process) Start() error {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	if p.instance.IsRunning() {
 		return fmt.Errorf("instance %s is already running", p.instance.Name)
 	}
 	// Safety check: ensure options are valid
 	if p.instance.options == nil {
 		return fmt.Errorf("instance %s has no options set", p.instance.Name)
 	}
 	// Reset restart counter when manually starting (not during auto-restart)
 	// We can detect auto-restart by checking if restartCancel is set
 	if p.restartCancel == nil {
 		p.restarts = 0
 	}
 	// Initialize last request time to current time when starting
 	if p.instance.proxy != nil {
 		p.instance.proxy.UpdateLastRequestTime()
 	}
 	// Create context before building command (needed for CommandContext)
 	p.ctx, p.cancel = context.WithCancel(context.Background())
 	// Create log files
 	if err := p.instance.logger.Create(); err != nil {
 		return fmt.Errorf("failed to create log files: %w", err)
 	}
 	// Build command using backend-specific methods
 	cmd, cmdErr := p.buildCommand()
 	if cmdErr != nil {
 		return fmt.Errorf("failed to build command: %w", cmdErr)
 	}
 	p.cmd = cmd
 	if runtime.GOOS != "windows" {
 		setProcAttrs(p.cmd)
 	}
 	var err error
 	p.stdout, err = p.cmd.StdoutPipe()
 	if err != nil {
 		p.instance.logger.Close()
 		return fmt.Errorf("failed to get stdout pipe: %w", err)
 	}
 	p.stderr, err = p.cmd.StderrPipe()
 	if err != nil {
 		p.stdout.Close()
 		p.instance.logger.Close()
 		return fmt.Errorf("failed to get stderr pipe: %w", err)
 	}
 	if err := p.cmd.Start(); err != nil {
 		return fmt.Errorf("failed to start instance %s: %w", p.instance.Name, err)
 	}
 	p.instance.SetStatus(Running)
 	// Create channel for monitor completion signaling
 	p.monitorDone = make(chan struct{})
 	go p.instance.logger.readOutput(p.stdout)
 	go p.instance.logger.readOutput(p.stderr)
 	go p.monitorProcess()
 	return nil
 }
 // Stop terminates the subprocess without restarting
 func (p *process) Stop() error {
 	p.mu.Lock()
 	if !p.instance.IsRunning() {
 		// Even if not running, cancel any pending restart
 		if p.restartCancel != nil {
 			p.restartCancel()
 			p.restartCancel = nil
 			log.Printf("Cancelled pending restart for instance %s", p.instance.Name)
 		}
 		p.mu.Unlock()
 		return fmt.Errorf("instance %s is not running", p.instance.Name)
 	}
 	// Cancel any pending restart
 	if p.restartCancel != nil {
 		p.restartCancel()
 		p.restartCancel = nil
 	}
 	// Set status to stopped first to signal intentional stop
 	p.instance.SetStatus(Stopped)
 	// Get the monitor done channel before releasing the lock
 	monitorDone := p.monitorDone
 	p.mu.Unlock()
 	// Stop the process with SIGINT if cmd exists
 	if p.cmd != nil && p.cmd.Process != nil {
 		if err := p.cmd.Process.Signal(syscall.SIGINT); err != nil {
 			log.Printf("Failed to send SIGINT to instance %s: %v", p.instance.Name, err)
 		}
 	}
 	// If no process exists, we can return immediately
 	if p.cmd == nil || monitorDone == nil {
 		p.instance.logger.Close()
 		return nil
 	}
 	select {
 	case <-monitorDone:
 		// Process exited normally
 	case <-time.After(30 * time.Second):
 		// Force kill if it doesn't exit within 30 seconds
 		if p.cmd != nil && p.cmd.Process != nil {
 			killErr := p.cmd.Process.Kill()
 			if killErr != nil {
 				log.Printf("Failed to force kill instance %s: %v", p.instance.Name, killErr)
 			}
 			log.Printf("Instance %s did not stop in time, force killed", p.instance.Name)
 			// Wait a bit more for the monitor to finish after force kill
 			select {
 			case <-monitorDone:
 				// Monitor completed after force kill
 			case <-time.After(2 * time.Second):
 				log.Printf("Warning: Monitor goroutine did not complete after force kill for instance %s", p.instance.Name)
 			}
 		}
 	}
 	p.instance.logger.Close()
 	return nil
 }
 // Restart manually restarts the process (resets restart counter)
 func (p *process) Restart() error {
 	// Stop the process first
 	if err := p.Stop(); err != nil {
 		// If it's not running, that's ok - we'll just start it
 		if err.Error() != fmt.Sprintf("instance %s is not running", p.instance.Name) {
 			return fmt.Errorf("failed to stop instance during restart: %w", err)
 		}
 	}
 	// Reset restart counter for manual restart
 	p.mu.Lock()
 	p.restarts = 0
 	p.mu.Unlock()
 	// Start the process
 	return p.Start()
 }
 // WaitForHealthy waits for the process to become healthy
 func (p *process) WaitForHealthy(timeout int) error {
 	if !p.instance.IsRunning() {
 		return fmt.Errorf("instance %s is not running", p.instance.Name)
 	}
 	if timeout <= 0 {
 		timeout = 30 // Default to 30 seconds if no timeout is specified
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
 	defer cancel()
 	// Get host/port from instance
 	host, port := p.instance.getBackendHostPort()
 	healthURL := fmt.Sprintf("http://%s:%d/health", host, port)
 	// Create a dedicated HTTP client for health checks
 	client := &http.Client{
 		Timeout: 5 * time.Second, // 5 second timeout per request
 	}
 	// Helper function to check health directly
 	checkHealth := func() bool {
 		req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
 		if err != nil {
 			return false
 		}
 		resp, err := client.Do(req)
 		if err != nil {
 			return false
 		}
 		defer resp.Body.Close()
 		return resp.StatusCode == http.StatusOK
 	}
 	// Try immediate check first
 	if checkHealth() {
 		return nil // Instance is healthy
 	}
 	// If immediate check failed, start polling
 	ticker := time.NewTicker(1 * time.Second)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-ctx.Done():
 			return fmt.Errorf("timeout waiting for instance %s to become healthy after %d seconds", p.instance.Name, timeout)
 		case <-ticker.C:
 			if checkHealth() {
 				return nil // Instance is healthy
 			}
 			// Continue polling
 		}
 	}
 }
 // monitorProcess monitors the OS process and handles crashes/exits
 func (p *process) monitorProcess() {
 	defer func() {
 		p.mu.Lock()
 		if p.monitorDone != nil {
 			close(p.monitorDone)
 			p.monitorDone = nil
 		}
 		p.mu.Unlock()
 	}()
 	err := p.cmd.Wait()
 	p.mu.Lock()
 	// Check if the instance was intentionally stopped
 	if !p.instance.IsRunning() {
 		p.mu.Unlock()
 		return
 	}
 	p.instance.SetStatus(Stopped)
 	p.instance.logger.Close()
 	// Cancel any existing restart context since we're handling a new exit
 	if p.restartCancel != nil {
 		p.restartCancel()
 		p.restartCancel = nil
 	}
 	// Log the exit
 	if err != nil {
 		log.Printf("Instance %s crashed with error: %v", p.instance.Name, err)
 		// Handle auto-restart logic
 		p.handleAutoRestart(err)
 	} else {
 		log.Printf("Instance %s exited cleanly", p.instance.Name)
 		p.mu.Unlock()
 	}
 }
 // shouldAutoRestart checks if the process should auto-restart
 func (p *process) shouldAutoRestart() bool {
 	opts := p.instance.GetOptions()
 	if opts == nil {
 		log.Printf("Instance %s not restarting: options are nil", p.instance.Name)
 		return false
 	}
 	if opts.AutoRestart == nil || !*opts.AutoRestart {
 		log.Printf("Instance %s not restarting: AutoRestart is disabled", p.instance.Name)
 		return false
 	}
 	if opts.MaxRestarts == nil {
 		log.Printf("Instance %s not restarting: MaxRestarts is nil", p.instance.Name)
 		return false
 	}
 	maxRestarts := *opts.MaxRestarts
 	if p.restarts >= maxRestarts {
 		log.Printf("Instance %s exceeded max restart attempts (%d)", p.instance.Name, maxRestarts)
 		return false
 	}
 	return true
 }
 // handleAutoRestart manages the auto-restart process
 func (p *process) handleAutoRestart(err error) {
 	// Check if should restart
 	if !p.shouldAutoRestart() {
 		p.instance.SetStatus(Failed)
 		p.mu.Unlock()
 		return
 	}
 	// Get restart parameters
 	opts := p.instance.GetOptions()
 	if opts.RestartDelay == nil {
 		log.Printf("Instance %s not restarting: RestartDelay is nil", p.instance.Name)
 		p.instance.SetStatus(Failed)
 		p.mu.Unlock()
 		return
 	}
 	restartDelay := *opts.RestartDelay
 	maxRestarts := *opts.MaxRestarts
 	p.restarts++
 	log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
 		p.instance.Name, p.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)
 	// Create a cancellable context for the restart delay
 	restartCtx, cancel := context.WithCancel(context.Background())
 	p.restartCancel = cancel
 	// Release the lock before sleeping
 	p.mu.Unlock()
 	// Use context-aware sleep so it can be cancelled
 	select {
 	case <-time.After(time.Duration(restartDelay) * time.Second):
 		// Sleep completed normally, continue with restart
 	case <-restartCtx.Done():
 		// Restart was cancelled
 		log.Printf("Restart cancelled for instance %s", p.instance.Name)
 		return
 	}
 	// Restart the instance
 	if err := p.Start(); err != nil {
 		log.Printf("Failed to restart instance %s: %v", p.instance.Name, err)
 	} else {
 		log.Printf("Successfully restarted instance %s", p.instance.Name)
 		// Clear the cancel function
 		p.mu.Lock()
 		p.restartCancel = nil
 		p.mu.Unlock()
 	}
 }
 // buildCommand builds the command to execute using backend-specific logic
 func (p *process) buildCommand() (*exec.Cmd, error) {
 	// Get options
 	opts := p.instance.GetOptions()
 	if opts == nil {
 		return nil, fmt.Errorf("instance options are nil")
 	}
 	// Get backend configuration
 	backendConfig, err := p.getBackendConfig()
 	if err != nil {
 		return nil, err
 	}
 	// Build the environment variables
 	env := opts.BuildEnvironment(backendConfig)
 	// Get the command to execute
 	command := opts.GetCommand(backendConfig)
 	// Build command arguments
 	args := opts.BuildCommandArgs(backendConfig)
 	// Create the exec.Cmd
 	cmd := exec.CommandContext(p.ctx, command, args...)
 	// Start with host environment variables
 	cmd.Env = os.Environ()
 	// Add/override with backend-specific environment variables
 	for k, v := range env {
 		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
 	}
 	return cmd, nil
 }
 // getBackendConfig resolves the backend configuration for the current instance
 func (p *process) getBackendConfig() (*config.BackendSettings, error) {
 	opts := p.instance.GetOptions()
 	if opts == nil {
 		return nil, fmt.Errorf("instance options are nil")
 	}
 	var backendTypeStr string
 	switch opts.BackendType {
 	case backends.BackendTypeLlamaCpp:
 		backendTypeStr = "llama-cpp"
 	case backends.BackendTypeMlxLm:
 		backendTypeStr = "mlx"
 	case backends.BackendTypeVllm:
 		backendTypeStr = "vllm"
 	default:
 		return nil, fmt.Errorf("unsupported backend type: %s", opts.BackendType)
 	}
 	settings := p.instance.globalBackendSettings.GetBackendSettings(backendTypeStr)
 	return &settings, nil
 }