Mirror of https://github.com/lordmathis/llamactl.git (synced 2025-11-05 16:44:22 +00:00)

Commit: Split off process struct
@@ -1,16 +1,12 @@
package instance

import (
    "context"
    "encoding/json"
    "fmt"
    "io"
    "llamactl/pkg/backends"
    "llamactl/pkg/config"
    "log"
    "net/http/httputil"
    "os/exec"
    "sync"
    "time"
)

@@ -30,21 +26,9 @@ type Instance struct {
    localNodeName string `json:"-"` // Name of the local node for remote detection

    // Components (can be nil for remote instances or when stopped)
    logger *logger `json:"-"` // nil for remote instances
    proxy  *proxy  `json:"-"` // nil for remote instances

    // internal
    cmd      *exec.Cmd          `json:"-"` // Command to run the instance
    ctx      context.Context    `json:"-"` // Context for managing the instance lifecycle
    cancel   context.CancelFunc `json:"-"` // Function to cancel the context
    stdout   io.ReadCloser      `json:"-"` // Standard output stream
    stderr   io.ReadCloser      `json:"-"` // Standard error stream
    mu       sync.RWMutex       `json:"-"` // RWMutex for better read/write separation
    restarts int                `json:"-"` // Number of restarts

    // Restart control
    restartCancel context.CancelFunc `json:"-"` // Cancel function for pending restarts
    monitorDone   chan struct{}      `json:"-"` // Channel to signal monitor goroutine completion
    process *process `json:"-"` // nil for remote instances, nil when stopped
    proxy   *proxy   `json:"-"` // nil for remote instances, created on demand
    logger  *logger  `json:"-"` // nil for remote instances
}

// New creates a new instance with the given name, log path, and options
@@ -76,6 +60,9 @@ func New(name string, globalBackendSettings *config.BackendConfig, globalInstanc
    // Create Proxy component
    instance.proxy = NewProxy(instance)

    // Create Process component (will be initialized on first Start)
    instance.process = newProcess(instance)

    return instance
}

@@ -204,10 +191,6 @@ func (i *Instance) GetProxy() (*httputil.ReverseProxy, error) {

// MarshalJSON implements json.Marshaler for Instance
func (i *Instance) MarshalJSON() ([]byte, error) {
    // Use read lock since we're only reading data
    i.mu.RLock()
    defer i.mu.RUnlock()

    // Get options
    opts := i.GetOptions()

@@ -280,7 +263,7 @@ func (i *Instance) UnmarshalJSON(data []byte) error {
        i.options = newOptions(&Options{})
    }

    // Only create logger and proxy for non-remote instances
    // Only create logger, proxy, and process for non-remote instances
    // Remote instances are metadata only (no logger, proxy, or process)
    if !i.IsRemote() {
        if i.logger == nil && i.globalInstanceSettings != nil {
@@ -289,6 +272,9 @@ func (i *Instance) UnmarshalJSON(data []byte) error {
        if i.proxy == nil {
            i.proxy = NewProxy(i)
        }
        if i.process == nil {
            i.process = newProcess(i)
        }
    }

    return nil
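UnmarshalJSON rebuilds the runtime-only components that are excluded from serialization via the `json:"-"` tags. A standalone sketch of that rebuild-after-decode pattern, with a hypothetical Worker type and cache field that are not taken from the repository:

package main

import (
    "encoding/json"
    "fmt"
)

// Worker persists only its Name; the cache is runtime state rebuilt on load.
type Worker struct {
    Name  string         `json:"name"`
    cache map[string]int `json:"-"` // nil until rebuilt after decode
}

func (w *Worker) UnmarshalJSON(data []byte) error {
    type alias Worker // alias has no UnmarshalJSON, so this does not recurse
    var a alias
    if err := json.Unmarshal(data, &a); err != nil {
        return err
    }
    *w = Worker(a)

    // Recreate components that were never serialized.
    if w.cache == nil {
        w.cache = make(map[string]int)
    }
    return nil
}

func main() {
    var w Worker
    _ = json.Unmarshal([]byte(`{"name":"demo"}`), &w)
    fmt.Println(w.Name, w.cache != nil) // demo true
}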
@@ -321,6 +307,47 @@ func (i *Instance) GetLogs(num_lines int) (string, error) {
    return i.logger.GetLogs(num_lines)
}

// Start starts the instance, delegating to process component
func (i *Instance) Start() error {
    if i.process == nil {
        return fmt.Errorf("instance %s has no process component (remote instances cannot be started locally)", i.Name)
    }
    return i.process.Start()
}

// Stop stops the instance, delegating to process component
func (i *Instance) Stop() error {
    if i.process == nil {
        return fmt.Errorf("instance %s has no process component (remote instances cannot be stopped locally)", i.Name)
    }
    return i.process.Stop()
}

// Restart restarts the instance, delegating to process component
func (i *Instance) Restart() error {
    if i.process == nil {
        return fmt.Errorf("instance %s has no process component (remote instances cannot be restarted locally)", i.Name)
    }
    return i.process.Restart()
}

// WaitForHealthy waits for the instance to become healthy, delegating to process component
func (i *Instance) WaitForHealthy(timeout int) error {
    if i.process == nil {
        return fmt.Errorf("instance %s has no process component (remote instances cannot be health checked locally)", i.Name)
    }
    return i.process.WaitForHealthy(timeout)
}

// LastRequestTime returns the last request time as a Unix timestamp
// Delegates to the Proxy component
func (i *Instance) LastRequestTime() int64 {
    if i.proxy == nil {
        return 0
    }
    return i.proxy.LastRequestTime()
}

// getBackendHostPort extracts the host and port from instance options
// Returns the configured host and port for the backend
func (i *Instance) getBackendHostPort() (string, int) {

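The wrappers above keep the public Instance API stable while the lifecycle work moves into the unexported process component, with a nil check guarding remote instances. A standalone sketch of that delegation pattern, where Service, runner, and ErrRemote are hypothetical names rather than anything from the repository:

package main

import (
    "errors"
    "fmt"
)

// runner owns the lifecycle logic; the outer type only delegates to it.
type runner struct{ name string }

func (r *runner) Start() error {
    fmt.Printf("starting %s\n", r.name)
    return nil
}

// ErrRemote is returned when no local runner exists (a "remote" service).
var ErrRemote = errors.New("service has no local runner")

// Service exposes a stable API and forwards to runner when present.
type Service struct {
    Name   string
    runner *runner // nil for remote services
}

func (s *Service) Start() error {
    if s.runner == nil {
        return fmt.Errorf("service %s: %w", s.Name, ErrRemote)
    }
    return s.runner.Start()
}

func main() {
    local := &Service{Name: "local", runner: &runner{name: "local"}}
    remote := &Service{Name: "remote"} // no runner component

    fmt.Println(local.Start())  // <nil>
    fmt.Println(remote.Start()) // service remote: service has no local runner
}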
@@ -1,406 +0,0 @@
package instance

import (
    "context"
    "fmt"
    "log"
    "net/http"
    "os"
    "os/exec"
    "runtime"
    "syscall"
    "time"

    "llamactl/pkg/backends"
    "llamactl/pkg/config"
)

// Start starts the llama server instance and returns an error if it fails.
func (i *Instance) Start() error {
    i.mu.Lock()
    defer i.mu.Unlock()

    if i.IsRunning() {
        return fmt.Errorf("instance %s is already running", i.Name)
    }

    // Safety check: ensure options are valid
    if i.options == nil {
        return fmt.Errorf("instance %s has no options set", i.Name)
    }

    // Reset restart counter when manually starting (not during auto-restart)
    // We can detect auto-restart by checking if restartCancel is set
    if i.restartCancel == nil {
        i.restarts = 0
    }

    // Initialize last request time to current time when starting
    if i.proxy != nil {
        i.proxy.UpdateLastRequestTime()
    }

    // Create context before building command (needed for CommandContext)
    i.ctx, i.cancel = context.WithCancel(context.Background())

    // Create log files
    if err := i.logger.Create(); err != nil {
        return fmt.Errorf("failed to create log files: %w", err)
    }

    // Build command using backend-specific methods
    cmd, cmdErr := i.buildCommand()
    if cmdErr != nil {
        return fmt.Errorf("failed to build command: %w", cmdErr)
    }
    i.cmd = cmd

    if runtime.GOOS != "windows" {
        setProcAttrs(i.cmd)
    }

    var err error
    i.stdout, err = i.cmd.StdoutPipe()
    if err != nil {
        i.logger.Close()
        return fmt.Errorf("failed to get stdout pipe: %w", err)
    }
    i.stderr, err = i.cmd.StderrPipe()
    if err != nil {
        i.stdout.Close()
        i.logger.Close()
        return fmt.Errorf("failed to get stderr pipe: %w", err)
    }

    if err := i.cmd.Start(); err != nil {
        return fmt.Errorf("failed to start instance %s: %w", i.Name, err)
    }

    i.SetStatus(Running)

    // Create channel for monitor completion signaling
    i.monitorDone = make(chan struct{})

    go i.logger.readOutput(i.stdout)
    go i.logger.readOutput(i.stderr)

    go i.monitorProcess()

    return nil
}

// Stop terminates the subprocess
func (i *Instance) Stop() error {
    i.mu.Lock()

    if !i.IsRunning() {
        // Even if not running, cancel any pending restart
        if i.restartCancel != nil {
            i.restartCancel()
            i.restartCancel = nil
            log.Printf("Cancelled pending restart for instance %s", i.Name)
        }
        i.mu.Unlock()
        return fmt.Errorf("instance %s is not running", i.Name)
    }

    // Cancel any pending restart
    if i.restartCancel != nil {
        i.restartCancel()
        i.restartCancel = nil
    }

    // Set status to stopped first to signal intentional stop
    i.SetStatus(Stopped)

    // Get the monitor done channel before releasing the lock
    monitorDone := i.monitorDone

    i.mu.Unlock()

    // Stop the process with SIGINT if cmd exists
    if i.cmd != nil && i.cmd.Process != nil {
        if err := i.cmd.Process.Signal(syscall.SIGINT); err != nil {
            log.Printf("Failed to send SIGINT to instance %s: %v", i.Name, err)
        }
    }

    // If no process exists, we can return immediately
    if i.cmd == nil || monitorDone == nil {
        i.logger.Close()
        return nil
    }

    select {
    case <-monitorDone:
        // Process exited normally
    case <-time.After(30 * time.Second):
        // Force kill if it doesn't exit within 30 seconds
        if i.cmd != nil && i.cmd.Process != nil {
            killErr := i.cmd.Process.Kill()
            if killErr != nil {
                log.Printf("Failed to force kill instance %s: %v", i.Name, killErr)
            }
            log.Printf("Instance %s did not stop in time, force killed", i.Name)

            // Wait a bit more for the monitor to finish after force kill
            select {
            case <-monitorDone:
                // Monitor completed after force kill
            case <-time.After(2 * time.Second):
                log.Printf("Warning: Monitor goroutine did not complete after force kill for instance %s", i.Name)
            }
        }
    }

    i.logger.Close()

    return nil
}

// LastRequestTime returns the last request time as a Unix timestamp
// Delegates to the Proxy component
func (i *Instance) LastRequestTime() int64 {
    if i.proxy == nil {
        return 0
    }
    return i.proxy.LastRequestTime()
}

func (i *Instance) WaitForHealthy(timeout int) error {
    if !i.IsRunning() {
        return fmt.Errorf("instance %s is not running", i.Name)
    }

    if timeout <= 0 {
        timeout = 30 // Default to 30 seconds if no timeout is specified
    }

    ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
    defer cancel()

    // Get host/port from process
    host, port := i.getBackendHostPort()
    healthURL := fmt.Sprintf("http://%s:%d/health", host, port)

    // Create a dedicated HTTP client for health checks
    client := &http.Client{
        Timeout: 5 * time.Second, // 5 second timeout per request
    }

    // Helper function to check health directly
    checkHealth := func() bool {
        req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
        if err != nil {
            return false
        }

        resp, err := client.Do(req)
        if err != nil {
            return false
        }
        defer resp.Body.Close()

        return resp.StatusCode == http.StatusOK
    }

    // Try immediate check first
    if checkHealth() {
        return nil // Instance is healthy
    }

    // If immediate check failed, start polling
    ticker := time.NewTicker(1 * time.Second)
    defer ticker.Stop()

    for {
        select {
        case <-ctx.Done():
            return fmt.Errorf("timeout waiting for instance %s to become healthy after %d seconds", i.Name, timeout)
        case <-ticker.C:
            if checkHealth() {
                return nil // Instance is healthy
            }
            // Continue polling
        }
    }
}

func (i *Instance) monitorProcess() {
    defer func() {
        i.mu.Lock()
        if i.monitorDone != nil {
            close(i.monitorDone)
            i.monitorDone = nil
        }
        i.mu.Unlock()
    }()

    err := i.cmd.Wait()

    i.mu.Lock()

    // Check if the instance was intentionally stopped
    if !i.IsRunning() {
        i.mu.Unlock()
        return
    }

    i.SetStatus(Stopped)
    i.logger.Close()

    // Cancel any existing restart context since we're handling a new exit
    if i.restartCancel != nil {
        i.restartCancel()
        i.restartCancel = nil
    }

    // Log the exit
    if err != nil {
        log.Printf("Instance %s crashed with error: %v", i.Name, err)
        // Handle restart while holding the lock, then release it
        i.handleRestart()
    } else {
        log.Printf("Instance %s exited cleanly", i.Name)
        i.mu.Unlock()
    }
}

// handleRestart manages the restart process while holding the lock
func (i *Instance) handleRestart() {
    // Validate restart conditions and get safe parameters
    shouldRestart, maxRestarts, restartDelay := i.validateRestartConditions()
    if !shouldRestart {
        i.SetStatus(Failed)
        i.mu.Unlock()
        return
    }

    i.restarts++
    log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
        i.Name, i.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)

    // Create a cancellable context for the restart delay
    restartCtx, cancel := context.WithCancel(context.Background())
    i.restartCancel = cancel

    // Release the lock before sleeping
    i.mu.Unlock()

    // Use context-aware sleep so it can be cancelled
    select {
    case <-time.After(time.Duration(restartDelay) * time.Second):
        // Sleep completed normally, continue with restart
    case <-restartCtx.Done():
        // Restart was cancelled
        log.Printf("Restart cancelled for instance %s", i.Name)
        return
    }

    // Restart the instance
    if err := i.Start(); err != nil {
        log.Printf("Failed to restart instance %s: %v", i.Name, err)
    } else {
        log.Printf("Successfully restarted instance %s", i.Name)
        // Clear the cancel function
        i.mu.Lock()
        i.restartCancel = nil
        i.mu.Unlock()
    }
}

// validateRestartConditions checks if the instance should be restarted and returns the parameters
func (i *Instance) validateRestartConditions() (shouldRestart bool, maxRestarts int, restartDelay int) {
    opts := i.GetOptions()
    if opts == nil {
        log.Printf("Instance %s not restarting: options are nil", i.Name)
        return false, 0, 0
    }

    if opts.AutoRestart == nil || !*opts.AutoRestart {
        log.Printf("Instance %s not restarting: AutoRestart is disabled", i.Name)
        return false, 0, 0
    }

    if opts.MaxRestarts == nil {
        log.Printf("Instance %s not restarting: MaxRestarts is nil", i.Name)
        return false, 0, 0
    }

    if opts.RestartDelay == nil {
        log.Printf("Instance %s not restarting: RestartDelay is nil", i.Name)
        return false, 0, 0
    }

    // Values are already validated during unmarshaling/SetOptions
    maxRestarts = *opts.MaxRestarts
    restartDelay = *opts.RestartDelay

    if i.restarts >= maxRestarts {
        log.Printf("Instance %s exceeded max restart attempts (%d)", i.Name, maxRestarts)
        return false, 0, 0
    }

    return true, maxRestarts, restartDelay
}

// buildCommand builds the command to execute using backend-specific logic
func (i *Instance) buildCommand() (*exec.Cmd, error) {
    // Get options
    opts := i.GetOptions()
    if opts == nil {
        return nil, fmt.Errorf("instance options are nil")
    }

    // Get backend configuration
    backendConfig, err := i.getBackendConfig()
    if err != nil {
        return nil, err
    }

    // Build the environment variables
    env := opts.BuildEnvironment(backendConfig)

    // Get the command to execute
    command := opts.GetCommand(backendConfig)

    // Build command arguments
    args := opts.BuildCommandArgs(backendConfig)

    // Create the exec.Cmd
    cmd := exec.CommandContext(i.ctx, command, args...)

    // Start with host environment variables
    cmd.Env = os.Environ()

    // Add/override with backend-specific environment variables
    for k, v := range env {
        cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
    }

    return cmd, nil
}

// getBackendConfig resolves the backend configuration for the current instance
func (i *Instance) getBackendConfig() (*config.BackendSettings, error) {
    opts := i.GetOptions()
    if opts == nil {
        return nil, fmt.Errorf("instance options are nil")
    }

    var backendTypeStr string

    switch opts.BackendType {
    case backends.BackendTypeLlamaCpp:
        backendTypeStr = "llama-cpp"
    case backends.BackendTypeMlxLm:
        backendTypeStr = "mlx"
    case backends.BackendTypeVllm:
        backendTypeStr = "vllm"
    default:
        return nil, fmt.Errorf("unsupported backend type: %s", opts.BackendType)
    }

    settings := i.globalBackendSettings.GetBackendSettings(backendTypeStr)
    return &settings, nil
}

pkg/instance/process.go (new file, 446 lines)
@@ -0,0 +1,446 @@
package instance

import (
    "context"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "os/exec"
    "runtime"
    "sync"
    "syscall"
    "time"

    "llamactl/pkg/backends"
    "llamactl/pkg/config"
)

// process manages the OS process lifecycle for a local instance (unexported).
// process owns its complete lifecycle including auto-restart logic.
type process struct {
    instance *Instance // Back-reference for SetStatus, GetOptions

    mu            sync.RWMutex
    cmd           *exec.Cmd
    ctx           context.Context
    cancel        context.CancelFunc
    stdout        io.ReadCloser
    stderr        io.ReadCloser
    restarts      int // process owns restart counter
    restartCancel context.CancelFunc
    monitorDone   chan struct{}
}

// newProcess creates a new process component for the given instance
func newProcess(instance *Instance) *process {
    return &process{
        instance: instance,
    }
}

// Start starts the OS process and returns an error if it fails.
func (p *process) Start() error {
    p.mu.Lock()
    defer p.mu.Unlock()

    if p.instance.IsRunning() {
        return fmt.Errorf("instance %s is already running", p.instance.Name)
    }

    // Safety check: ensure options are valid
    if p.instance.options == nil {
        return fmt.Errorf("instance %s has no options set", p.instance.Name)
    }

    // Reset restart counter when manually starting (not during auto-restart)
    // We can detect auto-restart by checking if restartCancel is set
    if p.restartCancel == nil {
        p.restarts = 0
    }

    // Initialize last request time to current time when starting
    if p.instance.proxy != nil {
        p.instance.proxy.UpdateLastRequestTime()
    }

    // Create context before building command (needed for CommandContext)
    p.ctx, p.cancel = context.WithCancel(context.Background())

    // Create log files
    if err := p.instance.logger.Create(); err != nil {
        return fmt.Errorf("failed to create log files: %w", err)
    }

    // Build command using backend-specific methods
    cmd, cmdErr := p.buildCommand()
    if cmdErr != nil {
        return fmt.Errorf("failed to build command: %w", cmdErr)
    }
    p.cmd = cmd

    if runtime.GOOS != "windows" {
        setProcAttrs(p.cmd)
    }

    var err error
    p.stdout, err = p.cmd.StdoutPipe()
    if err != nil {
        p.instance.logger.Close()
        return fmt.Errorf("failed to get stdout pipe: %w", err)
    }
    p.stderr, err = p.cmd.StderrPipe()
    if err != nil {
        p.stdout.Close()
        p.instance.logger.Close()
        return fmt.Errorf("failed to get stderr pipe: %w", err)
    }

    if err := p.cmd.Start(); err != nil {
        return fmt.Errorf("failed to start instance %s: %w", p.instance.Name, err)
    }

    p.instance.SetStatus(Running)

    // Create channel for monitor completion signaling
    p.monitorDone = make(chan struct{})

    go p.instance.logger.readOutput(p.stdout)
    go p.instance.logger.readOutput(p.stderr)

    go p.monitorProcess()

    return nil
}

// Stop terminates the subprocess without restarting
func (p *process) Stop() error {
    p.mu.Lock()

    if !p.instance.IsRunning() {
        // Even if not running, cancel any pending restart
        if p.restartCancel != nil {
            p.restartCancel()
            p.restartCancel = nil
            log.Printf("Cancelled pending restart for instance %s", p.instance.Name)
        }
        p.mu.Unlock()
        return fmt.Errorf("instance %s is not running", p.instance.Name)
    }

    // Cancel any pending restart
    if p.restartCancel != nil {
        p.restartCancel()
        p.restartCancel = nil
    }

    // Set status to stopped first to signal intentional stop
    p.instance.SetStatus(Stopped)

    // Get the monitor done channel before releasing the lock
    monitorDone := p.monitorDone

    p.mu.Unlock()

    // Stop the process with SIGINT if cmd exists
    if p.cmd != nil && p.cmd.Process != nil {
        if err := p.cmd.Process.Signal(syscall.SIGINT); err != nil {
            log.Printf("Failed to send SIGINT to instance %s: %v", p.instance.Name, err)
        }
    }

    // If no process exists, we can return immediately
    if p.cmd == nil || monitorDone == nil {
        p.instance.logger.Close()
        return nil
    }

    select {
    case <-monitorDone:
        // Process exited normally
    case <-time.After(30 * time.Second):
        // Force kill if it doesn't exit within 30 seconds
        if p.cmd != nil && p.cmd.Process != nil {
            killErr := p.cmd.Process.Kill()
            if killErr != nil {
                log.Printf("Failed to force kill instance %s: %v", p.instance.Name, killErr)
            }
            log.Printf("Instance %s did not stop in time, force killed", p.instance.Name)

            // Wait a bit more for the monitor to finish after force kill
            select {
            case <-monitorDone:
                // Monitor completed after force kill
            case <-time.After(2 * time.Second):
                log.Printf("Warning: Monitor goroutine did not complete after force kill for instance %s", p.instance.Name)
            }
        }
    }

    p.instance.logger.Close()

    return nil
}

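Stop above relies on the monitor goroutine to observe the exit: it sends SIGINT, waits on monitorDone for up to 30 seconds, and only then force-kills. A minimal standalone sketch of that signal-then-kill pattern for any exec.Cmd on a Unix-like system; the sleep command and the 2-second grace period are arbitrary placeholders:

package main

import (
    "fmt"
    "os/exec"
    "syscall"
    "time"
)

func main() {
    cmd := exec.Command("sleep", "60")
    if err := cmd.Start(); err != nil {
        fmt.Println("start failed:", err)
        return
    }

    // A goroutine waits on the process and signals completion, like monitorDone.
    done := make(chan error, 1)
    go func() { done <- cmd.Wait() }()

    // Ask the process to exit gracefully first.
    _ = cmd.Process.Signal(syscall.SIGINT)

    select {
    case err := <-done:
        fmt.Println("exited after SIGINT:", err)
    case <-time.After(2 * time.Second):
        // Grace period expired: force kill and still reap the process.
        _ = cmd.Process.Kill()
        fmt.Println("force killed:", <-done)
    }
}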
// Restart manually restarts the process (resets restart counter)
func (p *process) Restart() error {
    // Stop the process first
    if err := p.Stop(); err != nil {
        // If it's not running, that's ok - we'll just start it
        if err.Error() != fmt.Sprintf("instance %s is not running", p.instance.Name) {
            return fmt.Errorf("failed to stop instance during restart: %w", err)
        }
    }

    // Reset restart counter for manual restart
    p.mu.Lock()
    p.restarts = 0
    p.mu.Unlock()

    // Start the process
    return p.Start()
}

// WaitForHealthy waits for the process to become healthy
func (p *process) WaitForHealthy(timeout int) error {
    if !p.instance.IsRunning() {
        return fmt.Errorf("instance %s is not running", p.instance.Name)
    }

    if timeout <= 0 {
        timeout = 30 // Default to 30 seconds if no timeout is specified
    }

    ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
    defer cancel()

    // Get host/port from instance
    host, port := p.instance.getBackendHostPort()
    healthURL := fmt.Sprintf("http://%s:%d/health", host, port)

    // Create a dedicated HTTP client for health checks
    client := &http.Client{
        Timeout: 5 * time.Second, // 5 second timeout per request
    }

    // Helper function to check health directly
    checkHealth := func() bool {
        req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
        if err != nil {
            return false
        }

        resp, err := client.Do(req)
        if err != nil {
            return false
        }
        defer resp.Body.Close()

        return resp.StatusCode == http.StatusOK
    }

    // Try immediate check first
    if checkHealth() {
        return nil // Instance is healthy
    }

    // If immediate check failed, start polling
    ticker := time.NewTicker(1 * time.Second)
    defer ticker.Stop()

    for {
        select {
        case <-ctx.Done():
            return fmt.Errorf("timeout waiting for instance %s to become healthy after %d seconds", p.instance.Name, timeout)
        case <-ticker.C:
            if checkHealth() {
                return nil // Instance is healthy
            }
            // Continue polling
        }
    }
}

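WaitForHealthy combines an overall deadline (context.WithTimeout), a per-request timeout on the HTTP client, and a 1-second ticker. The same shape, distilled into a self-contained helper with a placeholder URL; nothing here is taken from the llamactl API:

package main

import (
    "context"
    "fmt"
    "net/http"
    "time"
)

// waitForOK polls url until it returns 200 OK or the overall deadline expires.
func waitForOK(url string, overall time.Duration) error {
    ctx, cancel := context.WithTimeout(context.Background(), overall)
    defer cancel()

    client := &http.Client{Timeout: 5 * time.Second} // per-request timeout

    check := func() bool {
        req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
        if err != nil {
            return false
        }
        resp, err := client.Do(req)
        if err != nil {
            return false
        }
        defer resp.Body.Close()
        return resp.StatusCode == http.StatusOK
    }

    if check() { // try once immediately before polling
        return nil
    }

    ticker := time.NewTicker(1 * time.Second)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            return fmt.Errorf("not healthy after %v", overall)
        case <-ticker.C:
            if check() {
                return nil
            }
        }
    }
}

func main() {
    fmt.Println(waitForOK("http://127.0.0.1:8080/health", 10*time.Second))
}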
// monitorProcess monitors the OS process and handles crashes/exits
func (p *process) monitorProcess() {
    defer func() {
        p.mu.Lock()
        if p.monitorDone != nil {
            close(p.monitorDone)
            p.monitorDone = nil
        }
        p.mu.Unlock()
    }()

    err := p.cmd.Wait()

    p.mu.Lock()

    // Check if the instance was intentionally stopped
    if !p.instance.IsRunning() {
        p.mu.Unlock()
        return
    }

    p.instance.SetStatus(Stopped)
    p.instance.logger.Close()

    // Cancel any existing restart context since we're handling a new exit
    if p.restartCancel != nil {
        p.restartCancel()
        p.restartCancel = nil
    }

    // Log the exit
    if err != nil {
        log.Printf("Instance %s crashed with error: %v", p.instance.Name, err)
        // Handle auto-restart logic
        p.handleAutoRestart(err)
    } else {
        log.Printf("Instance %s exited cleanly", p.instance.Name)
        p.mu.Unlock()
    }
}

// shouldAutoRestart checks if the process should auto-restart
func (p *process) shouldAutoRestart() bool {
    opts := p.instance.GetOptions()
    if opts == nil {
        log.Printf("Instance %s not restarting: options are nil", p.instance.Name)
        return false
    }

    if opts.AutoRestart == nil || !*opts.AutoRestart {
        log.Printf("Instance %s not restarting: AutoRestart is disabled", p.instance.Name)
        return false
    }

    if opts.MaxRestarts == nil {
        log.Printf("Instance %s not restarting: MaxRestarts is nil", p.instance.Name)
        return false
    }

    maxRestarts := *opts.MaxRestarts
    if p.restarts >= maxRestarts {
        log.Printf("Instance %s exceeded max restart attempts (%d)", p.instance.Name, maxRestarts)
        return false
    }

    return true
}

// handleAutoRestart manages the auto-restart process
func (p *process) handleAutoRestart(err error) {
    // Check if should restart
    if !p.shouldAutoRestart() {
        p.instance.SetStatus(Failed)
        p.mu.Unlock()
        return
    }

    // Get restart parameters
    opts := p.instance.GetOptions()
    if opts.RestartDelay == nil {
        log.Printf("Instance %s not restarting: RestartDelay is nil", p.instance.Name)
        p.instance.SetStatus(Failed)
        p.mu.Unlock()
        return
    }

    restartDelay := *opts.RestartDelay
    maxRestarts := *opts.MaxRestarts

    p.restarts++
    log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
        p.instance.Name, p.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)

    // Create a cancellable context for the restart delay
    restartCtx, cancel := context.WithCancel(context.Background())
    p.restartCancel = cancel

    // Release the lock before sleeping
    p.mu.Unlock()

    // Use context-aware sleep so it can be cancelled
    select {
    case <-time.After(time.Duration(restartDelay) * time.Second):
        // Sleep completed normally, continue with restart
    case <-restartCtx.Done():
        // Restart was cancelled
        log.Printf("Restart cancelled for instance %s", p.instance.Name)
        return
    }

    // Restart the instance
    if err := p.Start(); err != nil {
        log.Printf("Failed to restart instance %s: %v", p.instance.Name, err)
    } else {
        log.Printf("Successfully restarted instance %s", p.instance.Name)
        // Clear the cancel function
        p.mu.Lock()
        p.restartCancel = nil
        p.mu.Unlock()
    }
}

// buildCommand builds the command to execute using backend-specific logic
func (p *process) buildCommand() (*exec.Cmd, error) {
    // Get options
    opts := p.instance.GetOptions()
    if opts == nil {
        return nil, fmt.Errorf("instance options are nil")
    }

    // Get backend configuration
    backendConfig, err := p.getBackendConfig()
    if err != nil {
        return nil, err
    }

    // Build the environment variables
    env := opts.BuildEnvironment(backendConfig)

    // Get the command to execute
    command := opts.GetCommand(backendConfig)

    // Build command arguments
    args := opts.BuildCommandArgs(backendConfig)

    // Create the exec.Cmd
    cmd := exec.CommandContext(p.ctx, command, args...)

    // Start with host environment variables
    cmd.Env = os.Environ()

    // Add/override with backend-specific environment variables
    for k, v := range env {
        cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
    }

    return cmd, nil
}

// getBackendConfig resolves the backend configuration for the current instance
func (p *process) getBackendConfig() (*config.BackendSettings, error) {
    opts := p.instance.GetOptions()
    if opts == nil {
        return nil, fmt.Errorf("instance options are nil")
    }

    var backendTypeStr string

    switch opts.BackendType {
    case backends.BackendTypeLlamaCpp:
        backendTypeStr = "llama-cpp"
    case backends.BackendTypeMlxLm:
        backendTypeStr = "mlx"
    case backends.BackendTypeVllm:
        backendTypeStr = "vllm"
    default:
        return nil, fmt.Errorf("unsupported backend type: %s", opts.BackendType)
    }

    settings := p.instance.globalBackendSettings.GetBackendSettings(backendTypeStr)
    return &settings, nil
}

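buildCommand appends the backend-specific variables after os.Environ(), so the backend values take precedence: on recent Go versions, os/exec documents that when Env contains duplicate keys, only the last value in the slice is used. A small standalone sketch of that override behavior on a Unix-like system; the PORT variable is just an illustration:

package main

import (
    "fmt"
    "os"
    "os/exec"
)

func main() {
    cmd := exec.Command("sh", "-c", "echo $PORT")

    // Start from the host environment, then append an override;
    // for duplicate keys, os/exec uses the last value in the slice.
    cmd.Env = append(os.Environ(), "PORT=9090")

    out, err := cmd.Output()
    if err != nil {
        fmt.Println("run failed:", err)
        return
    }
    fmt.Printf("child saw PORT=%s", out)
}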