package instance

import (
	"context"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/exec"
	"runtime"
	"sync"
	"syscall"
	"time"
)

// process manages the OS process lifecycle for a local instance.
// process owns its complete lifecycle including auto-restart logic.
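// Concurrency: mu guards the mutable fields below; monitorDone is created by
// start and closed exactly once by monitorProcess when the subprocess exits.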
type process struct {
	instance *Instance // Back-reference for SetStatus, GetOptions

	mu            sync.RWMutex
	cmd           *exec.Cmd
	ctx           context.Context
	cancel        context.CancelFunc
	stdout        io.ReadCloser
	stderr        io.ReadCloser
	restarts      int
	restartCancel context.CancelFunc
	monitorDone   chan struct{}
}

// newProcess creates a new process component for the given instance
func newProcess(instance *Instance) *process {
	return &process{
		instance: instance,
	}
}

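// Typical lifecycle as driven by the owning Instance (a sketch; "inst" is a
// placeholder for a caller-provided *Instance, not a symbol defined here):
//
//	p := newProcess(inst)
//	if err := p.start(); err != nil {
//		// handle launch failure
//	}
//	if err := p.waitForHealthy(30); err != nil {
//		_ = p.stop()
//	}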

// start starts the OS process and returns an error if it fails.
func (p *process) start() error {
	p.mu.Lock()
	defer p.mu.Unlock()

	if p.instance.IsRunning() {
		return fmt.Errorf("instance %s is already running", p.instance.Name)
	}

	// Safety check: ensure options are valid
	if p.instance.options == nil {
		return fmt.Errorf("instance %s has no options set", p.instance.Name)
	}

	// Reset restart counter when manually starting (not during auto-restart)
	// We can detect auto-restart by checking if restartCancel is set
	if p.restartCancel == nil {
		p.restarts = 0
	}

	// Initialize last request time to current time when starting
	if p.instance.proxy != nil {
		p.instance.proxy.updateLastRequestTime()
	}

	// Create context before building command (needed for CommandContext)
	p.ctx, p.cancel = context.WithCancel(context.Background())

	// Create log files
	if err := p.instance.logger.create(); err != nil {
		return fmt.Errorf("failed to create log files: %w", err)
	}

	// Build command using backend-specific methods
	cmd, cmdErr := p.buildCommand()
	if cmdErr != nil {
		return fmt.Errorf("failed to build command: %w", cmdErr)
	}
	p.cmd = cmd

	if runtime.GOOS != "windows" {
		setProcAttrs(p.cmd)
	}

	var err error
	p.stdout, err = p.cmd.StdoutPipe()
	if err != nil {
		p.instance.logger.close()
		return fmt.Errorf("failed to get stdout pipe: %w", err)
	}
	p.stderr, err = p.cmd.StderrPipe()
	if err != nil {
		p.stdout.Close()
		p.instance.logger.close()
		return fmt.Errorf("failed to get stderr pipe: %w", err)
	}

	if err := p.cmd.Start(); err != nil {
		// Close the log files created above; monitorProcess, which would
		// normally close them on exit, never starts for a failed launch.
		p.instance.logger.close()
		return fmt.Errorf("failed to start instance %s: %w", p.instance.Name, err)
	}

	p.instance.SetStatus(Running)

	// Create channel for monitor completion signaling
	p.monitorDone = make(chan struct{})

	go p.instance.logger.readOutput(p.stdout)
	go p.instance.logger.readOutput(p.stderr)

	go p.monitorProcess()

	return nil
}

// stop terminates the subprocess without restarting
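// It sends SIGINT first, waits up to 30 seconds for monitorProcess to observe
// the exit, and falls back to a hard Kill if the process does not stop in time.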
func (p *process) stop() error {
	p.mu.Lock()

	if !p.instance.IsRunning() {
		// Even if not running, cancel any pending restart
		if p.restartCancel != nil {
			p.restartCancel()
			p.restartCancel = nil
			log.Printf("Cancelled pending restart for instance %s", p.instance.Name)
		}
		p.mu.Unlock()
		return fmt.Errorf("instance %s is not running", p.instance.Name)
	}

	// Cancel any pending restart
	if p.restartCancel != nil {
		p.restartCancel()
		p.restartCancel = nil
	}

	// Set status to stopped first to signal intentional stop
	p.instance.SetStatus(Stopped)

	// Get the monitor done channel before releasing the lock
	monitorDone := p.monitorDone

	p.mu.Unlock()

	// Stop the process with SIGINT if cmd exists
	if p.cmd != nil && p.cmd.Process != nil {
		if err := p.cmd.Process.Signal(syscall.SIGINT); err != nil {
			log.Printf("Failed to send SIGINT to instance %s: %v", p.instance.Name, err)
		}
	}

	// If no process exists, we can return immediately
	if p.cmd == nil || monitorDone == nil {
		p.instance.logger.close()
		return nil
	}

	select {
	case <-monitorDone:
		// Process exited normally
	case <-time.After(30 * time.Second):
		// Force kill if it doesn't exit within 30 seconds
		if p.cmd != nil && p.cmd.Process != nil {
			killErr := p.cmd.Process.Kill()
			if killErr != nil {
				log.Printf("Failed to force kill instance %s: %v", p.instance.Name, killErr)
			}
			log.Printf("Instance %s did not stop in time, force killed", p.instance.Name)

			// Wait a bit more for the monitor to finish after force kill
			select {
			case <-monitorDone:
				// Monitor completed after force kill
			case <-time.After(2 * time.Second):
				log.Printf("Warning: Monitor goroutine did not complete after force kill for instance %s", p.instance.Name)
			}
		}
	}

	p.instance.logger.close()

	return nil
}

// restart manually restarts the process (resets restart counter)
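// Calling restart on an instance that is not running is allowed: the
// "is not running" error from stop is ignored and the instance is simply started.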
func (p *process) restart() error {
	// Stop the process first
	if err := p.stop(); err != nil {
		// If it's not running, that's ok - we'll just start it
		if err.Error() != fmt.Sprintf("instance %s is not running", p.instance.Name) {
			return fmt.Errorf("failed to stop instance during restart: %w", err)
		}
	}

	// Reset restart counter for manual restart
	p.mu.Lock()
	p.restarts = 0
	p.mu.Unlock()

	// Start the process
	return p.start()
}

// waitForHealthy waits for the process to become healthy
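// timeout is in seconds; values <= 0 fall back to 30. Health is probed via
// GET http://<host>:<port>/health, expecting a 200 response from the backend.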
func (p *process) waitForHealthy(timeout int) error {
	if !p.instance.IsRunning() {
		return fmt.Errorf("instance %s is not running", p.instance.Name)
	}

	if timeout <= 0 {
		timeout = 30 // Default to 30 seconds if no timeout is specified
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
	defer cancel()

	// Get host/port from instance
	host := p.instance.options.GetHost()
	port := p.instance.options.GetPort()
	healthURL := fmt.Sprintf("http://%s:%d/health", host, port)

	// Create a dedicated HTTP client for health checks
	client := &http.Client{
		Timeout: 5 * time.Second, // 5 second timeout per request
	}

	// Helper function to check health directly
	checkHealth := func() bool {
		req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
		if err != nil {
			return false
		}

		resp, err := client.Do(req)
		if err != nil {
			return false
		}
		defer resp.Body.Close()

		return resp.StatusCode == http.StatusOK
	}

	// Try immediate check first
	if checkHealth() {
		return nil // Instance is healthy
	}

	// If immediate check failed, start polling
	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("timeout waiting for instance %s to become healthy after %d seconds", p.instance.Name, timeout)
		case <-ticker.C:
			if checkHealth() {
				return nil // Instance is healthy
			}
			// Continue polling
		}
	}
}

// monitorProcess monitors the OS process and handles crashes/exits
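// It runs in its own goroutine (launched from start), closes monitorDone when
// the subprocess exits, and delegates crash handling to handleAutoRestart.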
func (p *process) monitorProcess() {
	defer func() {
		p.mu.Lock()
		if p.monitorDone != nil {
			close(p.monitorDone)
			p.monitorDone = nil
		}
		p.mu.Unlock()
	}()

	err := p.cmd.Wait()

	p.mu.Lock()

	// Check if the instance was intentionally stopped
	if !p.instance.IsRunning() {
		p.mu.Unlock()
		return
	}

	p.instance.SetStatus(Stopped)
	p.instance.logger.close()

	// Cancel any existing restart context since we're handling a new exit
	if p.restartCancel != nil {
		p.restartCancel()
		p.restartCancel = nil
	}

	// Log the exit
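	// Note: handleAutoRestart is entered with p.mu still held and releases it
	// itself; the clean-exit branch below releases the lock directly.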
	if err != nil {
		log.Printf("Instance %s crashed with error: %v", p.instance.Name, err)
		// Handle auto-restart logic
		p.handleAutoRestart(err)
	} else {
		log.Printf("Instance %s exited cleanly", p.instance.Name)
		p.mu.Unlock()
	}
}

// shouldAutoRestart checks if the process should auto-restart
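// It must be called with p.mu held, since it reads p.restarts.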
func (p *process) shouldAutoRestart() bool {
	opts := p.instance.GetOptions()
	if opts == nil {
		log.Printf("Instance %s not restarting: options are nil", p.instance.Name)
		return false
	}

	if opts.AutoRestart == nil || !*opts.AutoRestart {
		log.Printf("Instance %s not restarting: AutoRestart is disabled", p.instance.Name)
		return false
	}

	if opts.MaxRestarts == nil {
		log.Printf("Instance %s not restarting: MaxRestarts is nil", p.instance.Name)
		return false
	}

	maxRestarts := *opts.MaxRestarts
	if p.restarts >= maxRestarts {
		log.Printf("Instance %s exceeded max restart attempts (%d)", p.instance.Name, maxRestarts)
		return false
	}

	return true
}

// handleAutoRestart manages the auto-restart process
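// It is called from monitorProcess with p.mu held and releases the lock on
// every path: before returning when no restart will happen, or before the
// cancellable restart delay.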
func (p *process) handleAutoRestart(err error) {
	// Check if should restart
	if !p.shouldAutoRestart() {
		p.instance.SetStatus(Failed)
		p.mu.Unlock()
		return
	}

	// Get restart parameters
	opts := p.instance.GetOptions()
	if opts.RestartDelay == nil {
		log.Printf("Instance %s not restarting: RestartDelay is nil", p.instance.Name)
		p.instance.SetStatus(Failed)
		p.mu.Unlock()
		return
	}

	restartDelay := *opts.RestartDelay
	maxRestarts := *opts.MaxRestarts

	p.restarts++
	log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
		p.instance.Name, p.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)

	// Create a cancellable context for the restart delay
	restartCtx, cancel := context.WithCancel(context.Background())
	p.restartCancel = cancel

	// Release the lock before sleeping
	p.mu.Unlock()

	// Use context-aware sleep so it can be cancelled
	select {
	case <-time.After(time.Duration(restartDelay) * time.Second):
		// Sleep completed normally, continue with restart
	case <-restartCtx.Done():
		// Restart was cancelled
		log.Printf("Restart cancelled for instance %s", p.instance.Name)
		return
	}

	// Restart the instance
	if err := p.start(); err != nil {
		log.Printf("Failed to restart instance %s: %v", p.instance.Name, err)
	} else {
		log.Printf("Successfully restarted instance %s", p.instance.Name)
		// Clear the cancel function
		p.mu.Lock()
		p.restartCancel = nil
		p.mu.Unlock()
	}
}

// buildCommand builds the command to execute using backend-specific logic
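// Backend-specific variables are appended after os.Environ(); on recent Go
// versions exec.Cmd keeps the last value for duplicate keys, so these entries
// effectively override the host environment.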
func (p *process) buildCommand() (*exec.Cmd, error) {
	// Build the environment variables
	env := p.instance.buildEnvironment()

	// Get the command to execute
	command := p.instance.getCommand()

	// Build command arguments
	args := p.instance.buildCommandArgs()

	// Create the exec.Cmd
	cmd := exec.CommandContext(p.ctx, command, args...)

	// Start with host environment variables
	cmd.Env = os.Environ()

	// Add/override with backend-specific environment variables
	for k, v := range env {
		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
	}

	return cmd, nil
}