Fix a bunch of deadlocks and nil options issues

This commit is contained in:
2025-07-21 22:19:06 +02:00
parent 33b5200e42
commit d05f0af637
2 changed files with 223 additions and 40 deletions

View File

@@ -46,29 +46,68 @@ type Instance struct {
cancel context.CancelFunc `json:"-"` // Function to cancel the context cancel context.CancelFunc `json:"-"` // Function to cancel the context
stdout io.ReadCloser `json:"-"` // Standard output stream stdout io.ReadCloser `json:"-"` // Standard output stream
stderr io.ReadCloser `json:"-"` // Standard error stream stderr io.ReadCloser `json:"-"` // Standard error stream
mu sync.Mutex `json:"-"` // Mutex for synchronizing access to the instance mu sync.RWMutex `json:"-"` // RWMutex for better read/write separation
restarts int `json:"-"` // Number of restarts restarts int `json:"-"` // Number of restarts
proxy *httputil.ReverseProxy `json:"-"` // Reverse proxy for this instance proxy *httputil.ReverseProxy `json:"-"` // Reverse proxy for this instance
// Restart control
restartCancel context.CancelFunc `json:"-"` // Cancel function for pending restarts
} }
// NewInstance creates a new instance with the given name, log path, and options // NewInstance creates a new instance with the given name, log path, and options
func NewInstance(name string, globalSettings *InstancesConfig, options *CreateInstanceOptions) *Instance { func NewInstance(name string, globalSettings *InstancesConfig, options *CreateInstanceOptions) *Instance {
if options.AutoRestart == nil { // Make a deep copy of options to avoid modifying the original and to prevent data races
optionsCopy := &CreateInstanceOptions{}
if options != nil {
// Copy the embedded LlamaServerOptions
optionsCopy.LlamaServerOptions = options.LlamaServerOptions
// Copy and validate pointer fields
if options.AutoRestart != nil {
autoRestart := *options.AutoRestart
optionsCopy.AutoRestart = &autoRestart
}
if options.MaxRestarts != nil {
maxRestarts := *options.MaxRestarts
if maxRestarts > 100 {
log.Printf("Instance %s MaxRestarts value (%d) limited to 100", name, maxRestarts)
maxRestarts = 100
} else if maxRestarts < 0 {
log.Printf("Instance %s MaxRestarts value (%d) cannot be negative, setting to 0", name, maxRestarts)
maxRestarts = 0
}
optionsCopy.MaxRestarts = &maxRestarts
}
if options.RestartDelay != nil {
restartDelay := *options.RestartDelay
if restartDelay < 1 {
log.Printf("Instance %s RestartDelay value (%d) too low, setting to 5 seconds", name, restartDelay)
restartDelay = 5
} else if restartDelay > 300 {
log.Printf("Instance %s RestartDelay value (%d) too high, limiting to 300 seconds", name, restartDelay)
restartDelay = 300
}
optionsCopy.RestartDelay = &restartDelay
}
}
// Set defaults for restart options if not provided
if optionsCopy.AutoRestart == nil {
defaultAutoRestart := globalSettings.DefaultAutoRestart defaultAutoRestart := globalSettings.DefaultAutoRestart
options.AutoRestart = &defaultAutoRestart optionsCopy.AutoRestart = &defaultAutoRestart
} }
if options.MaxRestarts == nil { if optionsCopy.MaxRestarts == nil {
defaultMaxRestarts := globalSettings.DefaultMaxRestarts defaultMaxRestarts := globalSettings.DefaultMaxRestarts
options.MaxRestarts = &defaultMaxRestarts optionsCopy.MaxRestarts = &defaultMaxRestarts
} }
if options.RestartDelay == nil { if optionsCopy.RestartDelay == nil {
defaultRestartDelay := globalSettings.DefaultRestartDelay defaultRestartDelay := globalSettings.DefaultRestartDelay
options.RestartDelay = &defaultRestartDelay optionsCopy.RestartDelay = &defaultRestartDelay
} }
return &Instance{ return &Instance{
Name: name, Name: name,
options: options, options: optionsCopy,
globalSettings: globalSettings, globalSettings: globalSettings,
Running: false, Running: false,
@@ -103,8 +142,8 @@ func (i *Instance) closeLogFile() {
} }
func (i *Instance) GetOptions() *CreateInstanceOptions { func (i *Instance) GetOptions() *CreateInstanceOptions {
i.mu.Lock() i.mu.RLock()
defer i.mu.Unlock() defer i.mu.RUnlock()
return i.options return i.options
} }
@@ -115,7 +154,26 @@ func (i *Instance) SetOptions(options *CreateInstanceOptions) {
log.Println("Warning: Attempted to set nil options on instance", i.Name) log.Println("Warning: Attempted to set nil options on instance", i.Name)
return return
} }
i.options = options
// Make a deep copy to avoid sharing references
optionsCopy := &CreateInstanceOptions{}
optionsCopy.LlamaServerOptions = options.LlamaServerOptions
// Copy pointer fields (validation already done in UnmarshalJSON)
if options.AutoRestart != nil {
autoRestart := *options.AutoRestart
optionsCopy.AutoRestart = &autoRestart
}
if options.MaxRestarts != nil {
maxRestarts := *options.MaxRestarts
optionsCopy.MaxRestarts = &maxRestarts
}
if options.RestartDelay != nil {
restartDelay := *options.RestartDelay
optionsCopy.RestartDelay = &restartDelay
}
i.options = optionsCopy
// Clear the proxy so it gets recreated with new options // Clear the proxy so it gets recreated with new options
i.proxy = nil i.proxy = nil
} }
@@ -149,6 +207,11 @@ func (i *Instance) Start() error {
return fmt.Errorf("instance %s is already running", i.Name) return fmt.Errorf("instance %s is already running", i.Name)
} }
// Safety check: ensure options are valid
if i.options == nil {
return fmt.Errorf("instance %s has no options set", i.Name)
}
// Create log files // Create log files
if err := i.createLogFile(); err != nil { if err := i.createLogFile(); err != nil {
return fmt.Errorf("failed to create log files: %w", err) return fmt.Errorf("failed to create log files: %w", err)
@@ -199,9 +262,21 @@ func (i *Instance) Stop() error {
defer i.mu.Unlock() defer i.mu.Unlock()
if !i.Running { if !i.Running {
// Even if not running, cancel any pending restart
if i.restartCancel != nil {
i.restartCancel()
i.restartCancel = nil
log.Printf("Cancelled pending restart for instance %s", i.Name)
}
return fmt.Errorf("instance %s is not running", i.Name) return fmt.Errorf("instance %s is not running", i.Name)
} }
// Cancel any pending restart
if i.restartCancel != nil {
i.restartCancel()
i.restartCancel = nil
}
// Cancel the context to signal termination // Cancel the context to signal termination
i.cancel() i.cancel()
@@ -233,14 +308,18 @@ func (i *Instance) Stop() error {
// GetLogs retrieves the last n lines of logs from the instance // GetLogs retrieves the last n lines of logs from the instance
func (i *Instance) GetLogs(num_lines int) (string, error) { func (i *Instance) GetLogs(num_lines int) (string, error) {
i.mu.Lock() i.mu.RLock()
defer i.mu.Unlock() logFileName := ""
if i.logFile != nil {
logFileName = i.logFile.Name()
}
i.mu.RUnlock()
if i.logFile == nil { if logFileName == "" {
return "", fmt.Errorf("log file not created for instance %s", i.Name) return "", fmt.Errorf("log file not created for instance %s", i.Name)
} }
file, err := os.Open(i.logFile.Name()) file, err := os.Open(logFileName)
if err != nil { if err != nil {
return "", fmt.Errorf("failed to open log file: %w", err) return "", fmt.Errorf("failed to open log file: %w", err)
} }
@@ -295,45 +374,106 @@ func (i *Instance) monitorProcess() {
if !i.Running { if !i.Running {
return return
} }
i.Running = false
i.Running = false
i.closeLogFile() i.closeLogFile()
// Cancel any existing restart context since we're handling a new exit
if i.restartCancel != nil {
i.restartCancel()
i.restartCancel = nil
}
// Log the exit // Log the exit
if err != nil { if err != nil {
log.Printf("Instance %s crashed with error: %v", i.Name, err) log.Printf("Instance %s crashed with error: %v", i.Name, err)
i.attemptRestart()
} else { } else {
log.Printf("Instance %s exited cleanly", i.Name) log.Printf("Instance %s exited cleanly", i.Name)
} }
}
// attemptRestart handles the auto-restart logic with safety checks
func (i *Instance) attemptRestart() {
// Validate restart conditions and get safe parameters
shouldRestart, maxRestarts, restartDelay := i.validateRestartConditions()
if !shouldRestart {
return
}
// Handle restart if process crashed and auto-restart is enabled
if err != nil && *i.options.AutoRestart && i.restarts < *i.options.MaxRestarts {
i.restarts++ i.restarts++
delayDuration := time.Duration(*i.options.RestartDelay) * time.Second
log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v", log.Printf("Auto-restarting instance %s (attempt %d/%d) in %v",
i.Name, i.restarts, i.options.MaxRestarts, delayDuration) i.Name, i.restarts, maxRestarts, time.Duration(restartDelay)*time.Second)
// Unlock mutex during sleep to avoid blocking other operations // Create a cancellable context for the restart delay
restartCtx, cancel := context.WithCancel(context.Background())
i.restartCancel = cancel
// Sleep and restart without holding the lock
i.mu.Unlock() i.mu.Unlock()
time.Sleep(delayDuration)
i.mu.Lock()
// Attempt restart // Use context-aware sleep so it can be cancelled
select {
case <-time.After(time.Duration(restartDelay) * time.Second):
// Sleep completed normally, continue with restart
case <-restartCtx.Done():
// Restart was cancelled
log.Printf("Restart cancelled for instance %s", i.Name)
return
}
// Restart the instance
if err := i.Start(); err != nil { if err := i.Start(); err != nil {
log.Printf("Failed to restart instance %s: %v", i.Name, err) log.Printf("Failed to restart instance %s: %v", i.Name, err)
} else { } else {
log.Printf("Successfully restarted instance %s", i.Name) log.Printf("Successfully restarted instance %s", i.Name)
i.restarts = 0 // Reset restart count on successful restart // Reset restart count on successful restart
i.mu.Lock()
i.restarts = 0
i.restartCancel = nil // Clear the cancel function
i.mu.Unlock()
} }
} else if i.restarts >= *i.options.MaxRestarts {
log.Printf("Instance %s exceeded max restart attempts (%d)", i.Name, i.options.MaxRestarts)
} }
// validateRestartConditions checks if the instance should be restarted and returns the parameters
func (i *Instance) validateRestartConditions() (shouldRestart bool, maxRestarts int, restartDelay int) {
if i.options == nil {
log.Printf("Instance %s not restarting: options are nil", i.Name)
return false, 0, 0
}
if i.options.AutoRestart == nil || !*i.options.AutoRestart {
log.Printf("Instance %s not restarting: AutoRestart is disabled", i.Name)
return false, 0, 0
}
if i.options.MaxRestarts == nil {
log.Printf("Instance %s not restarting: MaxRestarts is nil", i.Name)
return false, 0, 0
}
if i.options.RestartDelay == nil {
log.Printf("Instance %s not restarting: RestartDelay is nil", i.Name)
return false, 0, 0
}
// Values are already validated during unmarshaling/SetOptions
maxRestarts = *i.options.MaxRestarts
restartDelay = *i.options.RestartDelay
if i.restarts >= maxRestarts {
log.Printf("Instance %s exceeded max restart attempts (%d)", i.Name, maxRestarts)
return false, 0, 0
}
return true, maxRestarts, restartDelay
} }
// MarshalJSON implements json.Marshaler for Instance // MarshalJSON implements json.Marshaler for Instance
func (i *Instance) MarshalJSON() ([]byte, error) { func (i *Instance) MarshalJSON() ([]byte, error) {
i.mu.Lock() // Use read lock since we're only reading data
defer i.mu.Unlock() i.mu.RLock()
defer i.mu.RUnlock()
// Create a temporary struct with exported fields for JSON marshalling // Create a temporary struct with exported fields for JSON marshalling
temp := struct { temp := struct {
@@ -366,8 +506,25 @@ func (i *Instance) UnmarshalJSON(data []byte) error {
i.Name = temp.Name i.Name = temp.Name
i.Running = temp.Running i.Running = temp.Running
// Handle options - ensure embedded LlamaServerOptions is initialized // Handle options with validation
if temp.Options != nil { if temp.Options != nil {
// Validate and sanitize restart parameters
if temp.Options.MaxRestarts != nil {
if *temp.Options.MaxRestarts < 0 {
log.Printf("Instance %s MaxRestarts value (%d) cannot be negative, setting to 0", i.Name, *temp.Options.MaxRestarts)
maxRestarts := 0
temp.Options.MaxRestarts = &maxRestarts
}
}
if temp.Options.RestartDelay != nil {
if *temp.Options.RestartDelay < 0 {
log.Printf("Instance %s RestartDelay value (%d) cannot be negative, setting to 0", i.Name, *temp.Options.RestartDelay)
restartDelay := 5
temp.Options.RestartDelay = &restartDelay
}
}
i.options = temp.Options i.options = temp.Options
} }

View File

@@ -2,6 +2,7 @@ package llamactl
import ( import (
"fmt" "fmt"
"sync"
) )
// InstanceManager defines the interface for managing instances of the llama server. // InstanceManager defines the interface for managing instances of the llama server.
@@ -18,6 +19,7 @@ type InstanceManager interface {
} }
type instanceManager struct { type instanceManager struct {
mu sync.RWMutex
instances map[string]*Instance instances map[string]*Instance
ports map[int]bool ports map[int]bool
instancesConfig InstancesConfig instancesConfig InstancesConfig
@@ -34,6 +36,9 @@ func NewInstanceManager(instancesConfig InstancesConfig) InstanceManager {
// ListInstances returns a list of all instances managed by the instance manager. // ListInstances returns a list of all instances managed by the instance manager.
func (im *instanceManager) ListInstances() ([]*Instance, error) { func (im *instanceManager) ListInstances() ([]*Instance, error) {
im.mu.RLock()
defer im.mu.RUnlock()
var instances []*Instance var instances []*Instance
for _, instance := range im.instances { for _, instance := range im.instances {
instances = append(instances, instance) instances = append(instances, instance)
@@ -58,6 +63,9 @@ func (im *instanceManager) CreateInstance(name string, options *CreateInstanceOp
return nil, err return nil, err
} }
im.mu.Lock()
defer im.mu.Unlock()
// Check if instance with this name already exists // Check if instance with this name already exists
if im.instances[name] != nil { if im.instances[name] != nil {
return nil, fmt.Errorf("instance with name %s already exists", name) return nil, fmt.Errorf("instance with name %s already exists", name)
@@ -80,6 +88,9 @@ func (im *instanceManager) CreateInstance(name string, options *CreateInstanceOp
// GetInstance retrieves an instance by its name. // GetInstance retrieves an instance by its name.
func (im *instanceManager) GetInstance(name string) (*Instance, error) { func (im *instanceManager) GetInstance(name string) (*Instance, error) {
im.mu.RLock()
defer im.mu.RUnlock()
instance, exists := im.instances[name] instance, exists := im.instances[name]
if !exists { if !exists {
return nil, fmt.Errorf("instance with name %s not found", name) return nil, fmt.Errorf("instance with name %s not found", name)
@@ -89,7 +100,10 @@ func (im *instanceManager) GetInstance(name string) (*Instance, error) {
// UpdateInstance updates the options of an existing instance and returns it. // UpdateInstance updates the options of an existing instance and returns it.
func (im *instanceManager) UpdateInstance(name string, options *CreateInstanceOptions) (*Instance, error) { func (im *instanceManager) UpdateInstance(name string, options *CreateInstanceOptions) (*Instance, error) {
im.mu.RLock()
instance, exists := im.instances[name] instance, exists := im.instances[name]
im.mu.RUnlock()
if !exists { if !exists {
return nil, fmt.Errorf("instance with name %s not found", name) return nil, fmt.Errorf("instance with name %s not found", name)
} }
@@ -109,6 +123,9 @@ func (im *instanceManager) UpdateInstance(name string, options *CreateInstanceOp
// DeleteInstance removes stopped instance by its name. // DeleteInstance removes stopped instance by its name.
func (im *instanceManager) DeleteInstance(name string) error { func (im *instanceManager) DeleteInstance(name string) error {
im.mu.Lock()
defer im.mu.Unlock()
_, exists := im.instances[name] _, exists := im.instances[name]
if !exists { if !exists {
return fmt.Errorf("instance with name %s not found", name) return fmt.Errorf("instance with name %s not found", name)
@@ -125,7 +142,10 @@ func (im *instanceManager) DeleteInstance(name string) error {
// StartInstance starts a stopped instance and returns it. // StartInstance starts a stopped instance and returns it.
// If the instance is already running, it returns an error. // If the instance is already running, it returns an error.
func (im *instanceManager) StartInstance(name string) (*Instance, error) { func (im *instanceManager) StartInstance(name string) (*Instance, error) {
im.mu.RLock()
instance, exists := im.instances[name] instance, exists := im.instances[name]
im.mu.RUnlock()
if !exists { if !exists {
return nil, fmt.Errorf("instance with name %s not found", name) return nil, fmt.Errorf("instance with name %s not found", name)
} }
@@ -142,7 +162,10 @@ func (im *instanceManager) StartInstance(name string) (*Instance, error) {
// StopInstance stops a running instance and returns it. // StopInstance stops a running instance and returns it.
func (im *instanceManager) StopInstance(name string) (*Instance, error) { func (im *instanceManager) StopInstance(name string) (*Instance, error) {
im.mu.RLock()
instance, exists := im.instances[name] instance, exists := im.instances[name]
im.mu.RUnlock()
if !exists { if !exists {
return nil, fmt.Errorf("instance with name %s not found", name) return nil, fmt.Errorf("instance with name %s not found", name)
} }
@@ -168,7 +191,10 @@ func (im *instanceManager) RestartInstance(name string) (*Instance, error) {
// GetInstanceLogs retrieves the logs for a specific instance by its name. // GetInstanceLogs retrieves the logs for a specific instance by its name.
func (im *instanceManager) GetInstanceLogs(name string) (string, error) { func (im *instanceManager) GetInstanceLogs(name string) (string, error) {
im.mu.RLock()
_, exists := im.instances[name] _, exists := im.instances[name]
im.mu.RUnlock()
if !exists { if !exists {
return "", fmt.Errorf("instance with name %s not found", name) return "", fmt.Errorf("instance with name %s not found", name)
} }