mirror of
https://github.com/lordmathis/llamactl.git
synced 2025-11-05 16:44:22 +00:00
Improve health checks for instances
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
import React from "react";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import type { HealthStatus } from "@/types/instance";
|
||||
import { CheckCircle, Loader2, XCircle } from "lucide-react";
|
||||
import { CheckCircle, Loader2, XCircle, Clock, AlertCircle } from "lucide-react";
|
||||
|
||||
interface HealthBadgeProps {
|
||||
health?: HealthStatus;
|
||||
@@ -10,53 +10,61 @@ interface HealthBadgeProps {
|
||||
|
||||
const HealthBadge: React.FC<HealthBadgeProps> = ({ health }) => {
|
||||
if (!health) {
|
||||
health = {
|
||||
status: "unknown", // Default to unknown if not provided
|
||||
lastChecked: new Date(), // Default to current date
|
||||
message: undefined, // No message by default
|
||||
};
|
||||
return null;
|
||||
}
|
||||
|
||||
const getIcon = () => {
|
||||
switch (health.status) {
|
||||
case "ok":
|
||||
switch (health.state) {
|
||||
case "ready":
|
||||
return <CheckCircle className="h-3 w-3" />;
|
||||
case "loading":
|
||||
return <Loader2 className="h-3 w-3 animate-spin" />;
|
||||
case "error":
|
||||
return <XCircle className="h-3 w-3" />;
|
||||
case "unknown":
|
||||
case "starting":
|
||||
return <Loader2 className="h-3 w-3 animate-spin" />;
|
||||
case "restarting":
|
||||
return <Loader2 className="h-3 w-3 animate-spin" />;
|
||||
case "stopped":
|
||||
return <Clock className="h-3 w-3" />;
|
||||
case "error":
|
||||
return <AlertCircle className="h-3 w-3" />;
|
||||
case "failed":
|
||||
return <XCircle className="h-3 w-3" />;
|
||||
}
|
||||
};
|
||||
|
||||
const getVariant = () => {
|
||||
switch (health.status) {
|
||||
case "ok":
|
||||
switch (health.state) {
|
||||
case "ready":
|
||||
return "default";
|
||||
case "loading":
|
||||
return "outline";
|
||||
case "starting":
|
||||
return "outline";
|
||||
case "restarting":
|
||||
return "outline";
|
||||
case "stopped":
|
||||
return "secondary";
|
||||
case "error":
|
||||
return "destructive";
|
||||
case "unknown":
|
||||
return "secondary";
|
||||
case "failed":
|
||||
return "destructive";
|
||||
}
|
||||
};
|
||||
|
||||
const getText = () => {
|
||||
switch (health.status) {
|
||||
case "ok":
|
||||
switch (health.state) {
|
||||
case "ready":
|
||||
return "Ready";
|
||||
case "loading":
|
||||
return "Loading";
|
||||
case "starting":
|
||||
return "Starting";
|
||||
case "restarting":
|
||||
return "Restarting";
|
||||
case "stopped":
|
||||
return "Stopped";
|
||||
case "error":
|
||||
return "Error";
|
||||
case "unknown":
|
||||
return "Unknown";
|
||||
case "failed":
|
||||
return "Failed";
|
||||
}
|
||||
@@ -66,10 +74,11 @@ const HealthBadge: React.FC<HealthBadgeProps> = ({ health }) => {
|
||||
<Badge
|
||||
variant={getVariant()}
|
||||
className={`flex items-center gap-1.5 ${
|
||||
health.status === "ok"
|
||||
health.state === "ready"
|
||||
? "bg-green-100 text-green-800 border-green-200 dark:bg-green-900 dark:text-green-200 dark:border-green-800"
|
||||
: ""
|
||||
}`}
|
||||
title={health.error || `Source: ${health.source}`}
|
||||
>
|
||||
{getIcon()}
|
||||
<span className="text-xs">{getText()}</span>
|
||||
|
||||
@@ -2,6 +2,7 @@ import { type ReactNode, createContext, useContext, useState, useEffect, useCall
|
||||
import type { CreateInstanceOptions, Instance } from '@/types/instance'
|
||||
import { instancesApi } from '@/lib/api'
|
||||
import { useAuth } from '@/contexts/AuthContext'
|
||||
import { healthService } from '@/lib/healthService'
|
||||
|
||||
interface InstancesContextState {
|
||||
instances: Instance[]
|
||||
@@ -115,6 +116,9 @@ export const InstancesProvider = ({ children }: InstancesProviderProps) => {
|
||||
|
||||
// Update only this instance's status
|
||||
updateInstanceInMap(name, { status: "running" })
|
||||
|
||||
// Trigger health check after starting
|
||||
healthService.checkHealthAfterOperation(name, 'start')
|
||||
} catch (err) {
|
||||
setError(err instanceof Error ? err.message : 'Failed to start instance')
|
||||
}
|
||||
@@ -127,6 +131,9 @@ export const InstancesProvider = ({ children }: InstancesProviderProps) => {
|
||||
|
||||
// Update only this instance's status
|
||||
updateInstanceInMap(name, { status: "stopped" })
|
||||
|
||||
// Trigger health check after stopping
|
||||
healthService.checkHealthAfterOperation(name, 'stop')
|
||||
} catch (err) {
|
||||
setError(err instanceof Error ? err.message : 'Failed to stop instance')
|
||||
}
|
||||
@@ -139,6 +146,9 @@ export const InstancesProvider = ({ children }: InstancesProviderProps) => {
|
||||
|
||||
// Update only this instance's status
|
||||
updateInstanceInMap(name, { status: "running" })
|
||||
|
||||
// Trigger health check after restarting
|
||||
healthService.checkHealthAfterOperation(name, 'restart')
|
||||
} catch (err) {
|
||||
setError(err instanceof Error ? err.message : 'Failed to restart instance')
|
||||
}
|
||||
|
||||
@@ -7,23 +7,22 @@ export function useInstanceHealth(instanceName: string, instanceStatus: Instance
|
||||
const [health, setHealth] = useState<HealthStatus | undefined>()
|
||||
|
||||
useEffect(() => {
|
||||
if (instanceStatus === "stopped") {
|
||||
setHealth({ status: "unknown", lastChecked: new Date() })
|
||||
return
|
||||
}
|
||||
|
||||
if (instanceStatus === "failed") {
|
||||
setHealth({ status: instanceStatus, lastChecked: new Date() })
|
||||
return
|
||||
}
|
||||
|
||||
// Subscribe to health updates for this instance
|
||||
const unsubscribe = healthService.subscribe(instanceName, (healthStatus) => {
|
||||
setHealth(healthStatus)
|
||||
})
|
||||
|
||||
// Cleanup subscription on unmount or when instanceStatus changes
|
||||
// Cleanup subscription on unmount or when instance changes
|
||||
return unsubscribe
|
||||
}, [instanceName])
|
||||
|
||||
// Trigger health check when instance status changes to active states
|
||||
useEffect(() => {
|
||||
if (instanceStatus === 'running' || instanceStatus === 'restarting') {
|
||||
healthService.refreshHealth(instanceName).catch(error => {
|
||||
console.error(`Failed to refresh health for ${instanceName}:`, error)
|
||||
})
|
||||
}
|
||||
}, [instanceName, instanceStatus])
|
||||
|
||||
return health
|
||||
|
||||
@@ -1,46 +1,221 @@
|
||||
import { type HealthStatus } from '@/types/instance'
|
||||
import { type HealthStatus, type InstanceStatus, type HealthState } from '@/types/instance'
|
||||
import { instancesApi } from '@/lib/api'
|
||||
|
||||
type HealthCallback = (health: HealthStatus) => void
|
||||
|
||||
// Polling interval for each health state, in milliseconds.
// An interval of 0 means "do not poll while in this state".
// Typed as Readonly so the shared table cannot be reassigned per-key at runtime call sites.
const POLLING_INTERVALS: Readonly<Record<HealthState, number>> = {
  starting: 5000,   // 5 seconds - frequent during startup
  loading: 5000,    // 5 seconds - model loading
  restarting: 5000, // 5 seconds - restart in progress
  ready: 60000,     // 60 seconds - stable state
  stopped: 0,       // No polling
  failed: 0,        // No polling
  error: 10000,     // 10 seconds - retry on error
}
|
||||
|
||||
class HealthService {
|
||||
private intervals: Map<string, NodeJS.Timeout> = new Map()
|
||||
private callbacks: Map<string, Set<HealthCallback>> = new Map()
|
||||
private lastHealthState: Map<string, HealthState> = new Map()
|
||||
private healthCache: Map<string, { health: HealthStatus; timestamp: number }> = new Map()
|
||||
private readonly CACHE_TTL = 2000 // 2 seconds cache
|
||||
|
||||
/**
|
||||
* Performs a two-tier health check:
|
||||
* 1. Get instance status from backend (authoritative)
|
||||
* 2. If running, perform HTTP health check
|
||||
*/
|
||||
async performHealthCheck(instanceName: string): Promise<HealthStatus> {
|
||||
// Check cache first
|
||||
const cached = this.healthCache.get(instanceName)
|
||||
if (cached && Date.now() - cached.timestamp < this.CACHE_TTL) {
|
||||
return cached.health
|
||||
}
|
||||
|
||||
async checkHealth(instanceName: string): Promise<HealthStatus> {
|
||||
try {
|
||||
await instancesApi.getHealth(instanceName)
|
||||
// Step 1: Get instance details (includes status)
|
||||
const instance = await instancesApi.get(instanceName)
|
||||
|
||||
return {
|
||||
status: 'ok',
|
||||
lastChecked: new Date()
|
||||
}
|
||||
} catch (error) {
|
||||
if (error instanceof Error) {
|
||||
// Check if it's a 503 (service unavailable - loading)
|
||||
if (error.message.includes('503')) {
|
||||
return {
|
||||
status: 'loading',
|
||||
message: 'Instance is starting up',
|
||||
lastChecked: new Date()
|
||||
// Step 2: If running, attempt HTTP health check
|
||||
if (instance.status === 'running') {
|
||||
try {
|
||||
await instancesApi.getHealth(instanceName)
|
||||
|
||||
// HTTP health check succeeded
|
||||
const health: HealthStatus = {
|
||||
state: 'ready',
|
||||
instanceStatus: 'running',
|
||||
lastChecked: new Date(),
|
||||
source: 'http'
|
||||
}
|
||||
|
||||
this.updateCache(instanceName, health)
|
||||
return health
|
||||
|
||||
} catch (httpError) {
|
||||
// HTTP health check failed while instance is running
|
||||
// Re-verify instance is still running
|
||||
try {
|
||||
const verifyInstance = await instancesApi.get(instanceName)
|
||||
|
||||
if (verifyInstance.status !== 'running') {
|
||||
// Instance stopped/failed since our first check
|
||||
const health: HealthStatus = {
|
||||
state: this.mapStatusToHealthState(verifyInstance.status),
|
||||
instanceStatus: verifyInstance.status,
|
||||
lastChecked: new Date(),
|
||||
source: 'backend'
|
||||
}
|
||||
|
||||
this.updateCache(instanceName, health)
|
||||
return health
|
||||
}
|
||||
|
||||
// Instance still running but HTTP failed - classify error
|
||||
const health = this.classifyHttpError(httpError as Error, 'running')
|
||||
this.updateCache(instanceName, health)
|
||||
return health
|
||||
|
||||
} catch (verifyError) {
|
||||
// Failed to verify - return error state
|
||||
const health: HealthStatus = {
|
||||
state: 'error',
|
||||
instanceStatus: 'running',
|
||||
lastChecked: new Date(),
|
||||
error: 'Failed to verify instance status',
|
||||
source: 'error'
|
||||
}
|
||||
|
||||
this.updateCache(instanceName, health)
|
||||
return health
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
status: 'error',
|
||||
message: error.message,
|
||||
lastChecked: new Date()
|
||||
} else {
|
||||
// Instance not running - return backend status
|
||||
const health: HealthStatus = {
|
||||
state: this.mapStatusToHealthState(instance.status),
|
||||
instanceStatus: instance.status,
|
||||
lastChecked: new Date(),
|
||||
source: 'backend'
|
||||
}
|
||||
|
||||
this.updateCache(instanceName, health)
|
||||
return health
|
||||
}
|
||||
|
||||
return {
|
||||
status: 'error',
|
||||
message: 'Unknown error',
|
||||
lastChecked: new Date()
|
||||
} catch (error) {
|
||||
// Failed to get instance
|
||||
const health: HealthStatus = {
|
||||
state: 'error',
|
||||
instanceStatus: 'unknown',
|
||||
lastChecked: new Date(),
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
source: 'error'
|
||||
}
|
||||
|
||||
this.updateCache(instanceName, health)
|
||||
return health
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Classifies a failed HTTP health probe into a health state.
 *
 * Matching is done on the lower-cased error message:
 * - contains "503" -> `loading`
 * - contains "connection refused", "econnrefused" or "network error" -> `starting`
 * - anything else -> `error`, carrying the original message
 *
 * @param error - The value thrown by the HTTP health request. The caller passes a
 *   caught value cast with `as Error`, so it is not guaranteed to be an `Error`.
 * @param instanceStatus - Backend status to record on the returned health object.
 * @returns A health status whose `source` is always `'http'`.
 */
private classifyHttpError(error: Error, instanceStatus: InstanceStatus): HealthStatus {
  // Do not assume `.message` exists: a thrown string or plain object would otherwise
  // raise a TypeError here and be misreported by the caller's outer catch.
  const rawMessage = typeof error?.message === 'string' ? error.message : String(error)
  const errorMessage = rawMessage.toLowerCase()

  // Parse HTTP status code from error message if available
  if (errorMessage.includes('503')) {
    return {
      state: 'loading',
      instanceStatus,
      lastChecked: new Date(),
      error: 'Service loading',
      source: 'http'
    }
  }

  const connectionRefused =
    errorMessage.includes('connection refused') ||
    errorMessage.includes('econnrefused') ||
    errorMessage.includes('network error')

  if (connectionRefused) {
    return {
      state: 'starting',
      instanceStatus,
      lastChecked: new Date(),
      error: 'Connection refused',
      source: 'http'
    }
  }

  // Other HTTP errors: surface the original (non-lower-cased) message
  return {
    state: 'error',
    instanceStatus,
    lastChecked: new Date(),
    error: rawMessage,
    source: 'http'
  }
}
|
||||
|
||||
/**
 * Translates a backend instance status into the health state reported to subscribers.
 *
 * `running` maps to `starting` because readiness is unknown without an HTTP check.
 * Any status not listed here maps to `error`.
 *
 * @param status - Status reported by the backend for the instance.
 * @returns The corresponding health state.
 */
private mapStatusToHealthState(status: InstanceStatus): HealthState {
  const stateByStatus = new Map<InstanceStatus, HealthState>([
    ['stopped', 'stopped'],
    ['running', 'starting'], // Unknown without HTTP check
    ['failed', 'failed'],
    ['restarting', 'restarting'],
  ])

  return stateByStatus.get(status) ?? 'error'
}
|
||||
|
||||
/**
 * Stores a health result in the cache for the given instance, stamped with the
 * current time.
 *
 * @param instanceName - Cache key.
 * @param health - Health result to remember.
 */
private updateCache(instanceName: string, health: HealthStatus): void {
  const entry = { health, timestamp: Date.now() }
  this.healthCache.set(instanceName, entry)
}
|
||||
|
||||
/**
 * Manually refresh health for an instance.
 *
 * Drops the cached entry, performs a fresh health check, notifies subscribers and,
 * when the health state changed, re-arms polling at the interval for the new state.
 *
 * If nobody is subscribed to the instance once the check completes, no state is
 * recorded and no poll timer is started. Otherwise a call made for an unsubscribed
 * instance (e.g. right after a start/stop operation, or after the last subscriber
 * left while the check was in flight) would leave a timer running that nothing stops.
 *
 * @param instanceName - Name of the instance to check.
 * @returns A promise that settles once the check and any re-scheduling are done.
 */
async refreshHealth(instanceName: string): Promise<void> {
  // Invalidate cache so performHealthCheck does not return a cached result
  this.healthCache.delete(instanceName)

  const health = await this.performHealthCheck(instanceName)

  // Subscriptions may have changed during the await; without subscribers there is
  // nobody to notify and polling must not be (re)started.
  if (!this.callbacks.has(instanceName)) {
    return
  }

  this.notifyCallbacks(instanceName, health)

  // Update last state and adjust polling interval if needed
  const previousState = this.lastHealthState.get(instanceName)
  this.lastHealthState.set(instanceName, health.state)

  if (previousState !== health.state) {
    this.adjustPollingInterval(instanceName, health.state)
  }
}
|
||||
|
||||
/**
 * Kicks off a health refresh right after a lifecycle operation on an instance.
 *
 * Fire-and-forget: the refresh runs in the background and a failure is only logged.
 *
 * @param instanceName - Name of the instance that was operated on.
 * @param operation - The operation that just ran; used in the log message only.
 */
checkHealthAfterOperation(instanceName: string, operation: 'start' | 'stop' | 'restart'): void {
  // Drop any cached result so the refresh reflects the post-operation state
  this.healthCache.delete(instanceName)

  const logFailure = (error: unknown): void => {
    console.error(`Failed to check health after ${operation}:`, error)
  }

  // Run the check now; errors are reported, never thrown to the caller
  this.refreshHealth(instanceName).catch(logFailure)
}
|
||||
|
||||
/**
|
||||
* Subscribe to health updates for an instance
|
||||
*/
|
||||
subscribe(instanceName: string, callback: HealthCallback): () => void {
|
||||
if (!this.callbacks.has(instanceName)) {
|
||||
this.callbacks.set(instanceName, new Set())
|
||||
@@ -63,31 +238,70 @@ class HealthService {
|
||||
if (callbacks.size === 0) {
|
||||
this.stopHealthCheck(instanceName)
|
||||
this.callbacks.delete(instanceName)
|
||||
this.lastHealthState.delete(instanceName)
|
||||
this.healthCache.delete(instanceName)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start health checking for an instance
|
||||
*/
|
||||
private startHealthCheck(instanceName: string): void {
|
||||
if (this.intervals.has(instanceName)) {
|
||||
return // Already checking
|
||||
}
|
||||
|
||||
// Initial check with delay
|
||||
setTimeout(async () => {
|
||||
const health = await this.checkHealth(instanceName)
|
||||
this.notifyCallbacks(instanceName, health)
|
||||
|
||||
// Start periodic checks
|
||||
const interval = setInterval(async () => {
|
||||
const health = await this.checkHealth(instanceName)
|
||||
this.notifyCallbacks(instanceName, health)
|
||||
}, 60000)
|
||||
|
||||
this.intervals.set(instanceName, interval)
|
||||
}, 5000)
|
||||
// Initial check immediately
|
||||
this.refreshHealth(instanceName).then(() => {
|
||||
const currentState = this.lastHealthState.get(instanceName)
|
||||
if (currentState) {
|
||||
this.adjustPollingInterval(instanceName, currentState)
|
||||
}
|
||||
}).catch(error => {
|
||||
console.error(`Failed to start health check for ${instanceName}:`, error)
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Adjust polling interval based on current health state.
 *
 * Stops the current poll for the instance (via stopHealthCheck) and, unless the
 * interval configured for `state` is 0, starts a new timer. Each tick runs a health
 * check, notifies subscribers and re-arms the timer when the state changed.
 *
 * @param instanceName - Name of the instance being polled.
 * @param state - Health state that selects the polling interval.
 */
private adjustPollingInterval(instanceName: string, state: HealthState): void {
  // Clear existing interval
  this.stopHealthCheck(instanceName)

  const pollInterval = POLLING_INTERVALS[state]

  // An interval of 0 (stopped, failed) means no polling. Also refuse a missing or
  // negative entry: passing it to setInterval would poll almost continuously.
  if (!(pollInterval > 0)) {
    return
  }

  // Start new interval with appropriate timing
  const interval = setInterval(async () => {
    try {
      const health = await this.performHealthCheck(instanceName)

      // The check is asynchronous: by now this timer may have been replaced, or the
      // last subscriber may have left. Acting on a stale tick would re-arm polling
      // for an instance nobody is watching.
      const isStale =
        this.intervals.get(instanceName) !== interval || !this.callbacks.has(instanceName)
      if (isStale) {
        return
      }

      this.notifyCallbacks(instanceName, health)

      // Check if state changed and adjust interval
      const previousState = this.lastHealthState.get(instanceName)
      this.lastHealthState.set(instanceName, health.state)

      if (previousState !== health.state) {
        this.adjustPollingInterval(instanceName, health.state)
      }
    } catch (error) {
      console.error(`Health check failed for ${instanceName}:`, error)
    }
  }, pollInterval)

  this.intervals.set(instanceName, interval)
}
|
||||
|
||||
/**
|
||||
* Stop health checking for an instance
|
||||
*/
|
||||
private stopHealthCheck(instanceName: string): void {
|
||||
const interval = this.intervals.get(instanceName)
|
||||
if (interval) {
|
||||
@@ -96,6 +310,9 @@ class HealthService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Notify all callbacks with health update
|
||||
*/
|
||||
private notifyCallbacks(instanceName: string, health: HealthStatus): void {
|
||||
const callbacks = this.callbacks.get(instanceName)
|
||||
if (callbacks) {
|
||||
@@ -103,16 +320,21 @@ class HealthService {
|
||||
}
|
||||
}
|
||||
|
||||
stopAll(): void {
|
||||
/**
 * Shuts the service down: cancels every poll timer and empties all per-instance
 * bookkeeping (timers, subscriber callbacks, last known states, cached results).
 */
destroy(): void {
  for (const timer of this.intervals.values()) {
    clearInterval(timer)
  }

  this.intervals.clear()
  this.callbacks.clear()
  this.lastHealthState.clear()
  this.healthCache.clear()
}
|
||||
}
|
||||
|
||||
export const healthService = new HealthService()
|
||||
|
||||
// Export the individual checkHealth function as well
|
||||
// Export the individual performHealthCheck function as well
|
||||
export async function checkHealth(instanceName: string): Promise<HealthStatus> {
|
||||
return healthService.checkHealth(instanceName)
|
||||
return healthService.performHealthCheck(instanceName)
|
||||
}
|
||||
@@ -11,12 +11,16 @@ export const BackendType = {
|
||||
|
||||
export type BackendTypeValue = typeof BackendType[keyof typeof BackendType]
|
||||
|
||||
export type InstanceStatus = 'running' | 'stopped' | 'failed'
|
||||
export type InstanceStatus = 'running' | 'stopped' | 'failed' | 'restarting'
|
||||
|
||||
export type HealthState = 'stopped' | 'starting' | 'loading' | 'ready' | 'error' | 'failed' | 'restarting'
|
||||
|
||||
export interface HealthStatus {
|
||||
status: 'ok' | 'loading' | 'error' | 'unknown' | 'failed'
|
||||
message?: string
|
||||
state: HealthState
|
||||
instanceStatus: InstanceStatus | 'unknown'
|
||||
lastChecked: Date
|
||||
error?: string
|
||||
source: 'backend' | 'http' | 'error'
|
||||
}
|
||||
|
||||
export interface Instance {
|
||||
|
||||
Reference in New Issue
Block a user