From 2a1bebeb24d01378e3af8f8f2c221911a23f6dc9 Mon Sep 17 00:00:00 2001 From: LordMathis Date: Sun, 26 Oct 2025 19:05:03 +0100 Subject: [PATCH] Improve health checks for instances --- webui/src/components/HealthBadge.tsx | 49 ++-- webui/src/contexts/InstancesContext.tsx | 10 + webui/src/hooks/useInstanceHealth.ts | 23 +- webui/src/lib/healthService.ts | 312 ++++++++++++++++++++---- webui/src/types/instance.ts | 10 +- 5 files changed, 324 insertions(+), 80 deletions(-) diff --git a/webui/src/components/HealthBadge.tsx b/webui/src/components/HealthBadge.tsx index 45c1960..e2b04e7 100644 --- a/webui/src/components/HealthBadge.tsx +++ b/webui/src/components/HealthBadge.tsx @@ -2,7 +2,7 @@ import React from "react"; import { Badge } from "@/components/ui/badge"; import type { HealthStatus } from "@/types/instance"; -import { CheckCircle, Loader2, XCircle } from "lucide-react"; +import { CheckCircle, Loader2, XCircle, Clock, AlertCircle } from "lucide-react"; interface HealthBadgeProps { health?: HealthStatus; @@ -10,53 +10,61 @@ interface HealthBadgeProps { const HealthBadge: React.FC = ({ health }) => { if (!health) { - health = { - status: "unknown", // Default to unknown if not provided - lastChecked: new Date(), // Default to current date - message: undefined, // No message by default - }; + return null; } const getIcon = () => { - switch (health.status) { - case "ok": + switch (health.state) { + case "ready": return ; case "loading": return ; - case "error": - return ; - case "unknown": + case "starting": return ; + case "restarting": + return ; + case "stopped": + return ; + case "error": + return ; case "failed": return ; } }; const getVariant = () => { - switch (health.status) { - case "ok": + switch (health.state) { + case "ready": return "default"; case "loading": return "outline"; + case "starting": + return "outline"; + case "restarting": + return "outline"; + case "stopped": + return "secondary"; case "error": return "destructive"; - case "unknown": - return "secondary"; case "failed": return "destructive"; } }; const getText = () => { - switch (health.status) { - case "ok": + switch (health.state) { + case "ready": return "Ready"; case "loading": return "Loading"; + case "starting": + return "Starting"; + case "restarting": + return "Restarting"; + case "stopped": + return "Stopped"; case "error": return "Error"; - case "unknown": - return "Unknown"; case "failed": return "Failed"; } @@ -66,10 +74,11 @@ const HealthBadge: React.FC = ({ health }) => { {getIcon()} {getText()} diff --git a/webui/src/contexts/InstancesContext.tsx b/webui/src/contexts/InstancesContext.tsx index 1aa1bd7..8a03083 100644 --- a/webui/src/contexts/InstancesContext.tsx +++ b/webui/src/contexts/InstancesContext.tsx @@ -2,6 +2,7 @@ import { type ReactNode, createContext, useContext, useState, useEffect, useCall import type { CreateInstanceOptions, Instance } from '@/types/instance' import { instancesApi } from '@/lib/api' import { useAuth } from '@/contexts/AuthContext' +import { healthService } from '@/lib/healthService' interface InstancesContextState { instances: Instance[] @@ -115,6 +116,9 @@ export const InstancesProvider = ({ children }: InstancesProviderProps) => { // Update only this instance's status updateInstanceInMap(name, { status: "running" }) + + // Trigger health check after starting + healthService.checkHealthAfterOperation(name, 'start') } catch (err) { setError(err instanceof Error ? err.message : 'Failed to start instance') } @@ -127,6 +131,9 @@ export const InstancesProvider = ({ children }: InstancesProviderProps) => { // Update only this instance's status updateInstanceInMap(name, { status: "stopped" }) + + // Trigger health check after stopping + healthService.checkHealthAfterOperation(name, 'stop') } catch (err) { setError(err instanceof Error ? err.message : 'Failed to stop instance') } @@ -139,6 +146,9 @@ export const InstancesProvider = ({ children }: InstancesProviderProps) => { // Update only this instance's status updateInstanceInMap(name, { status: "running" }) + + // Trigger health check after restarting + healthService.checkHealthAfterOperation(name, 'restart') } catch (err) { setError(err instanceof Error ? err.message : 'Failed to restart instance') } diff --git a/webui/src/hooks/useInstanceHealth.ts b/webui/src/hooks/useInstanceHealth.ts index eaa1597..87d818c 100644 --- a/webui/src/hooks/useInstanceHealth.ts +++ b/webui/src/hooks/useInstanceHealth.ts @@ -7,24 +7,23 @@ export function useInstanceHealth(instanceName: string, instanceStatus: Instance const [health, setHealth] = useState() useEffect(() => { - if (instanceStatus === "stopped") { - setHealth({ status: "unknown", lastChecked: new Date() }) - return - } - - if (instanceStatus === "failed") { - setHealth({ status: instanceStatus, lastChecked: new Date() }) - return - } - // Subscribe to health updates for this instance const unsubscribe = healthService.subscribe(instanceName, (healthStatus) => { setHealth(healthStatus) }) - // Cleanup subscription on unmount or when instanceStatus changes + // Cleanup subscription on unmount or when instance changes return unsubscribe + }, [instanceName]) + + // Trigger health check when instance status changes to active states + useEffect(() => { + if (instanceStatus === 'running' || instanceStatus === 'restarting') { + healthService.refreshHealth(instanceName).catch(error => { + console.error(`Failed to refresh health for ${instanceName}:`, error) + }) + } }, [instanceName, instanceStatus]) return health -} \ No newline at end of file +} diff --git a/webui/src/lib/healthService.ts b/webui/src/lib/healthService.ts index 025d29e..5830d53 100644 --- a/webui/src/lib/healthService.ts +++ b/webui/src/lib/healthService.ts @@ -1,51 +1,226 @@ -import { type HealthStatus } from '@/types/instance' +import { type HealthStatus, type InstanceStatus, type HealthState } from '@/types/instance' import { instancesApi } from '@/lib/api' type HealthCallback = (health: HealthStatus) => void +// Polling intervals based on health state (in milliseconds) +const POLLING_INTERVALS: Record = { + 'starting': 5000, // 5 seconds - frequent during startup + 'loading': 5000, // 5 seconds - model loading + 'restarting': 5000, // 5 seconds - restart in progress + 'ready': 60000, // 60 seconds - stable state + 'stopped': 0, // No polling + 'failed': 0, // No polling + 'error': 10000, // 10 seconds - retry on error +} + class HealthService { private intervals: Map = new Map() private callbacks: Map> = new Map() + private lastHealthState: Map = new Map() + private healthCache: Map = new Map() + private readonly CACHE_TTL = 2000 // 2 seconds cache + + /** + * Performs a two-tier health check: + * 1. Get instance status from backend (authoritative) + * 2. If running, perform HTTP health check + */ + async performHealthCheck(instanceName: string): Promise { + // Check cache first + const cached = this.healthCache.get(instanceName) + if (cached && Date.now() - cached.timestamp < this.CACHE_TTL) { + return cached.health + } - async checkHealth(instanceName: string): Promise { try { - await instancesApi.getHealth(instanceName) - - return { - status: 'ok', - lastChecked: new Date() - } - } catch (error) { - if (error instanceof Error) { - // Check if it's a 503 (service unavailable - loading) - if (error.message.includes('503')) { - return { - status: 'loading', - message: 'Instance is starting up', - lastChecked: new Date() + // Step 1: Get instance details (includes status) + const instance = await instancesApi.get(instanceName) + + // Step 2: If running, attempt HTTP health check + if (instance.status === 'running') { + try { + await instancesApi.getHealth(instanceName) + + // HTTP health check succeeded + const health: HealthStatus = { + state: 'ready', + instanceStatus: 'running', + lastChecked: new Date(), + source: 'http' + } + + this.updateCache(instanceName, health) + return health + + } catch (httpError) { + // HTTP health check failed while instance is running + // Re-verify instance is still running + try { + const verifyInstance = await instancesApi.get(instanceName) + + if (verifyInstance.status !== 'running') { + // Instance stopped/failed since our first check + const health: HealthStatus = { + state: this.mapStatusToHealthState(verifyInstance.status), + instanceStatus: verifyInstance.status, + lastChecked: new Date(), + source: 'backend' + } + + this.updateCache(instanceName, health) + return health + } + + // Instance still running but HTTP failed - classify error + const health = this.classifyHttpError(httpError as Error, 'running') + this.updateCache(instanceName, health) + return health + + } catch (verifyError) { + // Failed to verify - return error state + const health: HealthStatus = { + state: 'error', + instanceStatus: 'running', + lastChecked: new Date(), + error: 'Failed to verify instance status', + source: 'error' + } + + this.updateCache(instanceName, health) + return health } } - - return { - status: 'error', - message: error.message, - lastChecked: new Date() + } else { + // Instance not running - return backend status + const health: HealthStatus = { + state: this.mapStatusToHealthState(instance.status), + instanceStatus: instance.status, + lastChecked: new Date(), + source: 'backend' } + + this.updateCache(instanceName, health) + return health } - - return { - status: 'error', - message: 'Unknown error', - lastChecked: new Date() + + } catch (error) { + // Failed to get instance + const health: HealthStatus = { + state: 'error', + instanceStatus: 'unknown', + lastChecked: new Date(), + error: error instanceof Error ? error.message : 'Unknown error', + source: 'error' } + + this.updateCache(instanceName, health) + return health } } + /** + * Classifies HTTP errors into appropriate health states + */ + private classifyHttpError(error: Error, instanceStatus: InstanceStatus): HealthStatus { + const errorMessage = error.message.toLowerCase() + + // Parse HTTP status code from error message if available + if (errorMessage.includes('503')) { + return { + state: 'loading', + instanceStatus, + lastChecked: new Date(), + error: 'Service loading', + source: 'http' + } + } + + if (errorMessage.includes('connection refused') || + errorMessage.includes('econnrefused') || + errorMessage.includes('network error')) { + return { + state: 'starting', + instanceStatus, + lastChecked: new Date(), + error: 'Connection refused', + source: 'http' + } + } + + // Other HTTP errors + return { + state: 'error', + instanceStatus, + lastChecked: new Date(), + error: error.message, + source: 'http' + } + } + + /** + * Maps backend instance status to health state + */ + private mapStatusToHealthState(status: InstanceStatus): HealthState { + switch (status) { + case 'stopped': return 'stopped' + case 'running': return 'starting' // Unknown without HTTP check + case 'failed': return 'failed' + case 'restarting': return 'restarting' + default: return 'error' + } + } + + /** + * Updates health cache + */ + private updateCache(instanceName: string, health: HealthStatus): void { + this.healthCache.set(instanceName, { + health, + timestamp: Date.now() + }) + } + + /** + * Manually refresh health for an instance + */ + async refreshHealth(instanceName: string): Promise { + // Invalidate cache + this.healthCache.delete(instanceName) + + const health = await this.performHealthCheck(instanceName) + this.notifyCallbacks(instanceName, health) + + // Update last state and adjust polling interval if needed + const previousState = this.lastHealthState.get(instanceName) + this.lastHealthState.set(instanceName, health.state) + + if (previousState !== health.state) { + this.adjustPollingInterval(instanceName, health.state) + } + } + + /** + * Trigger health check after instance operation + */ + checkHealthAfterOperation(instanceName: string, operation: 'start' | 'stop' | 'restart'): void { + // Invalidate cache immediately + this.healthCache.delete(instanceName) + + // Perform immediate health check + this.refreshHealth(instanceName).catch(error => { + console.error(`Failed to check health after ${operation}:`, error) + }) + } + + /** + * Subscribe to health updates for an instance + */ subscribe(instanceName: string, callback: HealthCallback): () => void { if (!this.callbacks.has(instanceName)) { this.callbacks.set(instanceName, new Set()) } - + this.callbacks.get(instanceName)!.add(callback) // Start health checking if this is the first subscriber @@ -58,36 +233,75 @@ class HealthService { const callbacks = this.callbacks.get(instanceName) if (callbacks) { callbacks.delete(callback) - + // Stop health checking if no more subscribers if (callbacks.size === 0) { this.stopHealthCheck(instanceName) this.callbacks.delete(instanceName) + this.lastHealthState.delete(instanceName) + this.healthCache.delete(instanceName) } } } } + /** + * Start health checking for an instance + */ private startHealthCheck(instanceName: string): void { if (this.intervals.has(instanceName)) { return // Already checking } - // Initial check with delay - setTimeout(async () => { - const health = await this.checkHealth(instanceName) - this.notifyCallbacks(instanceName, health) - - // Start periodic checks - const interval = setInterval(async () => { - const health = await this.checkHealth(instanceName) - this.notifyCallbacks(instanceName, health) - }, 60000) - - this.intervals.set(instanceName, interval) - }, 5000) + // Initial check immediately + this.refreshHealth(instanceName).then(() => { + const currentState = this.lastHealthState.get(instanceName) + if (currentState) { + this.adjustPollingInterval(instanceName, currentState) + } + }).catch(error => { + console.error(`Failed to start health check for ${instanceName}:`, error) + }) } + /** + * Adjust polling interval based on current health state + */ + private adjustPollingInterval(instanceName: string, state: HealthState): void { + // Clear existing interval + this.stopHealthCheck(instanceName) + + const pollInterval = POLLING_INTERVALS[state] + + // Don't poll for stable states (stopped, failed, ready has long interval) + if (pollInterval === 0) { + return + } + + // Start new interval with appropriate timing + const interval = setInterval(async () => { + try { + const health = await this.performHealthCheck(instanceName) + this.notifyCallbacks(instanceName, health) + + // Check if state changed and adjust interval + const previousState = this.lastHealthState.get(instanceName) + this.lastHealthState.set(instanceName, health.state) + + if (previousState !== health.state) { + this.adjustPollingInterval(instanceName, health.state) + } + } catch (error) { + console.error(`Health check failed for ${instanceName}:`, error) + } + }, pollInterval) + + this.intervals.set(instanceName, interval) + } + + /** + * Stop health checking for an instance + */ private stopHealthCheck(instanceName: string): void { const interval = this.intervals.get(instanceName) if (interval) { @@ -96,6 +310,9 @@ class HealthService { } } + /** + * Notify all callbacks with health update + */ private notifyCallbacks(instanceName: string, health: HealthStatus): void { const callbacks = this.callbacks.get(instanceName) if (callbacks) { @@ -103,16 +320,21 @@ class HealthService { } } - stopAll(): void { + /** + * Stop all health checking and cleanup + */ + destroy(): void { this.intervals.forEach(interval => clearInterval(interval)) this.intervals.clear() this.callbacks.clear() + this.lastHealthState.clear() + this.healthCache.clear() } } export const healthService = new HealthService() -// Export the individual checkHealth function as well +// Export the individual performHealthCheck function as well export async function checkHealth(instanceName: string): Promise { - return healthService.checkHealth(instanceName) -} \ No newline at end of file + return healthService.performHealthCheck(instanceName) +} diff --git a/webui/src/types/instance.ts b/webui/src/types/instance.ts index 074e2f2..97d3cac 100644 --- a/webui/src/types/instance.ts +++ b/webui/src/types/instance.ts @@ -11,12 +11,16 @@ export const BackendType = { export type BackendTypeValue = typeof BackendType[keyof typeof BackendType] -export type InstanceStatus = 'running' | 'stopped' | 'failed' +export type InstanceStatus = 'running' | 'stopped' | 'failed' | 'restarting' + +export type HealthState = 'stopped' | 'starting' | 'loading' | 'ready' | 'error' | 'failed' | 'restarting' export interface HealthStatus { - status: 'ok' | 'loading' | 'error' | 'unknown' | 'failed' - message?: string + state: HealthState + instanceStatus: InstanceStatus | 'unknown' lastChecked: Date + error?: string + source: 'backend' | 'http' | 'error' } export interface Instance {