513 lines
20 KiB
Python
513 lines
20 KiB
Python
from fastapi import APIRouter, HTTPException
|
|
from fastapi.responses import HTMLResponse
|
|
import psutil
|
|
import asyncio
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, List
|
|
import json
|
|
|
|
# Configure the module-level logger
|
|
logger = logging.getLogger(__name__)
|
|
|
|
from workflow_service.config import settings
|
|
from workflow_service.database.api import get_workflow_runs_recent, get_workflow_run_nodes
|
|
from workflow_service.comfy.comfy_queue import WorkflowQueueManager
|
|
from workflow_service.comfy.comfy_server import server_manager
|
|
|
|
monitor_router = APIRouter(prefix="/monitor", tags=["监控"])
|
|
|
|
# Global queue manager instance shared by the queue endpoints
|
|
queue_manager = WorkflowQueueManager()
|
|
|
|
@monitor_router.get("/", response_class=HTMLResponse)
async def monitor_dashboard():
    """Serve the self-contained HTML monitoring dashboard.

    The page itself is static. The embedded script pulls live data from the
    JSON endpoints ``/monitor/system-stats``, ``/monitor/task-stats``,
    ``/monitor/server-status`` and ``/monitor/recent-tasks`` when the page
    loads, every 30 seconds afterwards, and whenever the refresh button is
    clicked.

    Every value received from those endpoints is HTML-escaped before it is
    interpolated into markup, so server error strings, URLs or workflow
    names containing ``<``, ``&`` or quotes cannot inject markup or script.
    Each endpoint is refreshed independently: a failing request is logged to
    the browser console and does not prevent the other sections from
    updating.

    Returns:
        HTMLResponse: the dashboard page.
    """
    html_content = """
    <!DOCTYPE html>
    <html lang="zh-CN">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>ComfyUI 工作流服务监控</title>
        <style>
            body {
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                margin: 0;
                padding: 20px;
                background-color: #f5f5f5;
            }
            .container {
                max-width: 1200px;
                margin: 0 auto;
            }
            .header {
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                color: white;
                padding: 20px;
                border-radius: 10px;
                margin-bottom: 20px;
                text-align: center;
            }
            .stats-grid {
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
                gap: 20px;
                margin-bottom: 20px;
            }
            .stat-card {
                background: white;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                text-align: center;
            }
            .stat-number {
                font-size: 2.5em;
                font-weight: bold;
                color: #667eea;
                margin: 10px 0;
            }
            .stat-label {
                color: #666;
                font-size: 0.9em;
            }
            .section {
                background: white;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                margin-bottom: 20px;
            }
            .section h3 {
                margin-top: 0;
                color: #333;
                border-bottom: 2px solid #667eea;
                padding-bottom: 10px;
            }
            .server-status {
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
                gap: 15px;
                margin-top: 15px;
            }
            .server-card {
                border: 1px solid #ddd;
                border-radius: 8px;
                padding: 15px;
                background: #fafafa;
            }
            .server-card.online {
                border-color: #4CAF50;
                background: #f1f8e9;
            }
            .server-card.offline {
                border-color: #f44336;
                background: #ffebee;
            }
            .status-indicator {
                display: inline-block;
                width: 12px;
                height: 12px;
                border-radius: 50%;
                margin-right: 8px;
            }
            .status-online { background-color: #4CAF50; }
            .status-offline { background-color: #f44336; }
            .refresh-btn {
                background: #667eea;
                color: white;
                border: none;
                padding: 10px 20px;
                border-radius: 5px;
                cursor: pointer;
                font-size: 14px;
                margin-bottom: 20px;
            }
            .refresh-btn:hover {
                background: #5a6fd8;
            }
            .task-list {
                max-height: 400px;
                overflow-y: auto;
            }
            .task-item {
                border: 1px solid #ddd;
                border-radius: 5px;
                padding: 10px;
                margin: 5px 0;
                background: #f9f9f9;
            }
            .task-status {
                display: inline-block;
                padding: 2px 8px;
                border-radius: 3px;
                font-size: 0.8em;
                color: white;
            }
            .status-pending { background: #ff9800; }
            .status-running { background: #2196F3; }
            .status-completed { background: #4CAF50; }
            .status-failed { background: #f44336; }
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>🚀 ComfyUI 工作流服务监控</h1>
                <p>实时监控系统状态、任务队列和服务器健康状态</p>
            </div>

            <button class="refresh-btn" onclick="refreshData()">🔄 刷新数据</button>

            <div class="stats-grid">
                <div class="stat-card">
                    <div class="stat-number" id="cpu-usage">--</div>
                    <div class="stat-label">CPU 使用率</div>
                </div>
                <div class="stat-card">
                    <div class="stat-number" id="memory-usage">--</div>
                    <div class="stat-label">内存使用率</div>
                </div>
                <div class="stat-card">
                    <div class="stat-number" id="running-tasks">--</div>
                    <div class="stat-label">运行中任务</div>
                </div>
                <div class="stat-card">
                    <div class="stat-number" id="pending-tasks">--</div>
                    <div class="stat-label">等待中任务</div>
                </div>
            </div>

            <div class="section">
                <h3>🖥️ 服务器状态</h3>
                <div class="server-status" id="server-status">
                    <!-- 服务器状态将在这里动态加载 -->
                </div>
            </div>

            <div class="section">
                <h3>📋 最近任务</h3>
                <div class="task-list" id="recent-tasks">
                    <!-- 最近任务将在这里动态加载 -->
                </div>
            </div>
        </div>

        <script>
            // Characters replaced before a value is placed into HTML text or
            // into a quoted attribute.
            const HTML_ESCAPES = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};

            function escapeHtml(value) {
                const text = (value === null || value === undefined) ? '' : String(value);
                return text.replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
            }

            // 格式化时间
            function formatTime(timeStr) {
                if (!timeStr) return 'N/A';
                const parsed = new Date(timeStr);
                // An unparseable value produces an Invalid Date rather than an
                // exception, so fall back to the raw value explicitly.
                return isNaN(parsed.getTime()) ? String(timeStr) : parsed.toLocaleString();
            }

            // Fetch a JSON endpoint and reject on non-2xx answers so an error
            // payload is never rendered as if it were data.
            async function fetchJson(url) {
                const response = await fetch(url);
                if (!response.ok) {
                    throw new Error(url + ' returned HTTP ' + response.status);
                }
                return response.json();
            }

            // Load one endpoint and pass its payload to render(); a failure is
            // logged and leaves the other dashboard sections untouched.
            async function refreshSection(url, render) {
                try {
                    render(await fetchJson(url));
                } catch (error) {
                    console.error('刷新数据失败:', url, error);
                }
            }

            async function refreshData() {
                await Promise.all([
                    // 获取系统状态
                    refreshSection('/monitor/system-stats', systemStats => {
                        document.getElementById('cpu-usage').textContent = systemStats.cpu_percent + '%';
                        document.getElementById('memory-usage').textContent = systemStats.memory_percent + '%';
                    }),
                    // 获取任务状态
                    refreshSection('/monitor/task-stats', taskStats => {
                        document.getElementById('running-tasks').textContent = taskStats.running_tasks;
                        document.getElementById('pending-tasks').textContent = taskStats.pending_tasks;
                    }),
                    // 获取服务器状态
                    refreshSection('/monitor/server-status', updateServerStatus),
                    // 获取最近任务
                    refreshSection('/monitor/recent-tasks', updateRecentTasks)
                ]);
            }

            function updateServerStatus(servers) {
                const container = document.getElementById('server-status');

                // Build the markup once instead of re-parsing the container
                // on every iteration with innerHTML +=.
                container.innerHTML = servers.map(server => {
                    const online = server.status === 'online';
                    const statusClass = online ? 'online' : 'offline';
                    const statusColor = online ? 'status-online' : 'status-offline';
                    const errorLine = server.error
                        ? `<p><strong>错误:</strong> <span style="color: red;">${escapeHtml(server.error)}</span></p>`
                        : '';

                    return `
                        <div class="server-card ${statusClass}">
                            <h4>${escapeHtml(server.http_url)}</h4>
                            <p><span class="status-indicator ${statusColor}"></span>${escapeHtml(server.status)}</p>
                            <p><strong>任务状态:</strong> ${escapeHtml(server.current_tasks || 0)}/${escapeHtml(server.max_concurrent_tasks || 1)}</p>
                            <p><strong>最后心跳:</strong> ${escapeHtml(formatTime(server.last_heartbeat))}</p>
                            <p><strong>最后检查:</strong> ${escapeHtml(formatTime(server.last_health_check))}</p>
                            ${errorLine}
                        </div>
                    `;
                }).join('');
            }

            function updateRecentTasks(tasks) {
                const container = document.getElementById('recent-tasks');

                container.innerHTML = tasks.map(task => {
                    // A missing status must not abort rendering of the list.
                    const status = (task.status === null || task.status === undefined)
                        ? 'unknown'
                        : String(task.status);
                    const statusClass = escapeHtml('status-' + status.toLowerCase());

                    return `
                        <div class="task-item">
                            <div style="display: flex; justify-content: space-between; align-items: center;">
                                <div>
                                    <strong>${escapeHtml(task.workflow_name)}</strong>
                                    <br>
                                    <small>ID: ${escapeHtml(task.id)}</small>
                                </div>
                                <span class="task-status ${statusClass}">${escapeHtml(status)}</span>
                            </div>
                            <div style="margin-top: 5px; color: #666;">
                                <small>创建时间: ${escapeHtml(formatTime(task.created_at))}</small>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            // 页面加载时自动刷新数据
            document.addEventListener('DOMContentLoaded', refreshData);

            // 每30秒自动刷新一次
            setInterval(refreshData, 30000);
        </script>
    </body>
    </html>
    """
    return HTMLResponse(content=html_content)
|
|
|
|
@monitor_router.get("/system-stats")
async def get_system_stats() -> Dict[str, Any]:
    """Return a snapshot of host CPU, memory and root-filesystem usage.

    Returns:
        A dict with ``cpu_percent``, ``memory_percent`` and ``disk_percent``
        (rounded to one decimal), the used/total memory and disk sizes in GiB
        (rounded to two decimals), and an ISO-8601 ``timestamp`` taken from
        the local clock.

    Raises:
        HTTPException: status 500 if any of the psutil calls fails.
    """
    try:
        # cpu_percent(interval=1) sleeps for the whole sampling interval. Run
        # it in the default executor so the event loop keeps serving other
        # requests while the sample is taken.
        loop = asyncio.get_running_loop()
        cpu_percent = await loop.run_in_executor(
            None, lambda: psutil.cpu_percent(interval=1)
        )
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        bytes_per_gib = 1024 ** 3
        return {
            "cpu_percent": round(cpu_percent, 1),
            "memory_percent": round(memory.percent, 1),
            "memory_used_gb": round(memory.used / bytes_per_gib, 2),
            "memory_total_gb": round(memory.total / bytes_per_gib, 2),
            "disk_percent": round(disk.percent, 1),
            "disk_used_gb": round(disk.used / bytes_per_gib, 2),
            "disk_total_gb": round(disk.total / bytes_per_gib, 2),
            "timestamp": datetime.now().isoformat(),
        }
    except Exception as e:
        # Log before converting to an HTTP error, like the queue endpoints do.
        logger.error(f"获取系统统计信息失败: {e}")
        raise HTTPException(status_code=500, detail=f"获取系统统计信息失败: {str(e)}") from e
|
|
|
|
@monitor_router.get("/task-stats")
async def get_task_stats() -> Dict[str, Any]:
    """Count the workflow runs of the last 24 hours by status.

    Runs are fetched with ``get_workflow_runs_recent`` for the window
    ``[now - 24h, now]``. A run without a ``status`` key is counted under
    ``'unknown'``.

    Returns:
        A dict with ``running_tasks``, ``pending_tasks``, ``completed_tasks``
        and ``failed_tasks`` counts, ``total_tasks_24h`` (all runs in the
        window, whatever their status) and an ISO-8601 ``timestamp``.

    Raises:
        HTTPException: status 500 if the runs cannot be fetched or tallied.
    """
    # Imported locally so the block does not depend on a file-level import.
    from collections import Counter

    try:
        end_time = datetime.now()
        start_time = end_time - timedelta(hours=24)

        recent_runs = await get_workflow_runs_recent(start_time, end_time)

        # Counter yields 0 for statuses that never occurred.
        status_counts = Counter(run.get('status', 'unknown') for run in recent_runs)

        return {
            "running_tasks": status_counts['running'],
            "pending_tasks": status_counts['pending'],
            "completed_tasks": status_counts['completed'],
            "failed_tasks": status_counts['failed'],
            "total_tasks_24h": len(recent_runs),
            "timestamp": datetime.now().isoformat(),
        }
    except Exception as e:
        # Log before converting to an HTTP error, like the queue endpoints do.
        logger.error(f"获取任务统计信息失败: {e}")
        raise HTTPException(status_code=500, detail=f"获取任务统计信息失败: {str(e)}") from e
|
|
|
|
@monitor_router.get("/server-status")
async def get_server_status() -> List[Dict[str, Any]]:
    """Probe every registered ComfyUI server and report its status.

    Each server returned by ``server_manager.get_all_servers()`` is probed
    concurrently with ``GET {http_url}/system_stats`` (5 second total timeout
    per request). A 200 answer marks the server ``"online"``; any other
    status code or any exception marks it ``"offline"`` and fills ``error``.

    Returns:
        One dict per server with ``http_url``, ``ws_url``, ``status``, either
        ``response_time`` (online) or ``error`` (offline), plus
        ``current_tasks``, ``max_concurrent_tasks``, ``last_heartbeat`` and
        ``last_health_check`` read from the server object (with defaults when
        the attribute is missing). An empty list when no server is registered.

    Raises:
        HTTPException: status 500 if the server list cannot be obtained or
            the probing machinery itself fails.
    """
    try:
        # aiohttp is not imported at file level, so it stays a local import.
        import aiohttp

        # Registered servers come from the ComfyUI server manager.
        all_servers = await server_manager.get_all_servers()

        if not all_servers:
            logger.info("当前没有动态注册的服务器")
            return []

        def describe(server, status: str, **extra: Any) -> Dict[str, Any]:
            """Build the result dict shared by the online and offline cases."""
            return {
                "http_url": server.http_url,
                "ws_url": server.ws_url,
                "status": status,
                **extra,
                "current_tasks": getattr(server, 'current_tasks', 0),
                "max_concurrent_tasks": getattr(server, 'max_concurrent_tasks', 1),
                "last_heartbeat": getattr(server, 'last_heartbeat', None),
                "last_health_check": getattr(server, 'last_health_check', None),
            }

        async def check_server_status(session, server) -> Dict[str, Any]:
            """Probe one server; never raises for network-level failures."""
            try:
                async with session.get(f"{server.http_url}/system_stats") as response:
                    if response.status == 200:
                        return describe(
                            server,
                            "online",
                            response_time=response.headers.get('X-Response-Time', 'N/A'),
                        )
                    return describe(server, "offline", error=f"HTTP {response.status}")
            except Exception as e:
                # Some exceptions (e.g. timeouts) have an empty message; fall
                # back to the class name so the dashboard shows a reason.
                return describe(server, "offline", error=str(e) or type(e).__name__)

        # One session is shared by all probes; its timeout applies per request.
        timeout = aiohttp.ClientTimeout(total=5)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            results = await asyncio.gather(
                *(check_server_status(session, server) for server in all_servers),
                return_exceptions=True,
            )

        # Keep well-formed results; log anything gather captured as an
        # exception instead of dropping it silently.
        valid_results = []
        for result in results:
            if isinstance(result, dict):
                valid_results.append(result)
            else:
                logger.error(f"服务器状态检查异常: {result!r}")

        return valid_results

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取服务器状态失败: {str(e)}") from e
|
|
|
|
@monitor_router.get("/recent-tasks")
async def get_recent_tasks(limit: int = 10) -> List[Dict[str, Any]]:
    """Return up to ``limit`` workflow runs from the last 24 hours.

    Runs are fetched with ``get_workflow_runs_recent`` for the window
    ``[now - 24h, now]`` and the first ``limit`` entries are returned in the
    order that helper yields them.
    NOTE(review): presumably that order is newest-first; confirm against the
    database layer.

    Args:
        limit: Maximum number of runs to return. Must not be negative; ``0``
            yields an empty list.

    Returns:
        A list of dicts with ``id``, ``workflow_name``, ``status``
        (``'unknown'`` when absent), ``created_at``, ``updated_at`` and
        ``api_spec``.

    Raises:
        HTTPException: status 400 for a negative ``limit``; status 500 if the
            runs cannot be fetched or formatted.
    """
    # A negative limit would slice from the end of the list (runs[:-n]) and
    # silently return the wrong rows. Reject it outside the try block so the
    # 400 is not converted into a 500 by the generic handler below.
    if limit < 0:
        raise HTTPException(status_code=400, detail="limit 不能小于 0")

    try:
        end_time = datetime.now()
        start_time = end_time - timedelta(hours=24)

        recent_runs = await get_workflow_runs_recent(start_time, end_time)

        return [
            {
                "id": run.get('id'),
                "workflow_name": run.get('workflow_name'),
                "status": run.get('status', 'unknown'),
                "created_at": run.get('created_at'),
                "updated_at": run.get('updated_at'),
                "api_spec": run.get('api_spec'),
            }
            for run in recent_runs[:limit]
        ]
    except Exception as e:
        # Log before converting to an HTTP error, like the queue endpoints do.
        logger.error(f"获取最近任务失败: {e}")
        raise HTTPException(status_code=500, detail=f"获取最近任务失败: {str(e)}") from e
|
|
|
|
@monitor_router.get("/health")
async def health_check() -> Dict[str, Any]:
    """Report overall service health together with server counts.

    Returns:
        On success, a dict with ``status`` ``"healthy"``, an ISO-8601
        ``timestamp``, a fixed ``version``, ``database`` ``"connected"``, a
        ``servers`` summary (``total_dynamic``, ``online``, ``offline``) and
        an ``uptime`` placeholder. If anything raises, a dict with ``status``
        ``"unhealthy"``, the ``timestamp`` and the ``error`` text is returned
        instead; no exception leaves this endpoint.
    """
    try:
        from workflow_service.database.connection import get_db

        # NOTE(review): the handle is obtained but never used, so "connected"
        # is reported whenever get_db() does not raise. Confirm that the call
        # really exercises the connection.
        db = get_db()

        # Registered servers come from the ComfyUI server manager.
        all_servers = await server_manager.get_all_servers()
        total_count = len(all_servers)

        # Every server that is not 'online' counts as offline.
        online_count = sum(1 for srv in all_servers if srv.status == 'online')
        offline_count = total_count - online_count

        server_summary = {
            "total_dynamic": total_count,
            "online": online_count,
            "offline": offline_count,
        }
        return {
            "status": "healthy",
            "timestamp": datetime.now().isoformat(),
            "version": "1.0.0",
            "database": "connected",
            "servers": server_summary,
            "uptime": "N/A",  # placeholder until start-time tracking exists
        }
    except Exception as e:
        failure_report = {"status": "unhealthy"}
        failure_report["timestamp"] = datetime.now().isoformat()
        failure_report["error"] = str(e)
        return failure_report
|
|
|
|
@monitor_router.get("/queue/status")
async def get_queue_status():
    """Return the queue manager's current status.

    Returns:
        A dict with ``success`` set to ``True``, the value returned by
        ``queue_manager.get_queue_status()`` under ``data``, and an ISO-8601
        ``timestamp``.

    Raises:
        HTTPException: status 500 if the status cannot be obtained; the
            failure is logged first.
    """
    try:
        queue_snapshot = await queue_manager.get_queue_status()
    except Exception as e:
        logger.error(f"获取队列状态失败: {e}")
        raise HTTPException(status_code=500, detail=f"获取队列状态失败: {str(e)}")
    else:
        payload = {"success": True, "data": queue_snapshot}
        payload["timestamp"] = datetime.now().isoformat()
        return payload
|
|
|
|
@monitor_router.post("/queue/trigger")
async def trigger_queue_processing():
    """Manually trigger one round of queue processing.

    Returns:
        A dict with ``success`` set to ``True``, a confirmation ``message``
        and an ISO-8601 ``timestamp``.

    Raises:
        HTTPException: status 500 if the queue manager raises; the failure is
            logged first.
    """
    try:
        await queue_manager.trigger_queue_processing()
    except Exception as e:
        logger.error(f"手动触发队列处理失败: {e}")
        raise HTTPException(status_code=500, detail=f"手动触发队列处理失败: {str(e)}")
    else:
        triggered_at = datetime.now().isoformat()
        return {
            "success": True,
            "message": "队列处理已手动触发",
            "timestamp": triggered_at,
        }
|
|
|
|
@monitor_router.post("/queue/interval")
async def set_monitor_interval(interval: int):
    """Set the queue monitoring interval.

    Args:
        interval: New interval in seconds; must be at least 1.

    Returns:
        A dict with ``success`` set to ``True``, a confirmation ``message``
        and an ISO-8601 ``timestamp``.

    Raises:
        HTTPException: status 400 if ``interval`` is below 1; status 500 if
            the queue manager fails to apply the interval (logged first).
    """
    # Validate outside the try block. HTTPException derives from Exception,
    # so raising it inside would be caught by the generic handler below and
    # turned into a 500 instead of the intended 400.
    if interval < 1:
        raise HTTPException(status_code=400, detail="监控间隔不能小于1秒")

    try:
        await queue_manager.set_monitor_interval(interval)
    except Exception as e:
        logger.error(f"设置监控间隔失败: {e}")
        raise HTTPException(status_code=500, detail=f"设置监控间隔失败: {str(e)}") from e

    return {
        "success": True,
        "message": f"队列监控间隔已设置为 {interval} 秒",
        "timestamp": datetime.now().isoformat(),
    }
|