#!/usr/bin/env python3
"""
总控官(main)调度系统 - 基础框架
基于用户决策：混合调度策略 + 弹性并发 + 混合错误处理
"""

import sqlite3
import json
import os
import time
import threading
import logging
from datetime import datetime, timedelta
from pathlib import Path
from enum import Enum
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Any, Tuple
import queue

# ============ 配置 ============

# 数据库路径
DB_PATHS = {
    'projects': '/root/data/disk/system/database/projects.db',
    'agents': '/root/data/disk/system/database/agents.db',
    'analytics': '/root/data/disk/system/database/analytics.db'
}

# 日志配置
LOG_CONFIG = {
    'level': logging.INFO,
    'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    'file': '/root/data/disk/system/logs/main/scheduler.log',
    'max_size_mb': 100,
    'backup_count': 5
}

# 调度配置
SCHEDULER_CONFIG = {
    'name': 'main_scheduler',
    'version': '1.0.0',
    'max_parallel_tasks': 10,
    'agent_concurrency': {
        'collector': 2,
        'writer': 1,
        'producer': 1,
        'publisher': 1,
        'analyzer': 2
    },
    'heartbeat_timeout': {
        'collector': 300,    # 5分钟
        'writer': 600,       # 10分钟
        'producer': 900,     # 15分钟
        'publisher': 300,    # 5分钟
        'analyzer': 300      # 5分钟
    },
    'retention_days': 30,    # 历史数据保留30天
    'alert_thresholds': {
        'task_stuck_minutes': 30,   # 任务滞留30分钟告警
        'agent_dead_minutes': 10,   # Agent死亡10分钟告警
        'resource_high_percent': 80 # 资源使用率80%告警
    }
}

# ============ 初始化日志 ============

def setup_logging():
    """配置日志系统"""
    # 创建日志目录
    log_dir = os.path.dirname(LOG_CONFIG['file'])
    os.makedirs(log_dir, exist_ok=True)
    
    # 配置日志
    logging.basicConfig(
        level=LOG_CONFIG['level'],
        format=LOG_CONFIG['format'],
        handlers=[
            logging.FileHandler(LOG_CONFIG['file']),
            logging.StreamHandler()
        ]
    )
    
    return logging.getLogger('main_scheduler')

logger = setup_logging()

# ============ 枚举定义 ============

class TaskStatus(Enum):
    """任务状态"""
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    RETRYING = "retrying"
    STUCK = "stuck"  # 新增：滞留状态

class AgentStatus(Enum):
    """Agent状态"""
    IDLE = "idle"
    BUSY = "busy"
    ERROR = "error"
    DEAD = "dead"    # 新增：死亡状态
    OFFLINE = "offline"

class ErrorLevel(Enum):
    """错误级别（混合错误处理）"""
    LEVEL_1 = "auto_retry"      # 自动重试
    LEVEL_2 = "auto_recover"    # 自动恢复
    LEVEL_3 = "manual_review"   # 人工审查
    LEVEL_4 = "escalate"        # 升级处理

class TaskPriority(Enum):
    """任务优先级（混合策略）"""
    CRITICAL = 100  # 紧急任务
    HIGH = 75      # 高优先级
    NORMAL = 50    # 普通优先级
    LOW = 25       # 低优先级

# ============ 数据类 ============

@dataclass
class Task:
    """任务数据类"""
    task_id: str
    project_id: str
    agent_name: str
    task_type: str
    priority: int = TaskPriority.NORMAL.value
    status: str = TaskStatus.PENDING.value
    input_data: Dict = None
    expected_outputs: List = None
    dependencies: List[str] = None
    retry_count: int = 0
    max_retries: int = 3
    created_at: str = None
    started_at: str = None
    completed_at: str = None
    error_message: str = ""
    timeout_seconds: int = 1800
    estimated_duration: int = 600  # 预计时长（秒）
    actual_duration: int = 0
    
    def __post_init__(self):
        if self.input_data is None:
            self.input_data = {}
        if self.expected_outputs is None:
            self.expected_outputs = []
        if self.dependencies is None:
            self.dependencies = []
        if self.created_at is None:
            self.created_at = datetime.now().isoformat()

@dataclass
class AgentState:
    """Agent状态数据类"""
    agent_name: str
    status: str = AgentStatus.IDLE.value
    current_task_id: str = None
    last_heartbeat: str = None
    performance_metrics: Dict = None
    skill_requirements: List[str] = None  # 要求使用的skill
    
    def __post_init__(self):
        if self.last_heartbeat is None:
            self.last_heartbeat = datetime.now().isoformat()
        if self.performance_metrics is None:
            self.performance_metrics = {
                'tasks_completed': 0,
                'tasks_failed': 0,
                'success_rate': 1.0,
                'avg_duration': 0,
                'last_error': None
            }
        if self.skill_requirements is None:
            self.skill_requirements = []

@dataclass
class ProjectInfo:
    """项目信息数据类"""
    project_id: str
    name: str
    description: str = ""
    status: str = "draft"
    workflow_step: str = "未开始"
    created_by: str = "system"
    created_at: str = None
    updated_at: str = None
    
    def __post_init__(self):
        if self.created_at is None:
            self.created_at = datetime.now().isoformat()
        if self.updated_at is None:
            self.updated_at = self.created_at

# ============ 数据库管理器 ============

class DatabaseManager:
    """数据库管理器"""
    
    def __init__(self):
        self.db_paths = DB_PATHS
        
    def connect(self, db_name: str) -> sqlite3.Connection:
        """连接到指定数据库"""
        db_path = self.db_paths.get(db_name)
        if not db_path:
            raise ValueError(f"未知数据库: {db_name}")
        
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        return conn
    
    # ===== 项目相关操作 =====
    
    def get_project(self, project_id: str) -> Optional[Dict]:
        """获取项目信息"""
        try:
            conn = self.connect('projects')
            cursor = conn.cursor()
            
            cursor.execute('SELECT * FROM projects WHERE project_id = ?', (project_id,))
            row = cursor.fetchone()
            conn.close()
            
            return dict(row) if row else None
        except Exception as e:
            logger.error(f"获取项目信息失败 {project_id}: {e}")
            return None
    
    def create_project(self, project_data: Dict) -> bool:
        """创建项目"""
        try:
            conn = self.connect('projects')
            cursor = conn.cursor()
            
            cursor.execute('''
                INSERT INTO projects 
                (project_id, name, description, status, workflow_step, created_by)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                project_data['project_id'],
                project_data['name'],
                project_data.get('description', ''),
                project_data.get('status', 'draft'),
                project_data.get('workflow_step', '未开始'),
                project_data.get('created_by', 'system')
            ))
            
            conn.commit()
            conn.close()
            logger.info(f"创建项目成功: {project_data['project_id']}")
            return True
        except Exception as e:
            logger.error(f"创建项目失败 {project_data.get('project_id')}: {e}")
            return False
    
    def update_project_status(self, project_id: str, status: str, workflow_step: str = None) -> bool:
        """更新项目状态"""
        try:
            conn = self.connect('projects')
            cursor = conn.cursor()
            
            if workflow_step:
                cursor.execute('''
                    UPDATE projects 
                    SET status = ?, workflow_step = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE project_id = ?
                ''', (status, workflow_step, project_id))
            else:
                cursor.execute('''
                    UPDATE projects 
                    SET status = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE project_id = ?
                ''', (status, project_id))
            
            conn.commit()
            conn.close()
            logger.info(f"更新项目状态: {project_id} -> {status}")
            return True
        except Exception as e:
            logger.error(f"更新项目状态失败 {project_id}: {e}")
            return False
    
    # ===== 任务相关操作 =====
    
    def create_task(self, task: Task) -> bool:
        """创建新任务"""
        try:
            conn = self.connect('agents')
            cursor = conn.cursor()
            
            cursor.execute('''
                INSERT INTO agent_tasks 
                (task_id, project_id, agent_name, task_type, priority, status, 
                 input_data, expected_outputs, dependencies, retry_count, max_retries,
                 created_at, timeout_seconds, estimated_duration)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                task.task_id,
                task.project_id,
                task.agent_name,
                task.task_type,
                task.priority,
                task.status,
                json.dumps(task.input_data, ensure_ascii=False),
                json.dumps(task.expected_outputs, ensure_ascii=False),
                json.dumps(task.dependencies, ensure_ascii=False),
                task.retry_count,
                task.max_retries,
                task.created_at,
                task.timeout_seconds,
                task.estimated_duration
            ))
            
            conn.commit()
            conn.close()
            logger.info(f"创建任务成功: {task.task_id}")
            return True
        except Exception as e:
            logger.error(f"创建任务失败 {task.task_id}: {e}")
            return False
    
    def update_task_status(self, task_id: str, status: str, **kwargs) -> bool:
        """更新任务状态"""
        try:
            conn = self.connect('agents')
            cursor = conn.cursor()
            
            update_fields = []
            params = []
            
            if status == TaskStatus.RUNNING.value and 'started_at' in kwargs:
                update_fields.append("status = ?")
                update_fields.append("started_at = ?")
                params.extend([status, kwargs['started_at']])
            elif status == TaskStatus.COMPLETED.value and 'completed_at' in kwargs:
                update_fields.append("status = ?")
                update_fields.append("completed_at = ?")
                update_fields.append("actual_duration = ?")
                params.extend([status, kwargs['completed_at'], kwargs.get('actual_duration', 0)])
            else:
                update_fields.append("status = ?")
                params.append(status)
            
            if 'error_message' in kwargs:
                update_fields.append("error_message = ?")
                params.append(kwargs['error_message'])
            
            if update_fields:
                update_sql = ", ".join(update_fields)
                params.append(task_id)
                
                cursor.execute(f'''
                    UPDATE agent_tasks 
                    SET {update_sql}
                    WHERE task_id = ?
                ''', params)
            
            conn.commit()
            conn.close()
            logger.info(f"更新任务状态: {task_id} -> {status}")
            return True
        except Exception as e:
            logger.error(f"更新任务状态失败 {task_id}: {e}")
            return False
    
    def get_pending_tasks(self, limit: int = 100) -> List[Dict]:
        """获取待处理任务"""
        try:
            conn = self.connect('agents')
            cursor = conn.cursor()
            
            cursor.execute('''
                SELECT * FROM agent_tasks 
                WHERE status = 'pending'
                ORDER BY priority DESC, created_at ASC
                LIMIT ?
            ''', (limit,))
            
            tasks = [dict(row) for row in cursor.fetchall()]
            conn.close()
            return tasks
        except Exception as e:
            logger.error(f"获取待处理任务失败: {e}")
            return []
    
    def get_running_tasks(self) -> List[Dict]:
        """获取运行中任务"""
        try:
            conn = self.connect('agents')
            cursor = conn.cursor()
            
            cursor.execute('''
                SELECT * FROM agent_tasks 
                WHERE status = 'running'
            ''')
            
            tasks = [dict(row) for row in cursor.fetchall()]
            conn.close()
            return tasks
        except Exception as e:
            logger.error(f"获取运行中任务失败: {e}")
            return []
    
    # ===== Agent状态相关操作 =====
    
    def get_agent_state(self, agent_name: str) -> Optional[Dict]:
        """获取Agent状态"""
        try:
            conn = self.connect('agents')
            cursor = conn.cursor()
            
            cursor.execute('SELECT * FROM agent_status WHERE agent_name = ?', (agent_name,))
            row = cursor.fetchone()
            conn.close()
            
            return dict(row) if row else None
        except Exception as e:
            logger.error(f"获取Agent状态失败 {agent_name}: {e}")
            return None
    
    def update_agent_state(self, agent_name: str, state_data: Dict) -> bool:
        """更新Agent状态"""
        try:
            conn = self.connect('agents')
            cursor = conn.cursor()
            
            cursor.execute('''
                INSERT OR REPLACE INTO agent_status 
                (agent_name, status, current_task_id, last_heartbeat, performance_metrics, skill_requirements)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                agent_name,
                state_data.get('status', 'idle'),
                state_data.get('current_task_id'),
                state_data.get('last_heartbeat', datetime.now().isoformat()),
                json.dumps(state_data.get('performance_metrics', {}), ensure_ascii=False),
                json.dumps(state_data.get('skill_requirements', []), ensure_ascii=False)
            ))
            
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"更新Agent状态失败 {agent_name}: {e}")
            return False
    
    def get_all_agent_states(self) -> Dict[str, Dict]:
        """获取所有Agent状态"""
        try:
            conn = self.connect('agents')
            cursor = conn.cursor()
            
            cursor.execute('SELECT * FROM agent_status')
            rows = cursor.fetchall()
            conn.close()
            
            return {row['agent_name']: dict(row) for row in rows}
        except Exception as e:
            logger.error(f"获取所有Agent状态失败: {e}")
            return {}
    
    # ===== 监控数据相关操作 =====
    
    def log_monitoring_data(self, data_type: str, data: Dict) -> bool:
        """记录监控数据"""
        try:
            conn = self.connect('analytics')
            cursor = conn.cursor()
            
            cursor.execute('''
                INSERT INTO monitoring_data 
                (data_type, data, timestamp)
                VALUES (?, ?, ?)
            ''', (
                data_type,
                json.dumps(data, ensure_ascii=False),
                datetime.now().isoformat()
            ))
            
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"记录监控数据失败: {e}")
            return False
    
    def get_recent_alerts(self, limit: int = 20) -> List[Dict]:
        """获取最近告警"""
        try:
            conn = self.connect('analytics')
            cursor = conn.cursor()
            
            cursor.execute('''
                SELECT * FROM alerts 
                ORDER BY created_at DESC 
                LIMIT ?
            ''', (limit,))
            
            alerts = [dict(row) for row in cursor.fetchall()]
            conn.close()
            return alerts
        except Exception as e:
            logger.error(f"获取最近告警失败: {e}")
            return []
    
    def create_alert(self, alert_data: Dict) -> bool:
        """创建告警"""
        try:
            conn = self.connect('analytics')
            cursor = conn.cursor()
            
            cursor.execute('''
                INSERT INTO alerts 
                (alert_type, alert_level, message, task_id, agent_name, project_id)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                alert_data.get('alert_type', 'unknown'),
                alert_data.get('alert_level', 'warning'),
                alert_data.get('message', ''),
                alert_data.get('task_id'),
                alert_data.get('agent_name'),
                alert_data.get('project_id')
            ))
            
            conn.commit()
            conn.close()
            logger.warning(f"创建告警: {alert_data.get('message')}")
            return True
        except Exception as e:
            logger.error(f"创建告警失败: {e}")
            return False

# ============ 总控官调度器 ============

class MainScheduler:
    """总控官调度器"""
    
    def __init__(self, db_manager: DatabaseManager):
        self.db = db_manager
        self.config = SCHEDULER_CONFIG
        self.task_queue = queue.PriorityQueue()
        self.running_tasks: Dict[str, Dict] = {}
        self.agent_states: Dict[str, Dict] = {}
        
        # 初始化Agent行为规范
        self.agent_behavior_rules = self._init_agent_behavior_rules()
        
        self.running = False
        self.scheduler_thread = None
        self.monitor_thread = None
        self.alert_thread = None
    
    def _init_agent_behavior_rules(self) -> Dict:
        """初始化Agent行为规范"""
        return {
            'collector': {
                'required_skills': ['hot-topics', 'bilibili-all-in-one'],
                'input_dir': '/root/data/disk/collector/input',
                'output_dir': '/root/data/disk/collector/output',
                'data_table': 'collected_materials',
                'update_interval': 300,  # 5分钟更新一次
                'timeout': 1800  # 30分钟超时
            },
            'writer': {
                'required_skills': ['ai-drama-prompt-factory', 'seedance-creator'],
                'input_dir': '/root/data/disk/writer/input',
                'output_dir': '/root/data/disk/writer/output',
                'data_table': 'created_contents',
                'update_interval': 600,  # 10分钟更新一次
                'timeout': 3600  # 1小时超时
            },
            'producer': {
                'required_skills': ['seedance2-skill', 'seedance-shot-design'],
                'input_dir': '/root/data/disk/producer/input',
                'output_dir': '/root/data/disk/producer/output',
                'data_table': 'produced_media',
                'update_interval': 900,  # 15分钟更新一次
                'timeout': 7200  # 2小时超时
            },
            'publisher': {
                'required_skills': [],
                'input_dir': '/root/data/disk/publisher/input',
                'output_dir': '/root/data/disk/publisher/output',
                'data_table': 'published_content',
                'update_interval': 300,  # 5分钟更新一次
                'timeout': 1800  # 30分钟超时
            },
            'analyzer': {
                'required_skills': ['analytics', 'monitor'],
                'input_dir': '/root/data/disk/analyzer/input',
                'output_dir': '/root/data/disk/analyzer/output',
                'data_table': 'analysis_results',
                'update_interval': 300,  # 5分钟更新一次
                'timeout': 1800  # 30分钟超时
            }
        }
    
    def start(self):
        """启动调度器"""
        if self.running:
            logger.warning("调度器已经在运行")
            return
        
        self.running = True
        
        # 加载Agent状态
        self._load_agent_states()
        
        # 启动线程
        self.scheduler_thread = threading.Thread(target=self._scheduler_loop, daemon=True)
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.alert_thread = threading.Thread(target=self._alert_loop, daemon=True)
        
        self.scheduler_thread.start()
        self.monitor_thread.start()
        self.alert_thread.start()
        
        logger.info("总控官调度系统启动成功")
        
        # 发送启动通知
        self._send_startup_notification()
    
    def stop(self):
        """停止调度器"""
        self.running = False
        
        if self.scheduler_thread:
            self.scheduler_thread.join(timeout=5)
        if self.monitor_thread:
            self.monitor_thread.join(timeout=5)
        if self.alert_thread:
            self.alert_thread.join(timeout=5)
        
        logger.info("总控官调度系统停止")
    
    def _load_agent_states(self):
        """加载Agent状态"""
        self.agent_states = self.db.get_all_agent_states()
        
        # 为没有状态的Agent创建初始状态
        for agent_name in ['collector', 'writer', 'producer', 'publisher', 'analyzer']:
            if agent_name not in self.agent_states:
                behavior_rules = self.agent_behavior_rules.get(agent_name, {})
                agent_state = AgentState(
                    agent_name=agent_name,
                    status=AgentStatus.IDLE.value,
                    skill_requirements=behavior_rules.get('required_skills', [])
                )
                
                if self.db.update_agent_state(agent_name, asdict(agent_state)):
                    self.agent_states[agent_name] = asdict(agent_state)
                    logger.info(f"初始化Agent状态: {agent_name}")
    
    def schedule_task(self, task_data: Dict) -> bool:
        """调度新任务"""
        try:
            # 创建任务对象
            task = Task(
                task_id=task_data['task_id'],
                project_id=task_data['project_id'],
                agent_name=task_data['agent_name'],
                task_type=task_data['task_type'],
                priority=task_data.get('priority', TaskPriority.NORMAL.value),
                input_data=task_data.get('input_data', {}),
                expected_outputs=task_data.get('expected_outputs', []),
                dependencies=task_data.get('dependencies', []),
                estimated_duration=task_data.get('estimated_duration', 600)
            )
            
            # 保存到数据库
            if not self.db.create_task(task):
                return False
            
            # 计算调度优先级（混合策略）
            priority = self._calculate_scheduling_priority(task)
            
            # 添加到内存队列
            self.task_queue.put((-priority, task.task_id, asdict(task)))
            logger.info(f"任务已调度: {task.task_id} (优先级: {priority})")
            
            return True
            
        except Exception as e:
            logger.error(f"调度任务失败 {task_data.get('task_id')}: {e}")
            return False
    
    def _calculate_scheduling_priority(self, task: Task) -> int:
        """计算调度优先级（混合策略）"""
        base_priority = task.priority
        
        # 1. Agent可用性加成（弹性并发）
        agent_state = self.agent_states.get(task.agent_name)
        if agent_state and agent_state.get('status') == 'idle':
            base_priority += 20
        
        # 2. 依赖完成度加成
        if task.dependencies:
            completed_deps = 0
            for dep_id in task.dependencies:
                dep_task = self.db.get_task(dep_id)  # 需要实现这个方法
                if dep_task and dep_task.get('status') == 'completed':
                    completed_deps += 1
            
            dependency_ratio = completed_deps / len(task.dependencies) if task.dependencies else 1.0
            base_priority += int(30 * dependency_ratio)
        
        # 3. 时间紧迫度加成（如果任务有截止时间）
        # 这里可以添加基于截止时间的优先级计算
        
        return min(base_priority, 100)
    
    def _scheduler_loop(self):
        """调度器主循环"""
        logger.info("调度器主循环启动")
        
        while self.running:
            try:
                # 检查队列
                if not self.task_queue.empty():
                    # 获取最高优先级任务
                    _, task_id, task_data = self.task_queue.get()
                    
                    # 检查是否可以执行
                    if self._can_execute_task(task_data):
                        # 执行任务
                        self._execute_task(task_data)
                    else:
                        # 放回队列
                        priority = self._calculate_scheduling_priority(Task(**task_data))
                        self.task_queue.put((-priority, task_id, task_data))
                
                time.sleep(1)
                
            except Exception as e:
                logger.error(f"调度器循环异常: {e}")
                time.sleep(5)
    
    def _can_execute_task(self, task_data: Dict) -> bool:
        """检查任务是否可以执行"""
        agent_name = task_data['agent_name']
        
        # 1. 检查Agent状态
        agent_state = self.agent_states.get(agent_name)
        if not agent_state or agent_state.get('status') != 'idle':
            return False
        
        # 2. 检查并发数（弹性限制）
        current_concurrent = sum(1 for t in self.running_tasks.values() 
                                if t.get('agent_name') == agent_name)
        max_concurrent = self.config['agent_concurrency'].get(agent_name, 1)
        
        if current_concurrent >= max_concurrent:
            return False
        
        # 3. 检查总任务数
        if len(self.running_tasks) >= self.config['max_parallel_tasks']:
            return False
        
        # 4. 检查依赖是否完成
        for dep_id in task_data.get('dependencies', []):
            # 这里需要检查依赖任务状态
            # 简化处理：假设依赖都已完成
            pass
        
        return True
    
    def _execute_task(self, task_data: Dict):
        """执行任务"""
        try:
            task_id = task_data['task_id']
            
            # 1. 更新任务状态为running
            started_at = datetime.now().isoformat()
            if not self.db.update_task_status(task_id, 'running', started_at=started_at):
                return
            
            # 2. 更新Agent状态为busy
            agent_name = task_data['agent_name']
            agent_state = {
                'status': 'busy',
                'current_task_id': task_id,
                'last_heartbeat': started_at,
                'skill_requirements': self.agent_behavior_rules.get(agent_name, {}).get('required_skills', [])
            }
            
            if self.db.update_agent_state(agent_name, agent_state):
                self.agent_states[agent_name] = agent_state
            
            # 3. 添加到运行中任务列表
            self.running_tasks[task_id] = task_data
            
            # 4. 发送任务执行指令给Agent
            self._send_task_to_agent(task_data)
            
            logger.info(f"任务开始执行: {task_id}")
            
        except Exception as e:
            logger.error(f"执行任务失败 {task_data.get('task_id')}: {e}")
    
    def _send_task_to_agent(self, task_data: Dict):
        """发送任务给Agent"""
        agent_name = task_data['agent_name']
        task_id = task_data['task_id']
        
        # 获取Agent行为规范
        behavior_rules = self.agent_behavior_rules.get(agent_name, {})
        
        # 创建任务指令
        task_instruction = {
            'task_id': task_id,
            'task_type': task_data['task_type'],
            'project_id': task_data['project_id'],
            'input_data': task_data.get('input_data', {}),
            'expected_outputs': task_data.get('expected_outputs', []),
            'behavior_rules': behavior_rules,
            'instructions': [
                f"使用以下skill: {', '.join(behavior_rules.get('required_skills', []))}",
                f"从目录获取输入文件: {behavior_rules.get('input_dir')}",
                f"输出到目录: {behavior_rules.get('output_dir')}",
                f"及时更新数据表: {behavior_rules.get('data_table')}",
                f"预计完成时间: {datetime.now() + timedelta(seconds=task_data.get('estimated_duration', 600))}",
                "请按规范执行任务并及时更新状态"
            ]
        }
        
        # 保存任务指令到文件（让Agent读取）
        instruction_file = f"/root/data/disk/temp/task_instructions/{task_id}.json"
        os.makedirs(os.path.dirname(instruction_file), exist_ok=True)
        
        with open(instruction_file, 'w', encoding='utf-8') as f:
            json.dump(task_instruction, f, ensure_ascii=False, indent=2)
        
        logger.info(f"任务指令已保存: {instruction_file}")
        
        # 这里可以添加其他通信方式，如REST API调用、CLI命令等
    
    def _monitor_loop(self):
        """监控循环"""
        logger.info("监控循环启动")
        
        while self.running:
            try:
                # 1. 监控运行中任务
                self._monitor_running_tasks()
                
                # 2. 监控Agent状态
                self._monitor_agent_states()
                
                # 3. 记录监控数据
                self._log_monitoring_data()
                
                time.sleep(30)  # 30秒监控一次
                
            except Exception as e:
                logger.error(f"监控循环异常: {e}")
                time.sleep(60)
    
    def _monitor_running_tasks(self):
        """监控运行中任务"""
        current_time = datetime.now()
        
        for task_id, task_data in list(self.running_tasks.items()):
            started_at_str = task_data.get('started_at')
            if started_at_str:
                try:
                    started_at = datetime.fromisoformat(started_at_str.replace('Z', '+00:00'))
                    elapsed_minutes = (current_time - started_at).total_seconds() / 60
                    
                    # 检查是否滞留
                    if elapsed_minutes > self.config['alert_thresholds']['task_stuck_minutes']:
                        logger.warning(f"任务滞留: {task_id} ({elapsed_minutes:.1f}分钟)")
                        
                        # 标记为滞留状态
                        self.db.update_task_status(task_id, 'stuck')
                        
                        # 创建告警
                        alert_data = {
                            'alert_type': 'task_stuck',
                            'alert_level': 'warning',
                            'message': f"任务 {task_id} 已滞留 {elapsed_minutes:.1f} 分钟",
                            'task_id': task_id,
                            'agent_name': task_data.get('agent_name'),
                            'project_id': task_data.get('project_id')
                        }
                        self.db.create_alert(alert_data)
                    
                    # 检查是否超时
                    timeout_minutes = task_data.get('timeout_seconds', 1800) / 60
                    if elapsed_minutes > timeout_minutes:
                        logger.error(f"任务超时: {task_id}")
                        self._handle_task_failure(task_data, "任务超时")
                        
                except Exception as e:
                    logger.error(f"监控任务失败 {task_id}: {e}")
    
    def _monitor_agent_states(self):
        """监控Agent状态"""
        current_time = datetime.now()
        
        for agent_name, agent_state in self.agent_states.items():
            if agent_name == 'cto':
                continue
                
            last_heartbeat_str = agent_state.get('last_heartbeat')
            if last_heartbeat_str:
                try:
                    last_heartbeat = datetime.fromisoformat(last_heartbeat_str.replace('Z', '+00:00'))
                    elapsed_minutes = (current_time - last_heartbeat).total_seconds() / 60
                    
                    # 检查Agent是否死亡
                    if elapsed_minutes > self.config['alert_thresholds']['agent_dead_minutes']:
                        logger.error(f"Agent死亡: {agent_name} ({elapsed_minutes:.1f}分钟无心跳)")
                        
                        # 更新Agent状态
                        new_state = {
                            'status': 'dead',
                            'last_heartbeat': agent_state['last_heartbeat']
                        }
                        
                        if self.db.update_agent_state(agent_name, new_state):
                            self.agent_states[agent_name] = {**agent_state, **new_state}
                        
                        # 创建告警
                        alert_data = {
                            'alert_type': 'agent_dead',
                            'alert_level': 'critical',
                            'message': f"Agent {agent_name} 已 {elapsed_minutes:.1f} 分钟无心跳",
                            'agent_name': agent_name
                        }
                        self.db.create_alert(alert_data)
                        
                        # 处理该Agent的任务
                        self._handle_agent_failure(agent_name)
                    
                except Exception as e:
                    logger.error(f"监控Agent失败 {agent_name}: {e}")
    
    def _log_monitoring_data(self):
        """记录监控数据"""
        try:
            monitoring_data = {
                'timestamp': datetime.now().isoformat(),
                'queue_size': self.task_queue.qsize(),
                'running_tasks': len(self.running_tasks),
                'agent_states': {k: v.get('status') for k, v in self.agent_states.items()},
                'system_load': self._get_system_load()
            }
            
            self.db.log_monitoring_data('scheduler_status', monitoring_data)
            
        except Exception as e:
            logger.error(f"记录监控数据失败: {e}")
    
    def _get_system_load(self) -> Dict:
        """获取系统负载"""
        try:
            import psutil
            return {
                'cpu_percent': psutil.cpu_percent(interval=1),
                'memory_percent': psutil.virtual_memory().percent,
                'disk_percent': psutil.disk_usage('/').percent
            }
        except:
            return {'cpu_percent': 0, 'memory_percent': 0, 'disk_percent': 0}
    
    def _alert_loop(self):
        """告警循环"""
        logger.info("告警循环启动")
        
        while self.running:
            try:
                # 检查是否需要发送告警通知
                self._check_and_send_alerts()
                time.sleep(60)  # 1分钟检查一次
                
            except Exception as e:
                logger.error(f"告警循环异常: {e}")
                time.sleep(120)
    
    def _check_and_send_alerts(self):
        """检查并发送告警"""
        # 获取未处理的告警
        alerts = self.db.get_recent_alerts(limit=10)
        
        for alert in alerts:
            if not alert.get('notified'):
                # 发送告警通知
                self._send_alert_notification(alert)
                
                # 标记为已通知
                # 这里需要更新数据库中的notified字段
    
    def _send_alert_notification(self, alert: Dict):
        """发送告警通知"""
        alert_level = alert.get('alert_level', 'warning')
        message = alert.get('message', '')
        
        if alert_level == 'critical':
            logger.critical(f"🔴 严重告警: {message