package maintenance

import (
	"crypto/rand"
	"fmt"
	"sort"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
)

// NewMaintenanceQueue creates a new maintenance queue
func NewMaintenanceQueue(policy *MaintenancePolicy) *MaintenanceQueue {
	queue := &MaintenanceQueue{
		tasks:           make(map[string]*MaintenanceTask),
		workers:         make(map[string]*MaintenanceWorker),
		pendingTasks:    make([]*MaintenanceTask, 0),
		policy:          policy,
		persistenceChan: make(chan *MaintenanceTask, 1000), // Buffer for async persistence
	}

	// Start the persistence worker goroutine
	go queue.persistenceWorker()

	return queue
}

// SetIntegration sets the integration reference
func (mq *MaintenanceQueue) SetIntegration(integration *MaintenanceIntegration) {
	mq.integration = integration
	glog.V(1).Infof("Maintenance queue configured with integration")
}

// SetPersistence sets the task persistence interface
func (mq *MaintenanceQueue) SetPersistence(persistence TaskPersistence) {
	mq.persistence = persistence
	glog.V(1).Infof("Maintenance queue configured with task persistence")
}

// LoadTasksFromPersistence loads tasks from persistent storage on startup
func (mq *MaintenanceQueue) LoadTasksFromPersistence() error {
	if mq.persistence == nil {
		glog.V(1).Infof("No task persistence configured, skipping task loading")
		return nil
	}

	glog.Infof("Loading tasks from persistence...")

	// Load tasks without holding the lock to prevent deadlock
	tasks, err := mq.persistence.LoadAllTaskStates()
	if err != nil {
		return fmt.Errorf("failed to load task states: %w", err)
	}

	// Only acquire the lock for the in-memory operations
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	glog.V(2).Infof("Found %d tasks in persistence", len(tasks))

	// Reset task maps
	mq.tasks = make(map[string]*MaintenanceTask)
	mq.pendingTasks = make([]*MaintenanceTask, 0)

	// Load tasks by status
	for _, task := range tasks {
		glog.V(2).Infof("Loading task %s (type: %s, status: %s, scheduled: %v)",
			task.ID, task.Type, task.Status, task.ScheduledAt)
		mq.tasks[task.ID] = task

		switch task.Status {
		case TaskStatusPending:
			mq.pendingTasks = append(mq.pendingTasks, task)
		case TaskStatusAssigned, TaskStatusInProgress:
			// For assigned/in-progress tasks, check whether the worker is still
			// available. Workers re-register after a restart, so at this point the
			// worker map is typically empty: such tasks are failed here and, if
			// they still have retry budget, made eligible for retry.
			if task.WorkerID != "" {
				if _, exists := mq.workers[task.WorkerID]; !exists {
					glog.Warningf("Task %s was assigned to unavailable worker %s, marking as failed",
						task.ID, task.WorkerID)
					task.Status = TaskStatusFailed
					task.Error = "Worker unavailable after restart"
					completedTime := time.Now()
					task.CompletedAt = &completedTime

					// Check if it should be retried
					if task.RetryCount < task.MaxRetries {
						task.RetryCount++
						task.Status = TaskStatusPending
						task.WorkerID = ""
						task.StartedAt = nil
						task.CompletedAt = nil
						task.Error = ""
						task.ScheduledAt = time.Now().Add(1 * time.Minute) // Retry after restart delay
						mq.pendingTasks = append(mq.pendingTasks, task)
					}

					// Persist the new status so the next restart sees it
					mq.saveTaskState(task)
				}
			}
		}
	}

	// Sort pending tasks by priority, then by schedule time
	sort.Slice(mq.pendingTasks, func(i, j int) bool {
		if mq.pendingTasks[i].Priority != mq.pendingTasks[j].Priority {
			return mq.pendingTasks[i].Priority > mq.pendingTasks[j].Priority
		}
		return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
	})

	glog.Infof("Loaded %d tasks from persistence (%d pending)", len(tasks), len(mq.pendingTasks))
	return nil
}
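// exampleQueueStartup is a minimal, hypothetical wiring sketch (not called
// anywhere in this package): it shows the intended startup order, namely
// construct the queue, attach the integration and persistence, then restore
// task state before serving workers. The policy, integration, and persistence
// values are assumed to come from the caller's configuration.
func exampleQueueStartup(policy *MaintenancePolicy, integration *MaintenanceIntegration, persistence TaskPersistence) (*MaintenanceQueue, error) {
	queue := NewMaintenanceQueue(policy)
	queue.SetIntegration(integration)
	queue.SetPersistence(persistence)
	if err := queue.LoadTasksFromPersistence(); err != nil {
		return nil, fmt.Errorf("restoring maintenance tasks: %w", err)
	}
	return queue, nil
}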
// persistenceWorker handles async persistence operations
func (mq *MaintenanceQueue) persistenceWorker() {
	for task := range mq.persistenceChan {
		if mq.persistence != nil {
			if err := mq.persistence.SaveTaskState(task); err != nil {
				glog.Errorf("Failed to save task state for %s: %v", task.ID, err)
			}
		}
	}
	glog.V(1).Infof("Persistence worker shut down")
}

// Close gracefully shuts down the maintenance queue.
// Callers must ensure no further saveTaskState calls can happen after Close,
// since sending on the closed persistence channel would panic.
func (mq *MaintenanceQueue) Close() {
	if mq.persistenceChan != nil {
		close(mq.persistenceChan)
		glog.V(1).Infof("Maintenance queue persistence channel closed")
	}
}

// saveTaskState saves a task to persistent storage asynchronously
func (mq *MaintenanceQueue) saveTaskState(task *MaintenanceTask) {
	if mq.persistence != nil && mq.persistenceChan != nil {
		// Create a copy to avoid race conditions
		taskCopy := *task
		select {
		case mq.persistenceChan <- &taskCopy:
			// Successfully queued for async persistence
		default:
			glog.Warningf("Persistence channel full, task state may be lost: %s", task.ID)
		}
	}
}

// cleanupCompletedTasks removes old completed tasks beyond the retention limit
func (mq *MaintenanceQueue) cleanupCompletedTasks() {
	if mq.persistence != nil {
		if err := mq.persistence.CleanupCompletedTasks(); err != nil {
			glog.Errorf("Failed to cleanup completed tasks: %v", err)
		}
	}
}

// AddTask adds a new maintenance task to the queue with deduplication
func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	// Check for duplicate tasks (same type + volume + server, not completed)
	if mq.hasDuplicateTask(task) {
		glog.V(1).Infof("Task skipped (duplicate): %s for volume %d on %s (already queued or running)",
			task.Type, task.VolumeID, task.Server)
		return
	}

	task.ID = generateTaskID()
	task.Status = TaskStatusPending
	task.CreatedAt = time.Now()
	task.MaxRetries = 3 // Default retry count

	// Initialize assignment history and set creation context
	task.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
	if task.CreatedBy == "" {
		task.CreatedBy = "maintenance-system"
	}
	if task.CreationContext == "" {
		task.CreationContext = "Automatic task creation based on system monitoring"
	}
	if task.Tags == nil {
		task.Tags = make(map[string]string)
	}

	mq.tasks[task.ID] = task
	mq.pendingTasks = append(mq.pendingTasks, task)

	// Sort pending tasks by priority, then by schedule time
	sort.Slice(mq.pendingTasks, func(i, j int) bool {
		if mq.pendingTasks[i].Priority != mq.pendingTasks[j].Priority {
			return mq.pendingTasks[i].Priority > mq.pendingTasks[j].Priority
		}
		return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
	})

	// Save task state to persistence
	mq.saveTaskState(task)

	scheduleInfo := ""
	if !task.ScheduledAt.IsZero() && time.Until(task.ScheduledAt) > time.Minute {
		scheduleInfo = fmt.Sprintf(", scheduled for %v", task.ScheduledAt.Format("15:04:05"))
	}

	glog.Infof("Task queued: %s (%s) volume %d on %s, priority %d%s, reason: %s",
		task.ID, task.Type, task.VolumeID, task.Server, task.Priority, scheduleInfo, task.Reason)
}

// hasDuplicateTask checks if a similar task already exists (same type, volume, and server, not completed)
func (mq *MaintenanceQueue) hasDuplicateTask(newTask *MaintenanceTask) bool {
	for _, existingTask := range mq.tasks {
		if existingTask.Type == newTask.Type &&
			existingTask.VolumeID == newTask.VolumeID &&
			existingTask.Server == newTask.Server &&
			(existingTask.Status == TaskStatusPending ||
				existingTask.Status == TaskStatusAssigned ||
				existingTask.Status == TaskStatusInProgress) {
			return true
		}
	}
	return false
}
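// exampleEnqueueTask is a usage sketch with illustrative values: duplicate
// detection keys on (Type, VolumeID, Server), so enqueueing a second task for
// the same triple while the first is still pending, assigned, or in progress
// is a no-op.
func exampleEnqueueTask(mq *MaintenanceQueue) {
	mq.AddTask(&MaintenanceTask{
		Type:        MaintenanceTaskType("vacuum"),
		VolumeID:    42,
		Server:      "volume-server-1:8080",
		Reason:      "example: garbage ratio above threshold",
		ScheduledAt: time.Now(),
	})

	// A second task with the same type/volume/server is skipped as a duplicate.
	mq.AddTask(&MaintenanceTask{
		Type:     MaintenanceTaskType("vacuum"),
		VolumeID: 42,
		Server:   "volume-server-1:8080",
		Reason:   "example: duplicate detection demo",
	})
}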
// AddTasksFromResults converts detection results to tasks and adds them to the queue
func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult) {
	for _, result := range results {
		// Validate that the task has proper typed parameters
		if result.TypedParams == nil {
			glog.Warningf("Rejecting invalid task: %s for volume %d on %s - no typed parameters (insufficient destinations or planning failed)",
				result.TaskType, result.VolumeID, result.Server)
			continue
		}

		task := &MaintenanceTask{
			Type:       result.TaskType,
			Priority:   result.Priority,
			VolumeID:   result.VolumeID,
			Server:     result.Server,
			Collection: result.Collection,
			// Copy typed protobuf parameters
			TypedParams: result.TypedParams,
			Reason:      result.Reason,
			ScheduledAt: result.ScheduleAt,
		}
		mq.AddTask(task)
	}
}

// GetNextTask returns the next available task for a worker
func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []MaintenanceTaskType) *MaintenanceTask {
	// Use a read lock for the initial checks and the search
	mq.mutex.RLock()

	worker, exists := mq.workers[workerID]
	if !exists {
		mq.mutex.RUnlock()
		glog.V(2).Infof("Task assignment failed for worker %s: worker not registered", workerID)
		return nil
	}

	// Check if the worker has capacity
	if worker.CurrentLoad >= worker.MaxConcurrent {
		mq.mutex.RUnlock()
		glog.V(2).Infof("Task assignment failed for worker %s: at capacity (%d/%d)",
			workerID, worker.CurrentLoad, worker.MaxConcurrent)
		return nil
	}

	now := time.Now()
	var selectedTask *MaintenanceTask
	selectedIndex := -1
	pendingCount := len(mq.pendingTasks) // captured under the lock for logging

	// Find the next suitable task (under the read lock)
	for i, task := range mq.pendingTasks {
		// Check if it's time to execute the task
		if task.ScheduledAt.After(now) {
			glog.V(3).Infof("Task %s skipped for worker %s: scheduled for future (%v)",
				task.ID, workerID, task.ScheduledAt)
			continue
		}

		// Check if the worker can handle this task type
		if !mq.workerCanHandle(task.Type, capabilities) {
			glog.V(3).Infof("Task %s (%s) skipped for worker %s: capability mismatch (worker has: %v)",
				task.ID, task.Type, workerID, capabilities)
			continue
		}

		// Check scheduling constraints (concurrency limits, cooldowns)
		if !mq.canScheduleTaskNow(task) {
			// Add detailed diagnostic information; use the lock-free counter
			// because this goroutine already holds the read lock
			runningCount := mq.countRunningTasksLocked(task.Type)
			maxConcurrent := mq.getMaxConcurrentForTaskType(task.Type)
			glog.V(2).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met (running: %d, max: %d)",
				task.ID, task.Type, workerID, runningCount, maxConcurrent)
			continue
		}

		// Found a suitable task
		selectedTask = task
		selectedIndex = i
		break
	}

	// Release the read lock
	mq.mutex.RUnlock()

	// If no task was found, return nil
	if selectedTask == nil {
		glog.V(3).Infof("No suitable tasks available for worker %s (checked %d pending tasks)",
			workerID, pendingCount)
		return nil
	}

	// Now acquire the write lock to actually assign the task
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	// Re-check that the task is still available: it might have been assigned to
	// another worker between releasing the read lock and acquiring the write lock
	if selectedIndex >= len(mq.pendingTasks) || mq.pendingTasks[selectedIndex].ID != selectedTask.ID {
		glog.V(2).Infof("Task %s no longer available for worker %s: assigned to another worker",
			selectedTask.ID, workerID)
		return nil
	}

	// Record assignment history; re-check worker capacity under the write lock
	workerAddress := ""
	if worker, exists := mq.workers[workerID]; exists {
		if worker.CurrentLoad >= worker.MaxConcurrent {
			glog.V(2).Infof("Task assignment failed for worker %s: reached capacity while searching", workerID)
			return nil
		}
		workerAddress = worker.Address
	}

	// Create the assignment record
	assignmentRecord := &TaskAssignmentRecord{
		WorkerID:      workerID,
		WorkerAddress: workerAddress,
		AssignedAt:    now,
		Reason:        "Task assigned to available worker",
	}

	// Initialize assignment history if nil
	if selectedTask.AssignmentHistory == nil {
		selectedTask.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
	}
	selectedTask.AssignmentHistory = append(selectedTask.AssignmentHistory, assignmentRecord)

	// Assign the task
	selectedTask.Status = TaskStatusAssigned
	selectedTask.WorkerID = workerID
	selectedTask.StartedAt = &now

	// Remove from pending tasks
	mq.pendingTasks = append(mq.pendingTasks[:selectedIndex], mq.pendingTasks[selectedIndex+1:]...)

	// Update worker load
	if worker, exists := mq.workers[workerID]; exists {
		worker.CurrentLoad++
	}

	// Track the pending operation
	mq.trackPendingOperation(selectedTask)

	// Save task state after assignment
	mq.saveTaskState(selectedTask)

	glog.Infof("Task assigned: %s (%s) → worker %s (volume %d, server %s)",
		selectedTask.ID, selectedTask.Type, workerID, selectedTask.VolumeID, selectedTask.Server)

	return selectedTask
}
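// exampleWorkerPoll is a sketch of the worker-side loop, assuming the worker
// was registered via RegisterWorker and executes tasks synchronously; the
// poll interval and the runTask helper are illustrative.
func exampleWorkerPoll(mq *MaintenanceQueue, workerID string, capabilities []MaintenanceTaskType, runTask func(*MaintenanceTask) error) {
	for {
		task := mq.GetNextTask(workerID, capabilities)
		if task == nil {
			time.Sleep(5 * time.Second) // nothing suitable right now; poll again
			continue
		}
		mq.UpdateTaskProgress(task.ID, 0) // marks the task in-progress
		if err := runTask(task); err != nil {
			mq.CompleteTask(task.ID, err.Error()) // non-empty error schedules a retry
		} else {
			mq.CompleteTask(task.ID, "") // empty error string means success
		}
	}
}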
// CompleteTask marks a task as completed or failed, scheduling a retry when needed
func (mq *MaintenanceQueue) CompleteTask(taskID string, errorMsg string) {
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	task, exists := mq.tasks[taskID]
	if !exists {
		glog.Warningf("Attempted to complete non-existent task: %s", taskID)
		return
	}

	completedTime := time.Now()
	task.CompletedAt = &completedTime

	// Calculate task duration
	var duration time.Duration
	if task.StartedAt != nil {
		duration = completedTime.Sub(*task.StartedAt)
	}

	// Capture the worker ID up front: the retry path clears task.WorkerID,
	// but the worker's load still needs to be released below
	workerID := task.WorkerID

	if errorMsg != "" {
		task.Status = TaskStatusFailed
		task.Error = errorMsg

		// Check if the task should be retried
		if task.RetryCount < task.MaxRetries {
			// Record unassignment due to failure/retry
			if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
				lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
				if lastAssignment.UnassignedAt == nil {
					unassignedTime := completedTime
					lastAssignment.UnassignedAt = &unassignedTime
					lastAssignment.Reason = fmt.Sprintf("Task failed, scheduling retry (attempt %d/%d): %s",
						task.RetryCount+1, task.MaxRetries, errorMsg)
				}
			}

			task.RetryCount++
			task.Status = TaskStatusPending
			task.WorkerID = ""
			task.StartedAt = nil
			task.CompletedAt = nil
			task.Error = ""
			task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay

			mq.pendingTasks = append(mq.pendingTasks, task)

			// Save task state after retry setup
			mq.saveTaskState(task)

			glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s",
				taskID, task.Type, task.RetryCount, task.MaxRetries, workerID, duration, errorMsg)
		} else {
			// Record unassignment due to permanent failure
			if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
				lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
				if lastAssignment.UnassignedAt == nil {
					unassignedTime := completedTime
					lastAssignment.UnassignedAt = &unassignedTime
					lastAssignment.Reason = fmt.Sprintf("Task failed permanently after %d retries: %s",
						task.MaxRetries, errorMsg)
				}
			}

			// Save task state after permanent failure
			mq.saveTaskState(task)

			glog.Errorf("Task failed permanently: %s (%s) worker %s, duration %v, after %d retries: %s",
				taskID, task.Type, workerID, duration, task.MaxRetries, errorMsg)
		}
	} else {
		task.Status = TaskStatusCompleted
		task.Progress = 100

		// Save task state after successful completion
		mq.saveTaskState(task)

		glog.Infof("Task completed: %s (%s) worker %s, duration %v, volume %d",
			taskID, task.Type, workerID, duration, task.VolumeID)
	}

	// Release the worker that ran this task
	if workerID != "" {
		if worker, exists := mq.workers[workerID]; exists {
			worker.CurrentTask = nil
			if worker.CurrentLoad > 0 { // Guard against underflow
				worker.CurrentLoad--
			}
			if worker.CurrentLoad == 0 {
				worker.Status = "active"
			}
		}
	}

	// Remove the pending operation (unless the task is being retried)
	if task.Status != TaskStatusPending {
		mq.removePendingOperation(taskID)
	}

	// Periodically trigger cleanup of old completed tasks
	// (whenever the total task count is a multiple of 10)
	if task.Status == TaskStatusCompleted {
		if len(mq.tasks)%10 == 0 {
			go mq.cleanupCompletedTasks()
		}
	}
}
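// exampleReportFailure is a sketch of the failure path from a worker's point
// of view: a non-empty error string marks the task failed, and the queue
// automatically re-queues it with a 15-minute delay until MaxRetries
// (3 by default, set in AddTask) is exhausted. The error text is illustrative.
func exampleReportFailure(mq *MaintenanceQueue, taskID string) {
	mq.CompleteTask(taskID, "example: destination volume server unreachable")
}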
// UpdateTaskProgress updates the progress of a running task
func (mq *MaintenanceQueue) UpdateTaskProgress(taskID string, progress float64) {
	// A write lock is required here: task fields are mutated below
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	if task, exists := mq.tasks[taskID]; exists {
		oldProgress := task.Progress
		task.Progress = progress
		task.Status = TaskStatusInProgress

		// Update pending operation status
		mq.updatePendingOperationStatus(taskID, "in_progress")

		// Log progress at significant milestones or changes
		if progress == 0 {
			glog.V(1).Infof("Task started: %s (%s) worker %s, volume %d",
				taskID, task.Type, task.WorkerID, task.VolumeID)
		} else if progress >= 100 {
			glog.V(1).Infof("Task progress: %s (%s) worker %s, %.1f%% complete",
				taskID, task.Type, task.WorkerID, progress)
		} else if progress-oldProgress >= 25 { // Log every 25% increment
			glog.V(1).Infof("Task progress: %s (%s) worker %s, %.1f%% complete",
				taskID, task.Type, task.WorkerID, progress)
		}

		// Save task state at the start, at completion, and every 10% of progress
		if progress == 0 || progress >= 100 || progress-oldProgress >= 10 {
			mq.saveTaskState(task)
		}
	} else {
		glog.V(2).Infof("Progress update for unknown task: %s (%.1f%%)", taskID, progress)
	}
}

// RegisterWorker registers a new worker
func (mq *MaintenanceQueue) RegisterWorker(worker *MaintenanceWorker) {
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	isNewWorker := true
	if existingWorker, exists := mq.workers[worker.ID]; exists {
		isNewWorker = false
		glog.Infof("Worker reconnected: %s at %s (capabilities: %v, max concurrent: %d)",
			worker.ID, worker.Address, worker.Capabilities, worker.MaxConcurrent)
		// Preserve the current load when reconnecting
		worker.CurrentLoad = existingWorker.CurrentLoad
	} else {
		glog.Infof("Worker registered: %s at %s (capabilities: %v, max concurrent: %d)",
			worker.ID, worker.Address, worker.Capabilities, worker.MaxConcurrent)
	}

	worker.LastHeartbeat = time.Now()
	worker.Status = "active"
	if isNewWorker {
		worker.CurrentLoad = 0
	}
	mq.workers[worker.ID] = worker
}

// UpdateWorkerHeartbeat updates a worker's heartbeat timestamp
func (mq *MaintenanceQueue) UpdateWorkerHeartbeat(workerID string) {
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	if worker, exists := mq.workers[workerID]; exists {
		lastSeen := worker.LastHeartbeat
		worker.LastHeartbeat = time.Now()

		// Log if the worker was offline for a while
		if time.Since(lastSeen) > 2*time.Minute {
			glog.Infof("Worker %s heartbeat resumed after %v", workerID, time.Since(lastSeen))
		}
	} else {
		glog.V(2).Infof("Heartbeat from unknown worker: %s", workerID)
	}
}

// GetRunningTaskCount returns the number of running tasks of a specific type
func (mq *MaintenanceQueue) GetRunningTaskCount(taskType MaintenanceTaskType) int {
	mq.mutex.RLock()
	defer mq.mutex.RUnlock()
	return mq.countRunningTasksLocked(taskType)
}

// countRunningTasksLocked counts running tasks of a type. The caller must hold
// the queue lock; using this instead of GetRunningTaskCount avoids recursively
// acquiring the read lock, which can deadlock when a writer is waiting.
func (mq *MaintenanceQueue) countRunningTasksLocked(taskType MaintenanceTaskType) int {
	count := 0
	for _, task := range mq.tasks {
		if task.Type == taskType &&
			(task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress) {
			count++
		}
	}
	return count
}
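// exampleRegisterWorker is a sketch of the registration/heartbeat handshake,
// assuming MaintenanceWorker's Capabilities field is a []MaintenanceTaskType
// (matching GetNextTask's parameter). Field values are illustrative.
func exampleRegisterWorker(mq *MaintenanceQueue) {
	mq.RegisterWorker(&MaintenanceWorker{
		ID:            "worker-1",
		Address:       "10.0.0.5:9333",
		Capabilities:  []MaintenanceTaskType{MaintenanceTaskType("vacuum")},
		MaxConcurrent: 2,
	})

	// Periodic heartbeats keep the worker from being reaped by RemoveStaleWorkers.
	mq.UpdateWorkerHeartbeat("worker-1")
}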
// WasTaskRecentlyCompleted checks if a similar task was recently completed
func (mq *MaintenanceQueue) WasTaskRecentlyCompleted(taskType MaintenanceTaskType, volumeID uint32, server string, now time.Time) bool {
	mq.mutex.RLock()
	defer mq.mutex.RUnlock()

	// Get the repeat prevention interval for this task type
	interval := mq.getRepeatPreventionInterval(taskType)
	cutoff := now.Add(-interval)

	for _, task := range mq.tasks {
		if task.Type == taskType &&
			task.VolumeID == volumeID &&
			task.Server == server &&
			task.Status == TaskStatusCompleted &&
			task.CompletedAt != nil &&
			task.CompletedAt.After(cutoff) {
			return true
		}
	}
	return false
}

// getRepeatPreventionInterval returns the interval for preventing task repetition
func (mq *MaintenanceQueue) getRepeatPreventionInterval(taskType MaintenanceTaskType) time.Duration {
	// First, try the task scheduler's default
	if mq.integration != nil {
		if scheduler := mq.integration.GetTaskScheduler(taskType); scheduler != nil {
			defaultInterval := scheduler.GetDefaultRepeatInterval()
			if defaultInterval > 0 {
				glog.V(3).Infof("Using task scheduler default repeat interval for %s: %v", taskType, defaultInterval)
				return defaultInterval
			}
		}
	}

	// Fall back to policy configuration if no scheduler is available or it provides no default
	if mq.policy != nil {
		repeatIntervalHours := GetRepeatInterval(mq.policy, taskType)
		if repeatIntervalHours > 0 {
			interval := time.Duration(repeatIntervalHours) * time.Hour
			glog.V(3).Infof("Using policy configuration repeat interval for %s: %v", taskType, interval)
			return interval
		}
	}

	// Final fallback: a minimal safe default
	glog.V(2).Infof("No scheduler or policy configuration found for task type %s, using minimal default: 1h", taskType)
	return time.Hour
}

// GetTasks returns tasks with optional filtering.
// Note: the limit is applied in map-iteration order, before sorting.
func (mq *MaintenanceQueue) GetTasks(status MaintenanceTaskStatus, taskType MaintenanceTaskType, limit int) []*MaintenanceTask {
	// Copy matching tasks while holding the lock for minimal time
	mq.mutex.RLock()
	tasksCopy := make([]*MaintenanceTask, 0, len(mq.tasks))
	for _, task := range mq.tasks {
		if status != "" && task.Status != status {
			continue
		}
		if taskType != "" && task.Type != taskType {
			continue
		}
		// Create a shallow copy to avoid data races
		taskCopy := *task
		tasksCopy = append(tasksCopy, &taskCopy)
		if limit > 0 && len(tasksCopy) >= limit {
			break
		}
	}
	mq.mutex.RUnlock()

	// Sort after releasing the lock to prevent deadlocks
	sort.Slice(tasksCopy, func(i, j int) bool {
		return tasksCopy[i].CreatedAt.After(tasksCopy[j].CreatedAt)
	})

	return tasksCopy
}

// GetWorkers returns all registered workers
func (mq *MaintenanceQueue) GetWorkers() []*MaintenanceWorker {
	mq.mutex.RLock()
	workers := make([]*MaintenanceWorker, 0, len(mq.workers))
	for _, worker := range mq.workers {
		// Create a shallow copy to avoid data races
		workerCopy := *worker
		workers = append(workers, &workerCopy)
	}
	mq.mutex.RUnlock()
	return workers
}

// generateTaskID generates a unique task ID: 8 random charset characters
// plus the last four digits of the Unix timestamp
func generateTaskID() string {
	const charset = "abcdefghijklmnopqrstuvwxyz0123456789"
	b := make([]byte, 8)
	randBytes := make([]byte, 8)

	// Generate random bytes
	if _, err := rand.Read(randBytes); err != nil {
		// Fall back to a timestamp-based ID if crypto/rand fails
		timestamp := time.Now().UnixNano()
		return fmt.Sprintf("task-%d", timestamp)
	}

	// Map the random bytes onto the charset
	for i := range b {
		b[i] = charset[int(randBytes[i])%len(charset)]
	}

	// Add a timestamp suffix to reduce the chance of collisions
	timestamp := time.Now().Unix() % 10000
	return fmt.Sprintf("%s-%04d", string(b), timestamp)
}
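// exampleListFailed is a sketch of the filtering API: list up to 20 failed
// vacuum tasks, newest first. Empty status/type arguments would match all
// statuses and types.
func exampleListFailed(mq *MaintenanceQueue) []*MaintenanceTask {
	return mq.GetTasks(TaskStatusFailed, MaintenanceTaskType("vacuum"), 20)
}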
// RetryTask manually retries a failed or pending task
func (mq *MaintenanceQueue) RetryTask(taskID string) error {
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	task, exists := mq.tasks[taskID]
	if !exists {
		return fmt.Errorf("task %s not found", taskID)
	}

	// Only allow retry for failed or pending tasks
	if task.Status != TaskStatusFailed && task.Status != TaskStatusPending {
		return fmt.Errorf("task %s cannot be retried (status: %s)", taskID, task.Status)
	}

	// Reset the task for retry
	now := time.Now()
	task.Status = TaskStatusPending
	task.WorkerID = ""
	task.StartedAt = nil
	task.CompletedAt = nil
	task.Error = ""
	task.ScheduledAt = now // Schedule immediately
	task.Progress = 0

	// Close out the last assignment record if the task was previously assigned
	if len(task.AssignmentHistory) > 0 {
		lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
		if lastAssignment.UnassignedAt == nil {
			unassignedTime := now
			lastAssignment.UnassignedAt = &unassignedTime
			lastAssignment.Reason = "Manual retry requested"
		}
	}

	// Remove from the current pending list, if present, to avoid duplicates
	for i, pendingTask := range mq.pendingTasks {
		if pendingTask.ID == taskID {
			mq.pendingTasks = append(mq.pendingTasks[:i], mq.pendingTasks[i+1:]...)
			break
		}
	}

	// Add back to the pending queue
	mq.pendingTasks = append(mq.pendingTasks, task)

	// Save task state
	mq.saveTaskState(task)

	glog.Infof("Task manually retried: %s (%s) for volume %d", taskID, task.Type, task.VolumeID)
	return nil
}

// CleanupOldTasks removes completed or failed tasks older than the retention
// period from the in-memory map, returning the number removed
func (mq *MaintenanceQueue) CleanupOldTasks(retention time.Duration) int {
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	cutoff := time.Now().Add(-retention)
	removed := 0

	for id, task := range mq.tasks {
		if (task.Status == TaskStatusCompleted || task.Status == TaskStatusFailed) &&
			task.CompletedAt != nil &&
			task.CompletedAt.Before(cutoff) {
			delete(mq.tasks, id)
			removed++
		}
	}

	glog.V(2).Infof("Cleaned up %d old maintenance tasks", removed)
	return removed
}

// RemoveStaleWorkers removes workers that haven't sent a heartbeat recently
func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
	mq.mutex.Lock()
	defer mq.mutex.Unlock()

	cutoff := time.Now().Add(-timeout)
	removed := 0

	for id, worker := range mq.workers {
		if worker.LastHeartbeat.Before(cutoff) {
			// Mark any assigned tasks as failed and record the unassignment
			for _, task := range mq.tasks {
				if task.WorkerID == id &&
					(task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress) {
					if len(task.AssignmentHistory) > 0 {
						lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
						if lastAssignment.UnassignedAt == nil {
							unassignedTime := time.Now()
							lastAssignment.UnassignedAt = &unassignedTime
							lastAssignment.Reason = "Worker became unavailable (stale heartbeat)"
						}
					}
					task.Status = TaskStatusFailed
					task.Error = "Worker became unavailable"
					completedTime := time.Now()
					task.CompletedAt = &completedTime
					// Persist the failure so it survives a restart
					mq.saveTaskState(task)
				}
			}

			delete(mq.workers, id)
			removed++
			glog.Warningf("Removed stale maintenance worker %s", id)
		}
	}

	return removed
}
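// exampleHousekeeping is a sketch of a periodic housekeeping loop, assuming it
// runs on its own goroutine; the tick, retention, and timeout values are
// illustrative.
func exampleHousekeeping(mq *MaintenanceQueue) {
	ticker := time.NewTicker(10 * time.Minute)
	defer ticker.Stop()
	for range ticker.C {
		mq.CleanupOldTasks(7 * 24 * time.Hour) // drop week-old completed/failed tasks
		mq.RemoveStaleWorkers(5 * time.Minute) // fail tasks held by silent workers
	}
}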
// GetStats returns maintenance statistics
func (mq *MaintenanceQueue) GetStats() *MaintenanceStats {
	mq.mutex.RLock()
	defer mq.mutex.RUnlock()

	stats := &MaintenanceStats{
		TotalTasks:    len(mq.tasks),
		TasksByStatus: make(map[MaintenanceTaskStatus]int),
		TasksByType:   make(map[MaintenanceTaskType]int),
		ActiveWorkers: 0,
	}

	today := time.Now().Truncate(24 * time.Hour)
	var totalDuration time.Duration
	var completedTasks int

	for _, task := range mq.tasks {
		stats.TasksByStatus[task.Status]++
		stats.TasksByType[task.Type]++

		if task.CompletedAt != nil && task.CompletedAt.After(today) {
			if task.Status == TaskStatusCompleted {
				stats.CompletedToday++
			} else if task.Status == TaskStatusFailed {
				stats.FailedToday++
			}
			if task.StartedAt != nil {
				duration := task.CompletedAt.Sub(*task.StartedAt)
				totalDuration += duration
				completedTasks++
			}
		}
	}

	for _, worker := range mq.workers {
		if worker.Status == "active" || worker.Status == "busy" {
			stats.ActiveWorkers++
		}
	}

	if completedTasks > 0 {
		stats.AverageTaskTime = totalDuration / time.Duration(completedTasks)
	}

	return stats
}

// workerCanHandle checks if a worker can handle a specific task type
func (mq *MaintenanceQueue) workerCanHandle(taskType MaintenanceTaskType, capabilities []MaintenanceTaskType) bool {
	for _, capability := range capabilities {
		if capability == taskType {
			return true
		}
	}
	return false
}

// canScheduleTaskNow determines if a task can be scheduled, using task schedulers
// or fallback logic. Called from GetNextTask with the queue lock already held.
func (mq *MaintenanceQueue) canScheduleTaskNow(task *MaintenanceTask) bool {
	glog.V(2).Infof("Checking if task %s (type: %s) can be scheduled", task.ID, task.Type)

	// TEMPORARY FIX: Skip the integration task scheduler, which is being overly
	// restrictive; use the fallback logic directly for now.
	glog.V(2).Infof("Using fallback logic for task scheduling")
	canExecute := mq.canExecuteTaskType(task.Type)
	glog.V(2).Infof("Fallback decision for task %s: %v", task.ID, canExecute)
	return canExecute

	// NOTE: Original integration code disabled temporarily
	/*
		if mq.integration != nil {
			// Get all running tasks and available workers
			runningTasks := mq.getRunningTasks()
			availableWorkers := mq.getAvailableWorkers()
			canSchedule := mq.integration.CanScheduleWithTaskSchedulers(task, runningTasks, availableWorkers)
			glog.V(2).Infof("Task scheduler decision for task %s (%s): %v", task.ID, task.Type, canSchedule)
			return canSchedule
		}
	*/
}

// canExecuteTaskType checks whether more tasks of this type may run (concurrency
// limits); fallback logic. Uses the lock-free counter because the caller already
// holds the queue lock.
func (mq *MaintenanceQueue) canExecuteTaskType(taskType MaintenanceTaskType) bool {
	runningCount := mq.countRunningTasksLocked(taskType)
	maxConcurrent := mq.getMaxConcurrentForTaskType(taskType)
	canExecute := runningCount < maxConcurrent

	glog.V(3).Infof("canExecuteTaskType for %s: running=%d, max=%d, canExecute=%v",
		taskType, runningCount, maxConcurrent, canExecute)
	return canExecute
}

// getMaxConcurrentForTaskType returns the maximum number of concurrent tasks allowed for a task type
func (mq *MaintenanceQueue) getMaxConcurrentForTaskType(taskType MaintenanceTaskType) int {
	// First, try the task scheduler's default
	if mq.integration != nil {
		if scheduler := mq.integration.GetTaskScheduler(taskType); scheduler != nil {
			maxConcurrent := scheduler.GetMaxConcurrent()
			if maxConcurrent > 0 {
				glog.V(3).Infof("Using task scheduler max concurrent for %s: %d", taskType, maxConcurrent)
				return maxConcurrent
			}
		}
	}

	// Fall back to policy configuration if no scheduler is available or it provides no default
	if mq.policy != nil {
		maxConcurrent := GetMaxConcurrent(mq.policy, taskType)
		if maxConcurrent > 0 {
			glog.V(3).Infof("Using policy configuration max concurrent for %s: %d", taskType, maxConcurrent)
			return maxConcurrent
		}
	}

	// Final fallback: a minimal safe default
	glog.V(2).Infof("No scheduler or policy configuration found for task type %s, using minimal default: 1", taskType)
	return 1
}
// getRunningTasks returns all currently running tasks; callers must hold the queue lock
func (mq *MaintenanceQueue) getRunningTasks() []*MaintenanceTask {
	var runningTasks []*MaintenanceTask
	for _, task := range mq.tasks {
		if task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress {
			runningTasks = append(runningTasks, task)
		}
	}
	return runningTasks
}

// getAvailableWorkers returns all workers that can take more work; callers must hold the queue lock
func (mq *MaintenanceQueue) getAvailableWorkers() []*MaintenanceWorker {
	var availableWorkers []*MaintenanceWorker
	for _, worker := range mq.workers {
		if worker.Status == "active" && worker.CurrentLoad < worker.MaxConcurrent {
			availableWorkers = append(availableWorkers, worker)
		}
	}
	return availableWorkers
}

// trackPendingOperation adds a task to the pending operations tracker
func (mq *MaintenanceQueue) trackPendingOperation(task *MaintenanceTask) {
	if mq.integration == nil {
		return
	}

	pendingOps := mq.integration.GetPendingOperations()
	if pendingOps == nil {
		return
	}

	// Skip tracking for tasks without proper typed parameters
	if task.TypedParams == nil {
		glog.V(2).Infof("Skipping pending operation tracking for task %s - no typed parameters", task.ID)
		return
	}

	// Map the maintenance task type to a pending operation type
	var opType PendingOperationType
	switch task.Type {
	case MaintenanceTaskType("balance"):
		opType = OpTypeVolumeBalance
	case MaintenanceTaskType("erasure_coding"):
		opType = OpTypeErasureCoding
	case MaintenanceTaskType("vacuum"):
		opType = OpTypeVacuum
	case MaintenanceTaskType("replication"):
		opType = OpTypeReplication
	default:
		opType = OpTypeVolumeMove
	}

	// Determine the destination node and estimated size from the unified targets
	destNode := ""
	estimatedSize := uint64(1024 * 1024 * 1024) // Default 1GB estimate

	// Use the unified targets array - the only source of truth
	if len(task.TypedParams.Targets) > 0 {
		destNode = task.TypedParams.Targets[0].Node
		if task.TypedParams.Targets[0].EstimatedSize > 0 {
			estimatedSize = task.TypedParams.Targets[0].EstimatedSize
		}
	}

	// Determine the source node from the unified sources
	sourceNode := ""
	if len(task.TypedParams.Sources) > 0 {
		sourceNode = task.TypedParams.Sources[0].Node
	}

	operation := &PendingOperation{
		VolumeID:      task.VolumeID,
		OperationType: opType,
		SourceNode:    sourceNode,
		DestNode:      destNode,
		TaskID:        task.ID,
		StartTime:     time.Now(),
		EstimatedSize: estimatedSize,
		Collection:    task.Collection,
		Status:        "assigned",
	}

	pendingOps.AddOperation(operation)
}

// removePendingOperation removes a task from the pending operations tracker
func (mq *MaintenanceQueue) removePendingOperation(taskID string) {
	if mq.integration == nil {
		return
	}
	pendingOps := mq.integration.GetPendingOperations()
	if pendingOps == nil {
		return
	}
	pendingOps.RemoveOperation(taskID)
}

// updatePendingOperationStatus updates the status of a pending operation
func (mq *MaintenanceQueue) updatePendingOperationStatus(taskID string, status string) {
	if mq.integration == nil {
		return
	}
	pendingOps := mq.integration.GetPendingOperations()
	if pendingOps == nil {
		return
	}
	pendingOps.UpdateOperationStatus(taskID, status)
}
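// exampleLogStats is a sketch that surfaces queue health in the logs, for
// instance from an admin endpoint or a periodic monitor; the log wording is
// illustrative.
func exampleLogStats(mq *MaintenanceQueue) {
	stats := mq.GetStats()
	glog.Infof("maintenance: %d tasks total, %d active workers, avg task time %v",
		stats.TotalTasks, stats.ActiveWorkers, stats.AverageTaskTime)
}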