@@ -131,6 +131,14 @@ func (mq *MaintenanceQueue) cleanupCompletedTasks() {
 func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
 	mq.mutex.Lock()
 
+	// Enforce one queued/active task per volume (across all task types).
+	if mq.hasQueuedOrActiveTaskForVolume(task.VolumeID) {
+		mq.mutex.Unlock()
+		glog.V(1).Infof("Task skipped (volume busy): %s for volume %d on %s (already queued or running)",
+			task.Type, task.VolumeID, task.Server)
+		return
+	}
+
 	// Check for duplicate tasks (same type + volume + not completed)
 	if mq.hasDuplicateTask(task) {
 		mq.mutex.Unlock()
@@ -188,6 +196,25 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
 		taskSnapshot.ID, taskSnapshot.Type, taskSnapshot.VolumeID, taskSnapshot.Server, taskSnapshot.Priority, scheduleInfo, taskSnapshot.Reason)
 }
 
+// hasQueuedOrActiveTaskForVolume checks if any pending/assigned/in-progress task already exists for this volume.
+// Caller must hold mq.mutex.
+func (mq *MaintenanceQueue) hasQueuedOrActiveTaskForVolume(volumeID uint32) bool {
+	if volumeID == 0 {
+		return false
+	}
+	for _, existingTask := range mq.tasks {
+		if existingTask.VolumeID != volumeID {
+			continue
+		}
+		if existingTask.Status == TaskStatusPending ||
+			existingTask.Status == TaskStatusAssigned ||
+			existingTask.Status == TaskStatusInProgress {
+			return true
+		}
+	}
+	return false
+}
+
 // hasDuplicateTask checks if a similar task already exists (same type, volume, and not completed)
 func (mq *MaintenanceQueue) hasDuplicateTask(newTask *MaintenanceTask) bool {
 	for _, existingTask := range mq.tasks {
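The new guard in AddTask relies on this helper: any pending, assigned, or in-progress task for the same volume blocks a new submission, regardless of task type. Below is a minimal standalone sketch of that check, using simplified stand-in types rather than the real MaintenanceQueue/MaintenanceTask definitions:

package main

import "fmt"

// Simplified stand-ins for the real types; status names mirror the constants in the diff.
type TaskStatus string

const (
	TaskStatusPending    TaskStatus = "pending"
	TaskStatusAssigned   TaskStatus = "assigned"
	TaskStatusInProgress TaskStatus = "in_progress"
	TaskStatusCompleted  TaskStatus = "completed"
)

type Task struct {
	ID       string
	VolumeID uint32
	Status   TaskStatus
}

// hasQueuedOrActiveTaskForVolume mirrors the helper added in the diff:
// any pending/assigned/in-progress task for the volume blocks a new one.
func hasQueuedOrActiveTaskForVolume(tasks map[string]*Task, volumeID uint32) bool {
	if volumeID == 0 {
		return false
	}
	for _, t := range tasks {
		if t.VolumeID != volumeID {
			continue
		}
		switch t.Status {
		case TaskStatusPending, TaskStatusAssigned, TaskStatusInProgress:
			return true
		}
	}
	return false
}

func main() {
	tasks := map[string]*Task{
		"t1": {ID: "t1", VolumeID: 42, Status: TaskStatusInProgress},
		"t2": {ID: "t2", VolumeID: 7, Status: TaskStatusCompleted},
	}
	fmt.Println(hasQueuedOrActiveTaskForVolume(tasks, 42)) // true: volume 42 is busy
	fmt.Println(hasQueuedOrActiveTaskForVolume(tasks, 7))  // false: only a completed task remains
	fmt.Println(hasQueuedOrActiveTaskForVolume(tasks, 0))  // false: volume 0 is treated as "no volume"
}

Treating volume 0 as "no volume" keeps tasks that are not tied to a specific volume from blocking one another.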
@@ -260,6 +287,13 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
 			continue
 		}
 
+		// Avoid scheduling concurrent operations on the same volume
+		if activeTaskID, activeTaskType, hasActive := mq.activeTaskForVolume(task.VolumeID, task.ID); hasActive {
+			glog.V(2).Infof("Task %s (%s) skipped for worker %s: volume %d is busy with task %s (%s)",
+				task.ID, task.Type, workerID, task.VolumeID, activeTaskID, activeTaskType)
+			continue
+		}
+
 		// Check if worker can handle this task type
 		if !mq.workerCanHandle(task.Type, capabilities) {
 			glog.V(3).Infof("Task %s (%s) skipped for worker %s: capability mismatch (worker has: %v)", task.ID, task.Type, workerID, capabilities)
@@ -304,6 +338,14 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
 		return nil
 	}
 
+	// Re-check volume conflict after acquiring write lock
+	if activeTaskID, activeTaskType, hasActive := mq.activeTaskForVolume(selectedTask.VolumeID, selectedTaskID); hasActive {
+		mq.mutex.Unlock()
+		glog.V(2).Infof("Task %s no longer available for worker %s: volume %d is busy with task %s (%s)",
+			selectedTaskID, workerID, selectedTask.VolumeID, activeTaskID, activeTaskType)
+		return nil
+	}
+
 	// Record assignment history
 	workerAddress := ""
 	if worker, exists := mq.workers[workerID]; exists {
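The re-check in this hunk is a check-then-act re-validation: GetNextTask picks a candidate while scanning the queue, but by the time the write lock is held for assignment another worker may already have claimed the same volume, so the conflict test runs again before committing. A rough sketch of that pattern under assumed, simplified types (the busy map and method names here are hypothetical, not the queue's real fields):

package main

import (
	"errors"
	"fmt"
	"sync"
)

// queue is a toy stand-in; the real MaintenanceQueue guards its task map the same way.
type queue struct {
	mu   sync.RWMutex
	busy map[uint32]string // volumeID -> task ID currently assigned/in progress (hypothetical field)
}

var errVolumeBusy = errors.New("volume busy")

// assign re-checks the conflict after re-acquiring the lock, because another
// goroutine may have claimed the volume between the scan and the assignment.
func (q *queue) assign(volumeID uint32, taskID string) error {
	// First pass: cheap read-locked scan to find a candidate.
	q.mu.RLock()
	_, busy := q.busy[volumeID]
	q.mu.RUnlock()
	if busy {
		return errVolumeBusy
	}

	// Second pass: re-validate under the write lock before committing.
	q.mu.Lock()
	defer q.mu.Unlock()
	if _, busy := q.busy[volumeID]; busy {
		return errVolumeBusy // state changed between the two lock acquisitions
	}
	q.busy[volumeID] = taskID
	return nil
}

func main() {
	q := &queue{busy: map[uint32]string{}}
	fmt.Println(q.assign(42, "task-a")) // <nil>: volume 42 was free
	fmt.Println(q.assign(42, "task-b")) // volume busy: task-a already claimed it
}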
@@ -877,6 +919,28 @@ func (mq *MaintenanceQueue) workerCanHandle(taskType MaintenanceTaskType, capabi
 	return false
 }
 
+// activeTaskForVolume returns the active task ID/type for a volume, if any.
+// Caller must hold mq.mutex (read or write).
+func (mq *MaintenanceQueue) activeTaskForVolume(volumeID uint32, excludeTaskID string) (string, MaintenanceTaskType, bool) {
+	if volumeID == 0 {
+		return "", "", false
+	}
+
+	for _, task := range mq.tasks {
+		if task.ID == excludeTaskID {
+			continue
+		}
+		if task.VolumeID != volumeID {
+			continue
+		}
+		if task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress {
+			return task.ID, task.Type, true
+		}
+	}
+
+	return "", "", false
+}
+
 // canScheduleTaskNow determines if a task can be scheduled using task schedulers or fallback logic
 func (mq *MaintenanceQueue) canScheduleTaskNow(task *MaintenanceTask) bool {
 	glog.V(2).Infof("Checking if task %s (type: %s) can be scheduled", task.ID, task.Type)