package task

import (
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
)

// FailureHandler handles various failure scenarios in the task system
type FailureHandler struct {
	config *AdminConfig
}

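// For readability: AdminConfig and InProgressTask are defined elsewhere in
// this package. As a rough sketch (not the authoritative definitions), this
// handler only relies on fields along these lines:
//
//	type AdminConfig struct {
//		MaxRetries    int
//		TaskTimeout   time.Duration
//		WorkerTimeout time.Duration
//	}
//
//	type InProgressTask struct {
//		Task       *Task // ID, Type, VolumeID, RetryCount, Error, ScheduledAt
//		LastUpdate time.Time
//	}
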
// NewFailureHandler creates a new failure handler
func NewFailureHandler(config *AdminConfig) *FailureHandler {
	return &FailureHandler{
		config: config,
	}
}

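// Illustrative wiring (names are hypothetical; the surrounding admin server
// owns the config and tracks in-progress tasks):
//
//	fh := NewFailureHandler(adminConfig)
//	fh.HandleWorkerTimeout(workerID, tasksAssignedToWorker)
//	fh.HandleTaskStuck(stuckTask)
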
// HandleWorkerTimeout handles worker timeout scenarios: every task assigned
// to the unresponsive worker is failed and routed through the normal
// retry/permanent-failure path
func (fh *FailureHandler) HandleWorkerTimeout(workerID string, affectedTasks []*InProgressTask) {
	glog.Warningf("Handling worker timeout for worker %s with %d affected tasks", workerID, len(affectedTasks))

	for _, task := range affectedTasks {
		fh.handleTaskFailure(task, "worker_timeout", "Worker became unresponsive")
	}
}

// HandleTaskStuck handles tasks that have made no progress within the
// configured timeout period
func (fh *FailureHandler) HandleTaskStuck(task *InProgressTask) {
	glog.Warningf("Handling stuck task %s (no progress for %v)", task.Task.ID, time.Since(task.LastUpdate))

	fh.handleTaskFailure(task, "task_stuck", "Task made no progress within timeout period")
}

// HandleTaskFailure handles general task failure scenarios reported by
// callers; it logs the failure and delegates to the internal handler
func (fh *FailureHandler) HandleTaskFailure(task *InProgressTask, reason string, details string) {
	glog.Errorf("Handling task failure for task %s: %s - %s", task.Task.ID, reason, details)

	fh.handleTaskFailure(task, reason, details)
}

// handleTaskFailure is the internal handler for task failures
func (fh *FailureHandler) handleTaskFailure(task *InProgressTask, reason string, details string) {
	// Record failure reason
	task.Task.Error = details

	// Determine if task should be retried
	if task.Task.RetryCount < fh.config.MaxRetries {
		fh.scheduleRetry(task, reason)
	} else {
		fh.markTaskFailed(task, reason)
	}
}

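// For example, with MaxRetries = 3 a task is retried after each of its first
// three failures (RetryCount is 0, 1, 2 at the time of the check) and is
// marked permanently failed on the fourth.
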
// scheduleRetry schedules a task for retry
func (fh *FailureHandler) scheduleRetry(task *InProgressTask, reason string) {
	task.Task.RetryCount++

	// Calculate retry delay with linear backoff (RetryCount * 5 minutes)
	retryDelay := time.Duration(task.Task.RetryCount) * 5 * time.Minute
	task.Task.ScheduledAt = time.Now().Add(retryDelay)

	glog.Infof("Scheduling retry %d/%d for task %s (reason: %s, delay: %v)",
		task.Task.RetryCount, fh.config.MaxRetries, task.Task.ID, reason, retryDelay)
}

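// With the fixed 5-minute step above, the delay grows linearly with the
// retry count: retry 1 waits 5m, retry 2 waits 10m, retry 3 waits 15m.
// If genuinely exponential backoff is wanted instead, a minimal sketch
// (not current behavior) would double the delay on each attempt:
//
//	retryDelay := (5 * time.Minute) << (task.Task.RetryCount - 1) // 5m, 10m, 20m, ...
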
// markTaskFailed permanently marks a task as failed
func (fh *FailureHandler) markTaskFailed(task *InProgressTask, reason string) {
	glog.Errorf("Task %s permanently failed after %d retries (reason: %s)",
		task.Task.ID, task.Task.RetryCount, reason)

	// Could trigger alerts or notifications here
	fh.sendFailureAlert(task, reason)
}

// sendFailureAlert sends alerts for permanently failed tasks
func (fh *FailureHandler) sendFailureAlert(task *InProgressTask, reason string) {
	// In a real implementation, this would:
	// 1. Send notifications to administrators
	// 2. Update monitoring dashboards
	// 3. Log to audit trails
	// 4. Possibly trigger automatic remediation

	glog.Errorf("ALERT: Task permanently failed - ID: %s, Type: %s, Volume: %d, Reason: %s",
		task.Task.ID, task.Task.Type, task.Task.VolumeID, reason)
}

// HandleDuplicateTask handles duplicate task detection
func (fh *FailureHandler) HandleDuplicateTask(existingTaskID string, duplicateTaskID string, volumeID uint32) {
	glog.Warningf("Detected duplicate task for volume %d: existing=%s, duplicate=%s",
		volumeID, existingTaskID, duplicateTaskID)

	// Cancel the duplicate task
	// In a real implementation, this would send a cancellation signal
}

// HandleResourceExhaustion handles resource exhaustion scenarios
func (fh *FailureHandler) HandleResourceExhaustion(workerID string, taskType string) {
	glog.Warningf("Worker %s reported resource exhaustion for task type %s", workerID, taskType)

	// Could implement:
	// 1. Temporary worker blacklisting
	// 2. Task redistribution
	// 3. Resource monitoring alerts
}

// GetFailureStats returns failure statistics
func (fh *FailureHandler) GetFailureStats() map[string]interface{} {
	// In a real implementation, this would track:
	// - Failure rates by type
	// - Worker reliability scores
	// - Task retry statistics
	// - System health metrics

	return map[string]interface{}{
		"enabled":        true,
		"max_retries":    fh.config.MaxRetries,
		"task_timeout":   fh.config.TaskTimeout.String(),
		"worker_timeout": fh.config.WorkerTimeout.String(),
	}
}
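
// Example output, assuming illustrative config values MaxRetries=3,
// TaskTimeout=10*time.Minute, WorkerTimeout=time.Minute:
//
//	map[string]interface{}{
//		"enabled":        true,
//		"max_retries":    3,
//		"task_timeout":   "10m0s",
//		"worker_timeout": "1m0s",
//	}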