package maintenance

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
)

// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy
func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy {
	policy := &worker_pb.MaintenancePolicy{
		GlobalMaxConcurrent:          4,
		DefaultRepeatIntervalSeconds: 6 * 3600,  // 6 hours in seconds
		DefaultCheckIntervalSeconds:  12 * 3600, // 12 hours in seconds
		TaskPolicies:                 make(map[string]*worker_pb.TaskPolicy),
	}

	// Load vacuum task configuration
	if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil {
		policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{
			Enabled:               vacuumConfig.Enabled,
			MaxConcurrent:         int32(vacuumConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(vacuumConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
				VacuumConfig: &worker_pb.VacuumTaskConfig{
					GarbageThreshold:   float64(vacuumConfig.GarbageThreshold),
					MinVolumeAgeHours:  int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours
					MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds),
				},
			},
		}
	}

	// Load erasure coding task configuration
	if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil {
		policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{
			Enabled:               ecConfig.Enabled,
			MaxConcurrent:         int32(ecConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(ecConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
				ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
					FullnessRatio:    float64(ecConfig.FullnessRatio),
					QuietForSeconds:  int32(ecConfig.QuietForSeconds),
					MinVolumeSizeMb:  int32(ecConfig.MinSizeMB),
					CollectionFilter: ecConfig.CollectionFilter,
				},
			},
		}
	}

	// Load balance task configuration
	if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil {
		policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{
			Enabled:               balanceConfig.Enabled,
			MaxConcurrent:         int32(balanceConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(balanceConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
				BalanceConfig: &worker_pb.BalanceTaskConfig{
					ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold),
					MinServerCount:     int32(balanceConfig.MinServerCount),
				},
			},
		}
	}

	glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies))
	return policy
}

// MaintenanceManager coordinates the maintenance system
type MaintenanceManager struct {
	config      *MaintenanceConfig
	scanner     *MaintenanceScanner
	queue       *MaintenanceQueue
	adminClient AdminClient
	running     bool
	stopChan    chan struct{}

	// Error handling and backoff
	errorCount     int
	lastError      error
	lastErrorTime  time.Time
	backoffDelay   time.Duration
	mutex          sync.RWMutex
	scanInProgress bool
}

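// Note on concurrency: errorCount, lastError, lastErrorTime, backoffDelay, and
// scanInProgress are shared between the background scan loop and external callers
// and are guarded by mutex; the remaining fields are wired up in NewMaintenanceManager.
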
// NewMaintenanceManager creates a new maintenance manager
func NewMaintenanceManager(adminClient AdminClient, config *MaintenanceConfig) *MaintenanceManager {
	if config == nil {
		config = DefaultMaintenanceConfig()
	}

	// Use the policy from the config (which is populated from separate task files in LoadMaintenanceConfig)
	policy := config.Policy
	if policy == nil {
		// Fallback: build policy from separate task configuration files if not already populated
		policy = buildPolicyFromTaskConfigs()
	}

	queue := NewMaintenanceQueue(policy)
	scanner := NewMaintenanceScanner(adminClient, policy, queue)

	return &MaintenanceManager{
		config:       config,
		scanner:      scanner,
		queue:        queue,
		adminClient:  adminClient,
		stopChan:     make(chan struct{}),
		backoffDelay: time.Second, // Start with 1 second backoff
	}
}

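// Example usage (sketch only; adminClient and config are assumed to be provided
// by the caller, e.g. an AdminClient implementation and a loaded MaintenanceConfig):
//
//	manager := NewMaintenanceManager(adminClient, config)
//	if err := manager.Start(); err != nil {
//		glog.Fatalf("failed to start maintenance manager: %v", err)
//	}
//	defer manager.Stop()
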
// Start begins the maintenance manager
func (mm *MaintenanceManager) Start() error {
	if !mm.config.Enabled {
		glog.V(1).Infof("Maintenance system is disabled")
		return nil
	}

	// Validate configuration durations to prevent ticker panics
	if err := mm.validateConfig(); err != nil {
		return fmt.Errorf("invalid maintenance configuration: %w", err)
	}

	mm.running = true

	// Start background processes
	go mm.scanLoop()
	go mm.cleanupLoop()

	glog.Infof("Maintenance manager started with scan interval %ds", mm.config.ScanIntervalSeconds)
	return nil
}

// validateConfig validates the maintenance configuration durations
func (mm *MaintenanceManager) validateConfig() error {
	if mm.config.ScanIntervalSeconds <= 0 {
		glog.Warningf("Invalid scan interval %ds, using default 30m", mm.config.ScanIntervalSeconds)
		mm.config.ScanIntervalSeconds = 30 * 60 // 30 minutes in seconds
	}
	if mm.config.CleanupIntervalSeconds <= 0 {
		glog.Warningf("Invalid cleanup interval %ds, using default 24h", mm.config.CleanupIntervalSeconds)
		mm.config.CleanupIntervalSeconds = 24 * 60 * 60 // 24 hours in seconds
	}
	if mm.config.WorkerTimeoutSeconds <= 0 {
		glog.Warningf("Invalid worker timeout %ds, using default 5m", mm.config.WorkerTimeoutSeconds)
		mm.config.WorkerTimeoutSeconds = 5 * 60 // 5 minutes in seconds
	}
	if mm.config.TaskTimeoutSeconds <= 0 {
		glog.Warningf("Invalid task timeout %ds, using default 2h", mm.config.TaskTimeoutSeconds)
		mm.config.TaskTimeoutSeconds = 2 * 60 * 60 // 2 hours in seconds
	}
	if mm.config.RetryDelaySeconds <= 0 {
		glog.Warningf("Invalid retry delay %ds, using default 15m", mm.config.RetryDelaySeconds)
		mm.config.RetryDelaySeconds = 15 * 60 // 15 minutes in seconds
	}
	if mm.config.TaskRetentionSeconds <= 0 {
		glog.Warningf("Invalid task retention %ds, using default 168h", mm.config.TaskRetentionSeconds)
		mm.config.TaskRetentionSeconds = 7 * 24 * 60 * 60 // 7 days in seconds
	}
	return nil
}

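// Note that validateConfig repairs non-positive durations in place (scan 30m,
// cleanup 24h, worker timeout 5m, task timeout 2h, retry delay 15m, retention 7d)
// and currently always returns nil; the error return is kept so callers such as
// Start can treat a future validation failure as fatal.
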
// IsRunning returns whether the maintenance manager is currently running
func (mm *MaintenanceManager) IsRunning() bool {
	return mm.running
}

// Stop terminates the maintenance manager
func (mm *MaintenanceManager) Stop() {
	mm.running = false
	close(mm.stopChan)
	glog.Infof("Maintenance manager stopped")
}

// scanLoop periodically scans for maintenance tasks with adaptive timing
func (mm *MaintenanceManager) scanLoop() {
	scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
	ticker := time.NewTicker(scanInterval)
	defer ticker.Stop()

	for mm.running {
		select {
		case <-mm.stopChan:
			return
		case <-ticker.C:
			glog.V(1).Infof("Performing scheduled maintenance scan (interval %v)", scanInterval)

			// Use the same synchronization as TriggerScan to prevent concurrent scans
			if err := mm.triggerScanInternal(false); err != nil {
				glog.V(1).Infof("Scheduled scan skipped: %v", err)
			}

			// Adjust ticker interval based on error state (read error state safely)
			currentInterval := mm.getScanInterval(scanInterval)

			// Reset ticker with new interval if needed
			if currentInterval != scanInterval {
				ticker.Stop()
				ticker = time.NewTicker(currentInterval)
			}
		}
	}
}

// getScanInterval safely reads the current scan interval with error backoff
func (mm *MaintenanceManager) getScanInterval(baseInterval time.Duration) time.Duration {
	mm.mutex.RLock()
	defer mm.mutex.RUnlock()

	if mm.errorCount > 0 {
		// Use backoff delay when there are errors
		currentInterval := mm.backoffDelay
		if currentInterval > baseInterval {
			// Don't make it longer than the configured interval * 10
			maxInterval := baseInterval * 10
			if currentInterval > maxInterval {
				currentInterval = maxInterval
			}
		}
		return currentInterval
	}
	return baseInterval
}

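// For example, with the default 30m interval: after the first failure the backoff
// delay is still its initial 1s, so the next scheduled scan fires after 1s rather
// than 30m; subsequent failures double it up to the 5m cap set in handleScanError.
// The 10x-interval ceiling here only comes into play when the backoff delay exceeds
// a very short configured interval.
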
// cleanupLoop periodically cleans up old tasks and stale workers
func (mm *MaintenanceManager) cleanupLoop() {
	cleanupInterval := time.Duration(mm.config.CleanupIntervalSeconds) * time.Second
	ticker := time.NewTicker(cleanupInterval)
	defer ticker.Stop()

	for mm.running {
		select {
		case <-mm.stopChan:
			return
		case <-ticker.C:
			mm.performCleanup()
		}
	}
}

// performScan executes a maintenance scan with error handling and backoff
func (mm *MaintenanceManager) performScan() {
	defer func() {
		// Always reset scan in progress flag when done
		mm.mutex.Lock()
		mm.scanInProgress = false
		mm.mutex.Unlock()
	}()

	glog.Infof("Starting maintenance scan...")

	results, err := mm.scanner.ScanForMaintenanceTasks()
	if err != nil {
		// Handle scan error
		mm.mutex.Lock()
		mm.handleScanError(err)
		mm.mutex.Unlock()
		glog.Warningf("Maintenance scan failed: %v", err)
		return
	}

	// Scan succeeded - update state and process results
	mm.handleScanSuccess(results)
}

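// Lock discipline for the scan path: triggerScanInternal sets scanInProgress under
// mutex before launching performScan in a goroutine; handleScanError expects the
// caller to hold mutex, while handleScanSuccess takes and releases it itself so the
// queue is never touched with the manager lock held.
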
// handleScanSuccess processes successful scan results with proper lock management
func (mm *MaintenanceManager) handleScanSuccess(results []*TaskDetectionResult) {
	// Update manager state first
	mm.mutex.Lock()
	mm.resetErrorTracking()
	taskCount := len(results)
	mm.mutex.Unlock()

	if taskCount > 0 {
		// Count tasks by type for logging (outside of lock)
		taskCounts := make(map[MaintenanceTaskType]int)
		for _, result := range results {
			taskCounts[result.TaskType]++
		}

		// Add tasks to queue (no manager lock held)
		mm.queue.AddTasksFromResults(results)

		// Log detailed scan results
		glog.Infof("Maintenance scan completed: found %d tasks", taskCount)
		for taskType, count := range taskCounts {
			glog.Infof("  - %s: %d tasks", taskType, count)
		}
	} else {
		glog.Infof("Maintenance scan completed: no maintenance tasks needed")
	}
}

// handleScanError handles scan errors with exponential backoff and reduced logging
func (mm *MaintenanceManager) handleScanError(err error) {
	now := time.Now()
	mm.errorCount++
	mm.lastError = err
	mm.lastErrorTime = now

	// Use exponential backoff, capped at 5 minutes
	if mm.errorCount > 1 {
		mm.backoffDelay = mm.backoffDelay * 2
		if mm.backoffDelay > 5*time.Minute {
			mm.backoffDelay = 5 * time.Minute // Cap at 5 minutes
		}
	}

	// Reduce log frequency based on error count and time
	shouldLog := false
	if mm.errorCount <= 3 {
		// Log first 3 errors immediately
		shouldLog = true
	} else if mm.errorCount <= 10 && mm.errorCount%3 == 0 {
		// Log every 3rd error for errors 4-10
		shouldLog = true
	} else if mm.errorCount%10 == 0 {
		// Log every 10th error after that
		shouldLog = true
	}

	if shouldLog {
		// Check if it's a connection error to provide better messaging
		if isConnectionError(err) {
			if mm.errorCount == 1 {
				glog.Errorf("Maintenance scan failed: %v (will retry with backoff)", err)
			} else {
				glog.Errorf("Maintenance scan still failing after %d attempts: %v (backoff: %v)",
					mm.errorCount, err, mm.backoffDelay)
			}
		} else {
			glog.Errorf("Maintenance scan failed: %v", err)
		}
	} else {
		// Use debug level for suppressed errors
		glog.V(3).Infof("Maintenance scan failed (error #%d, suppressed): %v", mm.errorCount, err)
	}
}

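// With the values above, repeated failures back off as 1s, 2s, 4s, ... up to the 5m
// cap, and logging thins out from every failure (errors 1-3) to every 3rd error
// (through 10) to every 10th thereafter; suppressed failures remain visible at -v=3.
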
// resetErrorTracking resets error tracking when scan succeeds
func (mm *MaintenanceManager) resetErrorTracking() {
	if mm.errorCount > 0 {
		glog.V(1).Infof("Maintenance scan recovered after %d failed attempts", mm.errorCount)
		mm.errorCount = 0
		mm.lastError = nil
		mm.backoffDelay = time.Second // Reset to initial delay
	}
}

// isConnectionError checks if the error is a connection-related error
func isConnectionError(err error) bool {
	if err == nil {
		return false
	}
	errStr := err.Error()
	return strings.Contains(errStr, "connection refused") ||
		strings.Contains(errStr, "connection error") ||
		strings.Contains(errStr, "dial tcp") ||
		strings.Contains(errStr, "connection timeout") ||
		strings.Contains(errStr, "no route to host") ||
		strings.Contains(errStr, "network unreachable")
}

// performCleanup cleans up old tasks and stale workers
func (mm *MaintenanceManager) performCleanup() {
	glog.V(2).Infof("Starting maintenance cleanup")

	taskRetention := time.Duration(mm.config.TaskRetentionSeconds) * time.Second
	workerTimeout := time.Duration(mm.config.WorkerTimeoutSeconds) * time.Second

	removedTasks := mm.queue.CleanupOldTasks(taskRetention)
	removedWorkers := mm.queue.RemoveStaleWorkers(workerTimeout)

	// Clean up stale pending operations (operations running for more than 4 hours)
	staleOperationTimeout := 4 * time.Hour
	removedOperations := 0
	if mm.scanner != nil && mm.scanner.integration != nil {
		pendingOps := mm.scanner.integration.GetPendingOperations()
		if pendingOps != nil {
			removedOperations = pendingOps.CleanupStaleOperations(staleOperationTimeout)
		}
	}

	if removedTasks > 0 || removedWorkers > 0 || removedOperations > 0 {
		glog.V(1).Infof("Cleanup completed: removed %d old tasks, %d stale workers, and %d stale operations",
			removedTasks, removedWorkers, removedOperations)
	}
}

// GetQueue returns the maintenance queue
func (mm *MaintenanceManager) GetQueue() *MaintenanceQueue {
	return mm.queue
}

// GetConfig returns the maintenance configuration
func (mm *MaintenanceManager) GetConfig() *MaintenanceConfig {
	return mm.config
}

// GetStats returns maintenance statistics
func (mm *MaintenanceManager) GetStats() *MaintenanceStats {
	stats := mm.queue.GetStats()

	mm.mutex.RLock()
	defer mm.mutex.RUnlock()

	stats.LastScanTime = time.Now() // Would need to track this properly

	// Calculate next scan time based on current error state
	scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
	nextScanInterval := scanInterval
	if mm.errorCount > 0 {
		nextScanInterval = mm.backoffDelay
		maxInterval := scanInterval * 10
		if nextScanInterval > maxInterval {
			nextScanInterval = maxInterval
		}
	}
	stats.NextScanTime = time.Now().Add(nextScanInterval)

	return stats
}

// ReloadTaskConfigurations reloads task configurations from the current policy
func (mm *MaintenanceManager) ReloadTaskConfigurations() error {
	mm.mutex.Lock()
	defer mm.mutex.Unlock()

	// Trigger configuration reload in the integration layer
	if mm.scanner != nil && mm.scanner.integration != nil {
		mm.scanner.integration.ConfigureTasksFromPolicy()
		glog.V(1).Infof("Task configurations reloaded from policy")
		return nil
	}

	return fmt.Errorf("integration not available for configuration reload")
}

// GetErrorState returns the current error state for monitoring
func (mm *MaintenanceManager) GetErrorState() (errorCount int, lastError error, backoffDelay time.Duration) {
	mm.mutex.RLock()
	defer mm.mutex.RUnlock()
	return mm.errorCount, mm.lastError, mm.backoffDelay
}

// GetTasks returns tasks with filtering
func (mm *MaintenanceManager) GetTasks(status MaintenanceTaskStatus, taskType MaintenanceTaskType, limit int) []*MaintenanceTask {
	return mm.queue.GetTasks(status, taskType, limit)
}

// GetWorkers returns all registered workers
func (mm *MaintenanceManager) GetWorkers() []*MaintenanceWorker {
	return mm.queue.GetWorkers()
}

// TriggerScan manually triggers a maintenance scan
func (mm *MaintenanceManager) TriggerScan() error {
	return mm.triggerScanInternal(true)
}

// triggerScanInternal handles both manual and automatic scan triggers
func (mm *MaintenanceManager) triggerScanInternal(isManual bool) error {
	if !mm.running {
		return fmt.Errorf("maintenance manager is not running")
	}

	// Prevent multiple concurrent scans
	mm.mutex.Lock()
	if mm.scanInProgress {
		mm.mutex.Unlock()
		if isManual {
			glog.V(1).Infof("Manual scan already in progress, ignoring trigger request")
		} else {
			glog.V(2).Infof("Automatic scan already in progress, ignoring scheduled scan")
		}
		return fmt.Errorf("scan already in progress")
	}
	mm.scanInProgress = true
	mm.mutex.Unlock()

	go mm.performScan()
	return nil
}

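// Example (sketch): a caller wiring a "scan now" action would simply do
//
//	if err := manager.TriggerScan(); err != nil {
//		// either the manager is stopped or a scan is already running
//		glog.V(1).Infof("scan not started: %v", err)
//	}
//
// Because the scan itself runs in a goroutine, a nil return only means the scan
// was started, not that it completed.
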
// UpdateConfig updates the maintenance configuration
func (mm *MaintenanceManager) UpdateConfig(config *MaintenanceConfig) error {
	if config == nil {
		return fmt.Errorf("config cannot be nil")
	}

	mm.config = config
	mm.queue.policy = config.Policy
	mm.scanner.policy = config.Policy

	glog.V(1).Infof("Maintenance configuration updated")
	return nil
}

// CancelTask cancels a pending task
func (mm *MaintenanceManager) CancelTask(taskID string) error {
	mm.queue.mutex.Lock()
	defer mm.queue.mutex.Unlock()

	task, exists := mm.queue.tasks[taskID]
	if !exists {
		return fmt.Errorf("task %s not found", taskID)
	}

	if task.Status == TaskStatusPending {
		task.Status = TaskStatusCancelled
		completedAt := time.Now()
		task.CompletedAt = &completedAt

		// Remove from pending tasks
		for i, pendingTask := range mm.queue.pendingTasks {
			if pendingTask.ID == taskID {
				mm.queue.pendingTasks = append(mm.queue.pendingTasks[:i], mm.queue.pendingTasks[i+1:]...)
				break
			}
		}

		glog.V(2).Infof("Cancelled task %s", taskID)
		return nil
	}

	return fmt.Errorf("task %s cannot be cancelled (status: %s)", taskID, task.Status)
}

// RegisterWorker registers a new worker
func (mm *MaintenanceManager) RegisterWorker(worker *MaintenanceWorker) {
	mm.queue.RegisterWorker(worker)
}

// GetNextTask returns the next task for a worker
func (mm *MaintenanceManager) GetNextTask(workerID string, capabilities []MaintenanceTaskType) *MaintenanceTask {
	return mm.queue.GetNextTask(workerID, capabilities)
}

// CompleteTask marks a task as completed
func (mm *MaintenanceManager) CompleteTask(taskID string, errorMsg string) {
	mm.queue.CompleteTask(taskID, errorMsg)
}

// UpdateTaskProgress updates task progress
func (mm *MaintenanceManager) UpdateTaskProgress(taskID string, progress float64) {
	mm.queue.UpdateTaskProgress(taskID, progress)
}

// UpdateWorkerHeartbeat updates worker heartbeat
func (mm *MaintenanceManager) UpdateWorkerHeartbeat(workerID string) {
	mm.queue.UpdateWorkerHeartbeat(workerID)
}