package maintenance

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
)

// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy
func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy {
	policy := &worker_pb.MaintenancePolicy{
		GlobalMaxConcurrent:          4,
		DefaultRepeatIntervalSeconds: 6 * 3600,  // 6 hours in seconds
		DefaultCheckIntervalSeconds:  12 * 3600, // 12 hours in seconds
		TaskPolicies:                 make(map[string]*worker_pb.TaskPolicy),
	}

	// Load vacuum task configuration
	if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil {
		policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{
			Enabled:               vacuumConfig.Enabled,
			MaxConcurrent:         int32(vacuumConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(vacuumConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
				VacuumConfig: &worker_pb.VacuumTaskConfig{
					GarbageThreshold:   float64(vacuumConfig.GarbageThreshold),
					MinVolumeAgeHours:  int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours
					MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds),
				},
			},
		}
	}

	// Load erasure coding task configuration
	if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil {
		policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{
			Enabled:               ecConfig.Enabled,
			MaxConcurrent:         int32(ecConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(ecConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
				ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
					FullnessRatio:    float64(ecConfig.FullnessRatio),
					QuietForSeconds:  int32(ecConfig.QuietForSeconds),
					MinVolumeSizeMb:  int32(ecConfig.MinSizeMB),
					CollectionFilter: ecConfig.CollectionFilter,
				},
			},
		}
	}

	// Load balance task configuration
	if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil {
		policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{
			Enabled:               balanceConfig.Enabled,
			MaxConcurrent:         int32(balanceConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(balanceConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
				BalanceConfig: &worker_pb.BalanceTaskConfig{
					ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold),
					MinServerCount:     int32(balanceConfig.MinServerCount),
				},
			},
		}
	}

	glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies))
	return policy
}

// MaintenanceManager coordinates the maintenance system
type MaintenanceManager struct {
	config      *MaintenanceConfig
	scanner     *MaintenanceScanner
	queue       *MaintenanceQueue
	adminClient AdminClient
	running     bool
	stopChan    chan struct{}

	// Error handling and backoff
	errorCount     int
	lastError      error
	lastErrorTime  time.Time
	backoffDelay   time.Duration
	mutex          sync.RWMutex
	scanInProgress bool
}

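// Note on concurrency: errorCount, lastError, lastErrorTime, backoffDelay, and
// scanInProgress are shared between the background scan loop and external callers
// and are guarded by mutex; the remaining fields are wired up in NewMaintenanceManager.
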
// NewMaintenanceManager creates a new maintenance manager
func NewMaintenanceManager(adminClient AdminClient, config *MaintenanceConfig) *MaintenanceManager {
	if config == nil {
		config = DefaultMaintenanceConfig()
	}

	// Use the policy from the config (which is populated from separate task files in LoadMaintenanceConfig)
	policy := config.Policy
	if policy == nil {
		// Fallback: build policy from separate task configuration files if not already populated
		policy = buildPolicyFromTaskConfigs()
	}

	queue := NewMaintenanceQueue(policy)
	scanner := NewMaintenanceScanner(adminClient, policy, queue)

	return &MaintenanceManager{
		config:       config,
		scanner:      scanner,
		queue:        queue,
		adminClient:  adminClient,
		stopChan:     make(chan struct{}),
		backoffDelay: time.Second, // Start with 1 second backoff
	}
}

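// Example usage (sketch only; adminClient and config are assumed to be provided
// by the caller, e.g. an AdminClient implementation and a loaded MaintenanceConfig):
//
//	manager := NewMaintenanceManager(adminClient, config)
//	if err := manager.Start(); err != nil {
//		glog.Fatalf("failed to start maintenance manager: %v", err)
//	}
//	defer manager.Stop()
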
// Start begins the maintenance manager
func (mm *MaintenanceManager) Start() error {
	if !mm.config.Enabled {
		glog.V(1).Infof("Maintenance system is disabled")
		return nil
	}

	// Validate configuration durations to prevent ticker panics
	if err := mm.validateConfig(); err != nil {
		return fmt.Errorf("invalid maintenance configuration: %w", err)
	}

	mm.running = true

	// Start background processes
	go mm.scanLoop()
	go mm.cleanupLoop()

	glog.Infof("Maintenance manager started with scan interval %ds", mm.config.ScanIntervalSeconds)
	return nil
}

// validateConfig validates the maintenance configuration durations
func (mm *MaintenanceManager) validateConfig() error {
	if mm.config.ScanIntervalSeconds <= 0 {
		glog.Warningf("Invalid scan interval %ds, using default 30m", mm.config.ScanIntervalSeconds)
		mm.config.ScanIntervalSeconds = 30 * 60 // 30 minutes in seconds
	}
	if mm.config.CleanupIntervalSeconds <= 0 {
		glog.Warningf("Invalid cleanup interval %ds, using default 24h", mm.config.CleanupIntervalSeconds)
		mm.config.CleanupIntervalSeconds = 24 * 60 * 60 // 24 hours in seconds
	}
	if mm.config.WorkerTimeoutSeconds <= 0 {
		glog.Warningf("Invalid worker timeout %ds, using default 5m", mm.config.WorkerTimeoutSeconds)
		mm.config.WorkerTimeoutSeconds = 5 * 60 // 5 minutes in seconds
	}
	if mm.config.TaskTimeoutSeconds <= 0 {
		glog.Warningf("Invalid task timeout %ds, using default 2h", mm.config.TaskTimeoutSeconds)
		mm.config.TaskTimeoutSeconds = 2 * 60 * 60 // 2 hours in seconds
	}
	if mm.config.RetryDelaySeconds <= 0 {
		glog.Warningf("Invalid retry delay %ds, using default 15m", mm.config.RetryDelaySeconds)
		mm.config.RetryDelaySeconds = 15 * 60 // 15 minutes in seconds
	}
	if mm.config.TaskRetentionSeconds <= 0 {
		glog.Warningf("Invalid task retention %ds, using default 168h", mm.config.TaskRetentionSeconds)
		mm.config.TaskRetentionSeconds = 7 * 24 * 60 * 60 // 7 days in seconds
	}
	return nil
}

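// Note that validateConfig repairs non-positive durations in place (scan 30m,
// cleanup 24h, worker timeout 5m, task timeout 2h, retry delay 15m, retention 7d)
// and currently always returns nil; the error return is kept so callers such as
// Start can treat a future validation failure as fatal.
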
// IsRunning returns whether the maintenance manager is currently running
func (mm *MaintenanceManager) IsRunning() bool {
	return mm.running
}

// Stop terminates the maintenance manager
func (mm *MaintenanceManager) Stop() {
	mm.running = false
	close(mm.stopChan)
	glog.Infof("Maintenance manager stopped")
}

// scanLoop periodically scans for maintenance tasks with adaptive timing
func (mm *MaintenanceManager) scanLoop() {
	scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
	ticker := time.NewTicker(scanInterval)
	defer ticker.Stop()

	for mm.running {
		select {
		case <-mm.stopChan:
			return
		case <-ticker.C:
			glog.V(1).Infof("Performing scheduled maintenance scan (interval %v)", scanInterval)

			// Use the same synchronization as TriggerScan to prevent concurrent scans
			if err := mm.triggerScanInternal(false); err != nil {
				glog.V(1).Infof("Scheduled scan skipped: %v", err)
			}

			// Adjust ticker interval based on error state (read error state safely)
			currentInterval := mm.getScanInterval(scanInterval)

			// Reset ticker with new interval if needed
			if currentInterval != scanInterval {
				ticker.Stop()
				ticker = time.NewTicker(currentInterval)
			}
		}
	}
}

// getScanInterval safely reads the current scan interval with error backoff
func (mm *MaintenanceManager) getScanInterval(baseInterval time.Duration) time.Duration {
	mm.mutex.RLock()
	defer mm.mutex.RUnlock()

	if mm.errorCount > 0 {
		// Use backoff delay when there are errors
		currentInterval := mm.backoffDelay
		if currentInterval > baseInterval {
			// Don't make it longer than the configured interval * 10
			maxInterval := baseInterval * 10
			if currentInterval > maxInterval {
				currentInterval = maxInterval
			}
		}
		return currentInterval
	}
	return baseInterval
}

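// For example, with the default 30m interval: after the first failure the backoff
// delay is still its initial 1s, so the next scheduled scan fires after 1s rather
// than 30m; subsequent failures double it up to the 5m cap set in handleScanError.
// The 10x-interval ceiling here only comes into play when the backoff delay exceeds
// a very short configured interval.
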
// cleanupLoop periodically cleans up old tasks and stale workers
func (mm *MaintenanceManager) cleanupLoop() {
	cleanupInterval := time.Duration(mm.config.CleanupIntervalSeconds) * time.Second
	ticker := time.NewTicker(cleanupInterval)
	defer ticker.Stop()

	for mm.running {
		select {
		case <-mm.stopChan:
			return
		case <-ticker.C:
			mm.performCleanup()
		}
	}
}

// performScan executes a maintenance scan with error handling and backoff
func (mm *MaintenanceManager) performScan() {
	defer func() {
		// Always reset scan in progress flag when done
		mm.mutex.Lock()
		mm.scanInProgress = false
		mm.mutex.Unlock()
	}()

	glog.Infof("Starting maintenance scan...")

	results, err := mm.scanner.ScanForMaintenanceTasks()
	if err != nil {
		// Handle scan error
		mm.mutex.Lock()
		mm.handleScanError(err)
		mm.mutex.Unlock()
		glog.Warningf("Maintenance scan failed: %v", err)
		return
	}

	// Scan succeeded - update state and process results
	mm.handleScanSuccess(results)
}

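// Lock discipline for the scan path: triggerScanInternal sets scanInProgress under
// mutex before launching performScan in a goroutine; handleScanError expects the
// caller to hold mutex, while handleScanSuccess takes and releases it itself so the
// queue is never touched with the manager lock held.
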
// handleScanSuccess processes successful scan results with proper lock management
func (mm *MaintenanceManager) handleScanSuccess(results []*TaskDetectionResult) {
	// Update manager state first
	mm.mutex.Lock()
	mm.resetErrorTracking()
	taskCount := len(results)
	mm.mutex.Unlock()

	if taskCount > 0 {
		// Count tasks by type for logging (outside of lock)
		taskCounts := make(map[MaintenanceTaskType]int)
		for _, result := range results {
			taskCounts[result.TaskType]++
		}

		// Add tasks to queue (no manager lock held)
		mm.queue.AddTasksFromResults(results)

		// Log detailed scan results
		glog.Infof("Maintenance scan completed: found %d tasks", taskCount)
		for taskType, count := range taskCounts {
			glog.Infof("  - %s: %d tasks", taskType, count)
		}
	} else {
		glog.Infof("Maintenance scan completed: no maintenance tasks needed")
	}
}

// handleScanError handles scan errors with exponential backoff and reduced logging
func (mm *MaintenanceManager) handleScanError(err error) {
	now := time.Now()
	mm.errorCount++
	mm.lastError = err
	mm.lastErrorTime = now

	// Use exponential backoff, capped at 5 minutes
	if mm.errorCount > 1 {
		mm.backoffDelay = mm.backoffDelay * 2
		if mm.backoffDelay > 5*time.Minute {
			mm.backoffDelay = 5 * time.Minute // Cap at 5 minutes
		}
	}

	// Reduce log frequency based on error count and time
	shouldLog := false
	if mm.errorCount <= 3 {
		// Log first 3 errors immediately
		shouldLog = true
	} else if mm.errorCount <= 10 && mm.errorCount%3 == 0 {
		// Log every 3rd error for errors 4-10
		shouldLog = true
	} else if mm.errorCount%10 == 0 {
		// Log every 10th error after that
		shouldLog = true
	}

	if shouldLog {
		// Check if it's a connection error to provide better messaging
		if isConnectionError(err) {
			if mm.errorCount == 1 {
				glog.Errorf("Maintenance scan failed: %v (will retry with backoff)", err)
			} else {
				glog.Errorf("Maintenance scan still failing after %d attempts: %v (backoff: %v)",
					mm.errorCount, err, mm.backoffDelay)
			}
		} else {
			glog.Errorf("Maintenance scan failed: %v", err)
		}
	} else {
		// Use debug level for suppressed errors
		glog.V(3).Infof("Maintenance scan failed (error #%d, suppressed): %v", mm.errorCount, err)
	}
}

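// With the values above, repeated failures back off as 1s, 2s, 4s, ... up to the 5m
// cap, and logging thins out from every failure (errors 1-3) to every 3rd error
// (through 10) to every 10th thereafter; suppressed failures remain visible at -v=3.
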
// resetErrorTracking resets error tracking when scan succeeds
func (mm *MaintenanceManager) resetErrorTracking() {
	if mm.errorCount > 0 {
		glog.V(1).Infof("Maintenance scan recovered after %d failed attempts", mm.errorCount)
		mm.errorCount = 0
		mm.lastError = nil
		mm.backoffDelay = time.Second // Reset to initial delay
	}
}

// isConnectionError checks if the error is a connection-related error
func isConnectionError(err error) bool {
	if err == nil {
		return false
	}
	errStr := err.Error()
	return strings.Contains(errStr, "connection refused") ||
		strings.Contains(errStr, "connection error") ||
		strings.Contains(errStr, "dial tcp") ||
		strings.Contains(errStr, "connection timeout") ||
		strings.Contains(errStr, "no route to host") ||
		strings.Contains(errStr, "network unreachable")
}

// performCleanup cleans up old tasks and stale workers
func (mm *MaintenanceManager) performCleanup() {
	glog.V(2).Infof("Starting maintenance cleanup")

	taskRetention := time.Duration(mm.config.TaskRetentionSeconds) * time.Second
	workerTimeout := time.Duration(mm.config.WorkerTimeoutSeconds) * time.Second

	removedTasks := mm.queue.CleanupOldTasks(taskRetention)
	removedWorkers := mm.queue.RemoveStaleWorkers(workerTimeout)

	// Clean up stale pending operations (operations running for more than 4 hours)
	staleOperationTimeout := 4 * time.Hour
	removedOperations := 0
	if mm.scanner != nil && mm.scanner.integration != nil {
		pendingOps := mm.scanner.integration.GetPendingOperations()
		if pendingOps != nil {
			removedOperations = pendingOps.CleanupStaleOperations(staleOperationTimeout)
		}
	}

	if removedTasks > 0 || removedWorkers > 0 || removedOperations > 0 {
		glog.V(1).Infof("Cleanup completed: removed %d old tasks, %d stale workers, and %d stale operations",
			removedTasks, removedWorkers, removedOperations)
	}
}

// GetQueue returns the maintenance queue
func (mm *MaintenanceManager) GetQueue() *MaintenanceQueue {
	return mm.queue
}

// GetConfig returns the maintenance configuration
func (mm *MaintenanceManager) GetConfig() *MaintenanceConfig {
	return mm.config
}

// GetStats returns maintenance statistics
func (mm *MaintenanceManager) GetStats() *MaintenanceStats {
	stats := mm.queue.GetStats()

	mm.mutex.RLock()
	defer mm.mutex.RUnlock()

	stats.LastScanTime = time.Now() // Would need to track this properly

	// Calculate next scan time based on current error state
	scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
	nextScanInterval := scanInterval
	if mm.errorCount > 0 {
		nextScanInterval = mm.backoffDelay
		maxInterval := scanInterval * 10
		if nextScanInterval > maxInterval {
			nextScanInterval = maxInterval
		}
	}
	stats.NextScanTime = time.Now().Add(nextScanInterval)

	return stats
}

// ReloadTaskConfigurations reloads task configurations from the current policy
func (mm *MaintenanceManager) ReloadTaskConfigurations() error {
	mm.mutex.Lock()
	defer mm.mutex.Unlock()

	// Trigger configuration reload in the integration layer
	if mm.scanner != nil && mm.scanner.integration != nil {
		mm.scanner.integration.ConfigureTasksFromPolicy()
		glog.V(1).Infof("Task configurations reloaded from policy")
		return nil
	}

	return fmt.Errorf("integration not available for configuration reload")
}

// GetErrorState returns the current error state for monitoring
func (mm *MaintenanceManager) GetErrorState() (errorCount int, lastError error, backoffDelay time.Duration) {
	mm.mutex.RLock()
	defer mm.mutex.RUnlock()
	return mm.errorCount, mm.lastError, mm.backoffDelay
}

// GetTasks returns tasks with filtering
func (mm *MaintenanceManager) GetTasks(status MaintenanceTaskStatus, taskType MaintenanceTaskType, limit int) []*MaintenanceTask {
	return mm.queue.GetTasks(status, taskType, limit)
}

// GetWorkers returns all registered workers
func (mm *MaintenanceManager) GetWorkers() []*MaintenanceWorker {
	return mm.queue.GetWorkers()
}

// TriggerScan manually triggers a maintenance scan
func (mm *MaintenanceManager) TriggerScan() error {
	return mm.triggerScanInternal(true)
}

// triggerScanInternal handles both manual and automatic scan triggers
func (mm *MaintenanceManager) triggerScanInternal(isManual bool) error {
	if !mm.running {
		return fmt.Errorf("maintenance manager is not running")
	}

	// Prevent multiple concurrent scans
	mm.mutex.Lock()
	if mm.scanInProgress {
		mm.mutex.Unlock()
		if isManual {
			glog.V(1).Infof("Manual scan already in progress, ignoring trigger request")
		} else {
			glog.V(2).Infof("Automatic scan already in progress, ignoring scheduled scan")
		}
		return fmt.Errorf("scan already in progress")
	}
	mm.scanInProgress = true
	mm.mutex.Unlock()

	go mm.performScan()
	return nil
}

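// Example (sketch): a caller wiring a "scan now" action would simply do
//
//	if err := manager.TriggerScan(); err != nil {
//		// either the manager is stopped or a scan is already running
//		glog.V(1).Infof("scan not started: %v", err)
//	}
//
// Because the scan itself runs in a goroutine, a nil return only means the scan
// was started, not that it completed.
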
// UpdateConfig updates the maintenance configuration
func (mm *MaintenanceManager) UpdateConfig(config *MaintenanceConfig) error {
	if config == nil {
		return fmt.Errorf("config cannot be nil")
	}

	mm.config = config
	mm.queue.policy = config.Policy
	mm.scanner.policy = config.Policy

	glog.V(1).Infof("Maintenance configuration updated")
	return nil
}

// CancelTask cancels a pending task
func (mm *MaintenanceManager) CancelTask(taskID string) error {
	mm.queue.mutex.Lock()
	defer mm.queue.mutex.Unlock()

	task, exists := mm.queue.tasks[taskID]
	if !exists {
		return fmt.Errorf("task %s not found", taskID)
	}

	if task.Status == TaskStatusPending {
		task.Status = TaskStatusCancelled
		completedAt := time.Now()
		task.CompletedAt = &completedAt

		// Remove from pending tasks
		for i, pendingTask := range mm.queue.pendingTasks {
			if pendingTask.ID == taskID {
				mm.queue.pendingTasks = append(mm.queue.pendingTasks[:i], mm.queue.pendingTasks[i+1:]...)
				break
			}
		}

		glog.V(2).Infof("Cancelled task %s", taskID)
		return nil
	}

	return fmt.Errorf("task %s cannot be cancelled (status: %s)", taskID, task.Status)
}

// RegisterWorker registers a new worker
func (mm *MaintenanceManager) RegisterWorker(worker *MaintenanceWorker) {
	mm.queue.RegisterWorker(worker)
}

// GetNextTask returns the next task for a worker
func (mm *MaintenanceManager) GetNextTask(workerID string, capabilities []MaintenanceTaskType) *MaintenanceTask {
	return mm.queue.GetNextTask(workerID, capabilities)
}

// CompleteTask marks a task as completed
func (mm *MaintenanceManager) CompleteTask(taskID string, errorMsg string) {
	mm.queue.CompleteTask(taskID, errorMsg)
}

// UpdateTaskProgress updates task progress
func (mm *MaintenanceManager) UpdateTaskProgress(taskID string, progress float64) {
	mm.queue.UpdateTaskProgress(taskID, progress)
}

// UpdateWorkerHeartbeat updates worker heartbeat
func (mm *MaintenanceManager) UpdateWorkerHeartbeat(workerID string) {
	mm.queue.UpdateWorkerHeartbeat(workerID)
}