diff --git a/.gitignore b/.gitignore index b330bbd96..357e2911a 100644 --- a/.gitignore +++ b/.gitignore @@ -115,3 +115,4 @@ test/s3/versioning/weed-test.log /docker/admin_integration/data docker/agent_pub_record docker/admin_integration/weed-local +docker/admin_integration/ec_test_files.json diff --git a/weed/admin/maintenance/maintenance_manager.go b/weed/admin/maintenance/maintenance_manager.go index 62da87e5a..ce0c1a346 100644 --- a/weed/admin/maintenance/maintenance_manager.go +++ b/weed/admin/maintenance/maintenance_manager.go @@ -381,6 +381,43 @@ func (mm *MaintenanceManager) GetConfig() *MaintenanceConfig { // GetStats returns maintenance statistics func (mm *MaintenanceManager) GetStats() *MaintenanceStats { + // Quick check if scan is in progress - return cached/fast stats to prevent hanging + mm.mutex.RLock() + scanInProgress := mm.scanInProgress + mm.mutex.RUnlock() + + if scanInProgress { + glog.V(2).Infof("Scan in progress, returning fast stats to prevent hanging") + // Return basic stats without calling potentially blocking operations + stats := &MaintenanceStats{ + TotalTasks: 0, + TasksByStatus: make(map[MaintenanceTaskStatus]int), + TasksByType: make(map[MaintenanceTaskType]int), + ActiveWorkers: 0, + CompletedToday: 0, + FailedToday: 0, + AverageTaskTime: 0, + LastScanTime: time.Now().Add(-time.Minute), // Assume recent scan + } + + mm.mutex.RLock() + // Calculate next scan time based on current error state + scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second + nextScanInterval := scanInterval + if mm.errorCount > 0 { + nextScanInterval = mm.backoffDelay + maxInterval := scanInterval * 10 + if nextScanInterval > maxInterval { + nextScanInterval = maxInterval + } + } + stats.NextScanTime = time.Now().Add(nextScanInterval) + mm.mutex.RUnlock() + + return stats + } + + // Normal path - get full stats from queue stats := mm.queue.GetStats() mm.mutex.RLock() diff --git a/weed/admin/maintenance/maintenance_scanner.go b/weed/admin/maintenance/maintenance_scanner.go index 627297b2b..9b130bf17 100644 --- a/weed/admin/maintenance/maintenance_scanner.go +++ b/weed/admin/maintenance/maintenance_scanner.go @@ -76,9 +76,14 @@ func (ms *MaintenanceScanner) getVolumeHealthMetrics() ([]*VolumeHealthMetrics, var metrics []*VolumeHealthMetrics glog.V(1).Infof("Collecting volume health metrics from master") + + // Add timeout protection to prevent hanging + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + err := ms.adminClient.WithMasterClient(func(client master_pb.SeaweedClient) error { - resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + resp, err := client.VolumeList(ctx, &master_pb.VolumeListRequest{}) if err != nil { return err } @@ -229,8 +234,12 @@ func (ms *MaintenanceScanner) enrichVolumeMetrics(metrics *[]*VolumeHealthMetric func (ms *MaintenanceScanner) getECVolumeSet() map[uint32]bool { ecVolumeSet := make(map[uint32]bool) + // Add timeout protection to prevent hanging + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + err := ms.adminClient.WithMasterClient(func(client master_pb.SeaweedClient) error { - resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + resp, err := client.VolumeList(ctx, &master_pb.VolumeListRequest{}) if err != nil { return err } @@ -267,8 +276,12 @@ func (ms *MaintenanceScanner) createECVolumeMetric(volumeID uint32) *VolumeHealt var metric *VolumeHealthMetrics var serverWithShards string + // Add timeout protection to prevent hanging + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + err := ms.adminClient.WithMasterClient(func(client master_pb.SeaweedClient) error { - resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + resp, err := client.VolumeList(ctx, &master_pb.VolumeListRequest{}) if err != nil { return err } @@ -401,8 +414,12 @@ func (ms *MaintenanceScanner) enrichECVolumeWithDeletionInfo(metric *VolumeHealt func (ms *MaintenanceScanner) findServersWithECShards(volumeId uint32) ([]string, error) { var serversWithShards []string + // Add timeout protection to prevent hanging + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + err := ms.adminClient.WithMasterClient(func(client master_pb.SeaweedClient) error { - resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + resp, err := client.VolumeList(ctx, &master_pb.VolumeListRequest{}) if err != nil { return err }