23 changed files with 2 additions and 2411 deletions

- 208  weed/admin/dash/config_persistence.go
- 20   weed/admin/handlers/maintenance_handlers.go
- 164  weed/admin/handlers/maintenance_handlers_test.go
- 35   weed/admin/maintenance/maintenance_manager.go
- 2    weed/admin/maintenance/maintenance_worker.go
- 2    weed/admin/view/layout/menu_helper.go
- 2    weed/command/worker.go
- 267  weed/worker/tasks/balance/balance_task.go
- 170  weed/worker/tasks/balance/config.go
- 272  weed/worker/tasks/balance/detection.go
- 158  weed/worker/tasks/balance/execution.go
- 138  weed/worker/tasks/balance/monitoring.go
- 86   weed/worker/tasks/balance/register.go
- 37   weed/worker/tasks/balance/scheduling.go
- 7    weed/worker/tasks/base/volume_utils.go
- 190  weed/worker/tasks/vacuum/config.go
- 133  weed/worker/tasks/vacuum/detection.go
- 151  weed/worker/tasks/vacuum/monitoring.go
- 86   weed/worker/tasks/vacuum/register.go
- 37   weed/worker/tasks/vacuum/scheduling.go
- 244  weed/worker/tasks/vacuum/vacuum_task.go
- 2    weed/worker/types/task_types.go
- 2    weed/worker/worker.go
weed/worker/tasks/balance/balance_task.go
@@ -1,267 +0,0 @@
package balance

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/operation"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
	"github.com/seaweedfs/seaweedfs/weed/util"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
	"github.com/seaweedfs/seaweedfs/weed/worker/types/base"
	"google.golang.org/grpc"
)

// BalanceTask implements the Task interface
type BalanceTask struct {
	*base.BaseTask
	server     string
	volumeID   uint32
	collection string
	progress   float64
}

// NewBalanceTask creates a new balance task instance
func NewBalanceTask(id string, server string, volumeID uint32, collection string) *BalanceTask {
	return &BalanceTask{
		BaseTask:   base.NewBaseTask(id, types.TaskTypeBalance),
		server:     server,
		volumeID:   volumeID,
		collection: collection,
	}
}

// Execute implements the Task interface
func (t *BalanceTask) Execute(ctx context.Context, params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}

	balanceParams := params.GetBalanceParams()
	if balanceParams == nil {
		return fmt.Errorf("balance parameters are required")
	}

	// Get source and destination from unified arrays
	if len(params.Sources) == 0 {
		return fmt.Errorf("source is required for balance task")
	}
	if len(params.Targets) == 0 {
		return fmt.Errorf("target is required for balance task")
	}

	sourceNode := params.Sources[0].Node
	destNode := params.Targets[0].Node

	if sourceNode == "" {
		return fmt.Errorf("source node is required for balance task")
	}
	if destNode == "" {
		return fmt.Errorf("destination node is required for balance task")
	}

	t.GetLogger().WithFields(map[string]interface{}{
		"volume_id":   t.volumeID,
		"source":      sourceNode,
		"destination": destNode,
		"collection":  t.collection,
	}).Info("Starting balance task - moving volume")

	sourceServer := pb.ServerAddress(sourceNode)
	targetServer := pb.ServerAddress(destNode)
	volumeId := needle.VolumeId(t.volumeID)

	// Step 1: Mark volume readonly
	t.ReportProgress(10.0)
	t.GetLogger().Info("Marking volume readonly for move")
	if err := t.markVolumeReadonly(sourceServer, volumeId); err != nil {
		return fmt.Errorf("failed to mark volume readonly: %v", err)
	}

	// Step 2: Copy volume to destination
	t.ReportProgress(20.0)
	t.GetLogger().Info("Copying volume to destination")
	lastAppendAtNs, err := t.copyVolume(sourceServer, targetServer, volumeId)
	if err != nil {
		return fmt.Errorf("failed to copy volume: %v", err)
	}

	// Step 3: Mount volume on target and mark it readonly
	t.ReportProgress(60.0)
	t.GetLogger().Info("Mounting volume on target server")
	if err := t.mountVolume(targetServer, volumeId); err != nil {
		return fmt.Errorf("failed to mount volume on target: %v", err)
	}

	// Step 4: Tail for updates
	t.ReportProgress(70.0)
	t.GetLogger().Info("Syncing final updates")
	if err := t.tailVolume(sourceServer, targetServer, volumeId, lastAppendAtNs); err != nil {
		glog.Warningf("Tail operation failed (may be normal): %v", err)
	}

	// Step 5: Unmount from source
	t.ReportProgress(85.0)
	t.GetLogger().Info("Unmounting volume from source server")
	if err := t.unmountVolume(sourceServer, volumeId); err != nil {
		return fmt.Errorf("failed to unmount volume from source: %v", err)
	}

	// Step 6: Delete from source
	t.ReportProgress(95.0)
	t.GetLogger().Info("Deleting volume from source server")
	if err := t.deleteVolume(sourceServer, volumeId); err != nil {
		return fmt.Errorf("failed to delete volume from source: %v", err)
	}

	t.ReportProgress(100.0)
	glog.Infof("Balance task completed successfully: volume %d moved from %s to %s",
		t.volumeID, t.server, destNode)
	return nil
}

// Validate implements the UnifiedTask interface
func (t *BalanceTask) Validate(params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}

	balanceParams := params.GetBalanceParams()
	if balanceParams == nil {
		return fmt.Errorf("balance parameters are required")
	}

	if params.VolumeId != t.volumeID {
		return fmt.Errorf("volume ID mismatch: expected %d, got %d", t.volumeID, params.VolumeId)
	}

	// Validate that at least one source matches our server
	found := false
	for _, source := range params.Sources {
		if source.Node == t.server {
			found = true
			break
		}
	}
	if !found {
		return fmt.Errorf("no source matches expected server %s", t.server)
	}

	return nil
}

// EstimateTime implements the UnifiedTask interface
func (t *BalanceTask) EstimateTime(params *worker_pb.TaskParams) time.Duration {
	// Basic estimate based on simulated steps
	return 14 * time.Second // Sum of all step durations
}

// GetProgress returns current progress
func (t *BalanceTask) GetProgress() float64 {
	return t.progress
}

// Helper methods for real balance operations

// markVolumeReadonly marks the volume readonly
func (t *BalanceTask) markVolumeReadonly(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeMarkReadonly(context.Background(), &volume_server_pb.VolumeMarkReadonlyRequest{
				VolumeId: uint32(volumeId),
			})
			return err
		})
}

// copyVolume copies volume from source to target server
func (t *BalanceTask) copyVolume(sourceServer, targetServer pb.ServerAddress, volumeId needle.VolumeId) (uint64, error) {
	var lastAppendAtNs uint64

	err := operation.WithVolumeServerClient(true, targetServer, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			stream, err := client.VolumeCopy(context.Background(), &volume_server_pb.VolumeCopyRequest{
				VolumeId:       uint32(volumeId),
				SourceDataNode: string(sourceServer),
			})
			if err != nil {
				return err
			}

			for {
				resp, recvErr := stream.Recv()
				if recvErr != nil {
					if recvErr == io.EOF {
						break
					}
					return recvErr
				}

				if resp.LastAppendAtNs != 0 {
					lastAppendAtNs = resp.LastAppendAtNs
				} else {
					// Report copy progress
					glog.V(1).Infof("Volume %d copy progress: %s", volumeId,
						util.BytesToHumanReadable(uint64(resp.ProcessedBytes)))
				}
			}

			return nil
		})

	return lastAppendAtNs, err
}

// mountVolume mounts the volume on the target server
func (t *BalanceTask) mountVolume(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeMount(context.Background(), &volume_server_pb.VolumeMountRequest{
				VolumeId: uint32(volumeId),
			})
			return err
		})
}

// tailVolume syncs remaining updates from source to target
func (t *BalanceTask) tailVolume(sourceServer, targetServer pb.ServerAddress, volumeId needle.VolumeId, sinceNs uint64) error {
	return operation.WithVolumeServerClient(true, targetServer, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeTailReceiver(context.Background(), &volume_server_pb.VolumeTailReceiverRequest{
				VolumeId:           uint32(volumeId),
				SinceNs:            sinceNs,
				IdleTimeoutSeconds: 60, // 1 minute timeout
				SourceVolumeServer: string(sourceServer),
			})
			return err
		})
}

// unmountVolume unmounts the volume from the server
func (t *BalanceTask) unmountVolume(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeUnmount(context.Background(), &volume_server_pb.VolumeUnmountRequest{
				VolumeId: uint32(volumeId),
			})
			return err
		})
}

// deleteVolume deletes the volume from the server
func (t *BalanceTask) deleteVolume(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeDelete(context.Background(), &volume_server_pb.VolumeDeleteRequest{
				VolumeId:  uint32(volumeId),
				OnlyEmpty: false,
			})
			return err
		})
}
weed/worker/tasks/balance/config.go
@@ -1,170 +0,0 @@
package balance

import (
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/admin/config"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
)

// Config extends BaseConfig with balance-specific settings
type Config struct {
	base.BaseConfig
	ImbalanceThreshold float64 `json:"imbalance_threshold"`
	MinServerCount     int     `json:"min_server_count"`
}

// NewDefaultConfig creates a new default balance configuration
func NewDefaultConfig() *Config {
	return &Config{
		BaseConfig: base.BaseConfig{
			Enabled:             true,
			ScanIntervalSeconds: 30 * 60, // 30 minutes
			MaxConcurrent:       1,
		},
		ImbalanceThreshold: 0.2, // 20%
		MinServerCount:     2,
	}
}

// GetConfigSpec returns the configuration schema for balance tasks
func GetConfigSpec() base.ConfigSpec {
	return base.ConfigSpec{
		Fields: []*config.Field{
			{
				Name:         "enabled",
				JSONName:     "enabled",
				Type:         config.FieldTypeBool,
				DefaultValue: true,
				Required:     false,
				DisplayName:  "Enable Balance Tasks",
				Description:  "Whether balance tasks should be automatically created",
				HelpText:     "Toggle this to enable or disable automatic balance task generation",
				InputType:    "checkbox",
				CSSClasses:   "form-check-input",
			},
			{
				Name:         "scan_interval_seconds",
				JSONName:     "scan_interval_seconds",
				Type:         config.FieldTypeInterval,
				DefaultValue: 30 * 60,
				MinValue:     5 * 60,
				MaxValue:     2 * 60 * 60,
				Required:     true,
				DisplayName:  "Scan Interval",
				Description:  "How often to scan for volume distribution imbalances",
				HelpText:     "The system will check for volume distribution imbalances at this interval",
				Placeholder:  "30",
				Unit:         config.UnitMinutes,
				InputType:    "interval",
				CSSClasses:   "form-control",
			},
			{
				Name:         "max_concurrent",
				JSONName:     "max_concurrent",
				Type:         config.FieldTypeInt,
				DefaultValue: 1,
				MinValue:     1,
				MaxValue:     3,
				Required:     true,
				DisplayName:  "Max Concurrent Tasks",
				Description:  "Maximum number of balance tasks that can run simultaneously",
				HelpText:     "Limits the number of balance operations running at the same time",
				Placeholder:  "1 (default)",
				Unit:         config.UnitCount,
				InputType:    "number",
				CSSClasses:   "form-control",
			},
			{
				Name:         "imbalance_threshold",
				JSONName:     "imbalance_threshold",
				Type:         config.FieldTypeFloat,
				DefaultValue: 0.2,
				MinValue:     0.05,
				MaxValue:     0.5,
				Required:     true,
				DisplayName:  "Imbalance Threshold",
				Description:  "Minimum imbalance ratio to trigger balancing",
				HelpText:     "Volume distribution imbalances above this threshold will trigger balancing",
				Placeholder:  "0.20 (20%)",
				Unit:         config.UnitNone,
				InputType:    "number",
				CSSClasses:   "form-control",
			},
			{
				Name:         "min_server_count",
				JSONName:     "min_server_count",
				Type:         config.FieldTypeInt,
				DefaultValue: 2,
				MinValue:     2,
				MaxValue:     10,
				Required:     true,
				DisplayName:  "Minimum Server Count",
				Description:  "Minimum number of servers required for balancing",
				HelpText:     "Balancing will only occur if there are at least this many servers",
				Placeholder:  "2 (default)",
				Unit:         config.UnitCount,
				InputType:    "number",
				CSSClasses:   "form-control",
			},
		},
	}
}

// ToTaskPolicy converts configuration to a TaskPolicy protobuf message
func (c *Config) ToTaskPolicy() *worker_pb.TaskPolicy {
	return &worker_pb.TaskPolicy{
		Enabled:               c.Enabled,
		MaxConcurrent:         int32(c.MaxConcurrent),
		RepeatIntervalSeconds: int32(c.ScanIntervalSeconds),
		CheckIntervalSeconds:  int32(c.ScanIntervalSeconds),
		TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
			BalanceConfig: &worker_pb.BalanceTaskConfig{
				ImbalanceThreshold: float64(c.ImbalanceThreshold),
				MinServerCount:     int32(c.MinServerCount),
			},
		},
	}
}

// FromTaskPolicy loads configuration from a TaskPolicy protobuf message
func (c *Config) FromTaskPolicy(policy *worker_pb.TaskPolicy) error {
	if policy == nil {
		return fmt.Errorf("policy is nil")
	}

	// Set general TaskPolicy fields
	c.Enabled = policy.Enabled
	c.MaxConcurrent = int(policy.MaxConcurrent)
	c.ScanIntervalSeconds = int(policy.RepeatIntervalSeconds) // Direct seconds-to-seconds mapping

	// Set balance-specific fields from the task config
	if balanceConfig := policy.GetBalanceConfig(); balanceConfig != nil {
		c.ImbalanceThreshold = float64(balanceConfig.ImbalanceThreshold)
		c.MinServerCount = int(balanceConfig.MinServerCount)
	}

	return nil
}

// LoadConfigFromPersistence loads configuration from the persistence layer if available
func LoadConfigFromPersistence(configPersistence interface{}) *Config {
	config := NewDefaultConfig()

	// Try to load from persistence if available
	if persistence, ok := configPersistence.(interface {
		LoadBalanceTaskPolicy() (*worker_pb.TaskPolicy, error)
	}); ok {
		if policy, err := persistence.LoadBalanceTaskPolicy(); err == nil && policy != nil {
			if err := config.FromTaskPolicy(policy); err == nil {
				glog.V(1).Infof("Loaded balance configuration from persistence")
				return config
			}
		}
	}

	glog.V(1).Infof("Using default balance configuration")
	return config
}
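
Note (illustrative, not part of the removed file): LoadConfigFromPersistence above relies on a duck-typed interface assertion, so any persistence value that exposes LoadBalanceTaskPolicy can feed it. A minimal sketch of such a provider, using hypothetical names:

    // fakePersistence is a hypothetical provider satisfying the interface
    // asserted inside LoadConfigFromPersistence.
    type fakePersistence struct{ policy *worker_pb.TaskPolicy }

    func (p *fakePersistence) LoadBalanceTaskPolicy() (*worker_pb.TaskPolicy, error) {
        return p.policy, nil // a nil policy makes the loader fall back to defaults
    }

    // cfg := LoadConfigFromPersistence(&fakePersistence{policy: savedPolicy})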
weed/worker/tasks/balance/detection.go
@@ -1,272 +0,0 @@
package balance

import (
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/admin/topology"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// Detection implements the detection logic for balance tasks
func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterInfo, config base.TaskConfig) ([]*types.TaskDetectionResult, error) {
	if !config.IsEnabled() {
		return nil, nil
	}

	balanceConfig := config.(*Config)

	// Skip if cluster is too small
	minVolumeCount := 2 // More reasonable for small clusters
	if len(metrics) < minVolumeCount {
		glog.Infof("BALANCE: No tasks created - cluster too small (%d volumes, need ≥%d)", len(metrics), minVolumeCount)
		return nil, nil
	}

	// Analyze volume distribution across servers
	serverVolumeCounts := make(map[string]int)
	for _, metric := range metrics {
		serverVolumeCounts[metric.Server]++
	}

	if len(serverVolumeCounts) < balanceConfig.MinServerCount {
		glog.Infof("BALANCE: No tasks created - too few servers (%d servers, need ≥%d)", len(serverVolumeCounts), balanceConfig.MinServerCount)
		return nil, nil
	}

	// Calculate balance metrics
	totalVolumes := len(metrics)
	avgVolumesPerServer := float64(totalVolumes) / float64(len(serverVolumeCounts))

	maxVolumes := 0
	minVolumes := totalVolumes
	maxServer := ""
	minServer := ""

	for server, count := range serverVolumeCounts {
		if count > maxVolumes {
			maxVolumes = count
			maxServer = server
		}
		if count < minVolumes {
			minVolumes = count
			minServer = server
		}
	}

	// Check if imbalance exceeds threshold
	imbalanceRatio := float64(maxVolumes-minVolumes) / avgVolumesPerServer
	if imbalanceRatio <= balanceConfig.ImbalanceThreshold {
		glog.Infof("BALANCE: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). Max=%d volumes on %s, Min=%d on %s, Avg=%.1f",
			imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxVolumes, maxServer, minVolumes, minServer, avgVolumesPerServer)
		return nil, nil
	}

	// Select a volume from the overloaded server for balance
	var selectedVolume *types.VolumeHealthMetrics
	for _, metric := range metrics {
		if metric.Server == maxServer {
			selectedVolume = metric
			break
		}
	}

	if selectedVolume == nil {
		glog.Warningf("BALANCE: Could not find volume on overloaded server %s", maxServer)
		return nil, nil
	}

	// Create balance task with volume and destination planning info
	reason := fmt.Sprintf("Cluster imbalance detected: %.1f%% (max: %d on %s, min: %d on %s, avg: %.1f)",
		imbalanceRatio*100, maxVolumes, maxServer, minVolumes, minServer, avgVolumesPerServer)

	// Generate task ID for ActiveTopology integration
	taskID := fmt.Sprintf("balance_vol_%d_%d", selectedVolume.VolumeID, time.Now().Unix())

	task := &types.TaskDetectionResult{
		TaskID:     taskID, // Link to ActiveTopology pending task
		TaskType:   types.TaskTypeBalance,
		VolumeID:   selectedVolume.VolumeID,
		Server:     selectedVolume.Server,
		Collection: selectedVolume.Collection,
		Priority:   types.TaskPriorityNormal,
		Reason:     reason,
		ScheduleAt: time.Now(),
	}

	// Plan destination if ActiveTopology is available
	if clusterInfo.ActiveTopology != nil {
		destinationPlan, err := planBalanceDestination(clusterInfo.ActiveTopology, selectedVolume)
		if err != nil {
			glog.Warningf("Failed to plan balance destination for volume %d: %v", selectedVolume.VolumeID, err)
			return nil, nil // Skip this task if destination planning fails
		}

		// Find the actual disk containing the volume on the source server
		sourceDisk, found := base.FindVolumeDisk(clusterInfo.ActiveTopology, selectedVolume.VolumeID, selectedVolume.Collection, selectedVolume.Server)
		if !found {
			return nil, fmt.Errorf("BALANCE: Could not find volume %d (collection: %s) on source server %s - unable to create balance task",
				selectedVolume.VolumeID, selectedVolume.Collection, selectedVolume.Server)
		}

		// Create typed parameters with unified source and target information
		task.TypedParams = &worker_pb.TaskParams{
			TaskId:     taskID, // Link to ActiveTopology pending task
			VolumeId:   selectedVolume.VolumeID,
			Collection: selectedVolume.Collection,
			VolumeSize: selectedVolume.Size, // Store original volume size for tracking changes

			// Unified sources and targets - the only way to specify locations
			Sources: []*worker_pb.TaskSource{
				{
					Node:          selectedVolume.Server,
					DiskId:        sourceDisk,
					VolumeId:      selectedVolume.VolumeID,
					EstimatedSize: selectedVolume.Size,
					DataCenter:    selectedVolume.DataCenter,
					Rack:          selectedVolume.Rack,
				},
			},
			Targets: []*worker_pb.TaskTarget{
				{
					Node:          destinationPlan.TargetNode,
					DiskId:        destinationPlan.TargetDisk,
					VolumeId:      selectedVolume.VolumeID,
					EstimatedSize: destinationPlan.ExpectedSize,
					DataCenter:    destinationPlan.TargetDC,
					Rack:          destinationPlan.TargetRack,
				},
			},

			TaskParams: &worker_pb.TaskParams_BalanceParams{
				BalanceParams: &worker_pb.BalanceTaskParams{
					ForceMove:      false,
					TimeoutSeconds: 600, // 10 minutes default
				},
			},
		}

		glog.V(1).Infof("Planned balance destination for volume %d: %s -> %s",
			selectedVolume.VolumeID, selectedVolume.Server, destinationPlan.TargetNode)

		// Add pending balance task to ActiveTopology for capacity management
		targetDisk := destinationPlan.TargetDisk

		err = clusterInfo.ActiveTopology.AddPendingTask(topology.TaskSpec{
			TaskID:     taskID,
			TaskType:   topology.TaskTypeBalance,
			VolumeID:   selectedVolume.VolumeID,
			VolumeSize: int64(selectedVolume.Size),
			Sources: []topology.TaskSourceSpec{
				{ServerID: selectedVolume.Server, DiskID: sourceDisk},
			},
			Destinations: []topology.TaskDestinationSpec{
				{ServerID: destinationPlan.TargetNode, DiskID: targetDisk},
			},
		})
		if err != nil {
			return nil, fmt.Errorf("BALANCE: Failed to add pending task for volume %d: %v", selectedVolume.VolumeID, err)
		}

		glog.V(2).Infof("Added pending balance task %s to ActiveTopology for volume %d: %s:%d -> %s:%d",
			taskID, selectedVolume.VolumeID, selectedVolume.Server, sourceDisk, destinationPlan.TargetNode, targetDisk)
	} else {
		glog.Warningf("No ActiveTopology available for destination planning in balance detection")
		return nil, nil
	}

	return []*types.TaskDetectionResult{task}, nil
}

// planBalanceDestination plans the destination for a balance operation
// This function implements destination planning logic directly in the detection phase
func planBalanceDestination(activeTopology *topology.ActiveTopology, selectedVolume *types.VolumeHealthMetrics) (*topology.DestinationPlan, error) {
	// Get source node information from topology
	var sourceRack, sourceDC string

	// Extract rack and DC from topology info
	topologyInfo := activeTopology.GetTopologyInfo()
	if topologyInfo != nil {
		for _, dc := range topologyInfo.DataCenterInfos {
			for _, rack := range dc.RackInfos {
				for _, dataNodeInfo := range rack.DataNodeInfos {
					if dataNodeInfo.Id == selectedVolume.Server {
						sourceDC = dc.Id
						sourceRack = rack.Id
						break
					}
				}
				if sourceRack != "" {
					break
				}
			}
			if sourceDC != "" {
				break
			}
		}
	}

	// Get available disks, excluding the source node
	availableDisks := activeTopology.GetAvailableDisks(topology.TaskTypeBalance, selectedVolume.Server)
	if len(availableDisks) == 0 {
		return nil, fmt.Errorf("no available disks for balance operation")
	}

	// Find the best destination disk based on balance criteria
	var bestDisk *topology.DiskInfo
	bestScore := -1.0

	for _, disk := range availableDisks {
		score := calculateBalanceScore(disk, sourceRack, sourceDC, selectedVolume.Size)
		if score > bestScore {
			bestScore = score
			bestDisk = disk
		}
	}

	if bestDisk == nil {
		return nil, fmt.Errorf("no suitable destination found for balance operation")
	}

	return &topology.DestinationPlan{
		TargetNode:     bestDisk.NodeID,
		TargetDisk:     bestDisk.DiskID,
		TargetRack:     bestDisk.Rack,
		TargetDC:       bestDisk.DataCenter,
		ExpectedSize:   selectedVolume.Size,
		PlacementScore: bestScore,
	}, nil
}

// calculateBalanceScore calculates placement score for balance operations
func calculateBalanceScore(disk *topology.DiskInfo, sourceRack, sourceDC string, volumeSize uint64) float64 {
	if disk.DiskInfo == nil {
		return 0.0
	}

	score := 0.0

	// Prefer disks with lower current volume count (better for balance)
	if disk.DiskInfo.MaxVolumeCount > 0 {
		utilization := float64(disk.DiskInfo.VolumeCount) / float64(disk.DiskInfo.MaxVolumeCount)
		score += (1.0 - utilization) * 40.0 // Up to 40 points for low utilization
	}

	// Prefer different racks for better distribution
	if disk.Rack != sourceRack {
		score += 30.0
	}

	// Prefer different data centers for better distribution
	if disk.DataCenter != sourceDC {
		score += 20.0
	}

	// Prefer disks with lower current load
	score += (10.0 - float64(disk.LoadCount)) // Up to 10 points for low load

	return score
}
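
Worked example (for context, not part of the removed file): with three servers holding 10, 6, and 2 volumes, avgVolumesPerServer = 18/3 = 6 and imbalanceRatio = (10 - 2) / 6 ≈ 1.33, well above the default ImbalanceThreshold of 0.2, so one volume on the 10-volume server is selected and planBalanceDestination scores the remaining disks: up to 40 points for low utilization, 30 for a different rack, 20 for a different data center, and up to 10 for low load.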
weed/worker/tasks/balance/execution.go
@@ -1,158 +0,0 @@
package balance

import (
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// TypedTask implements balance operation with typed protobuf parameters
type TypedTask struct {
	*base.BaseTypedTask

	// Task state from protobuf
	sourceServer   string
	destNode       string
	volumeID       uint32
	collection     string
	estimatedSize  uint64
	forceMove      bool
	timeoutSeconds int32
}

// NewTypedTask creates a new typed balance task
func NewTypedTask() types.TypedTaskInterface {
	task := &TypedTask{
		BaseTypedTask: base.NewBaseTypedTask(types.TaskTypeBalance),
	}
	return task
}

// ValidateTyped validates the typed parameters for balance task
func (t *TypedTask) ValidateTyped(params *worker_pb.TaskParams) error {
	// Basic validation from base class
	if err := t.BaseTypedTask.ValidateTyped(params); err != nil {
		return err
	}

	// Check that we have balance-specific parameters
	balanceParams := params.GetBalanceParams()
	if balanceParams == nil {
		return fmt.Errorf("balance_params is required for balance task")
	}

	// Validate sources and targets
	if len(params.Sources) == 0 {
		return fmt.Errorf("at least one source is required for balance task")
	}
	if len(params.Targets) == 0 {
		return fmt.Errorf("at least one target is required for balance task")
	}

	// Validate that source and target have volume IDs
	if params.Sources[0].VolumeId == 0 {
		return fmt.Errorf("source volume_id is required for balance task")
	}
	if params.Targets[0].VolumeId == 0 {
		return fmt.Errorf("target volume_id is required for balance task")
	}

	// Validate timeout
	if balanceParams.TimeoutSeconds <= 0 {
		return fmt.Errorf("timeout_seconds must be greater than 0")
	}

	return nil
}

// EstimateTimeTyped estimates the time needed for balance operation based on protobuf parameters
func (t *TypedTask) EstimateTimeTyped(params *worker_pb.TaskParams) time.Duration {
	balanceParams := params.GetBalanceParams()
	if balanceParams != nil {
		// Use the timeout from parameters if specified
		if balanceParams.TimeoutSeconds > 0 {
			return time.Duration(balanceParams.TimeoutSeconds) * time.Second
		}
	}

	// Estimate based on volume size from sources (1 minute per GB)
	if len(params.Sources) > 0 {
		source := params.Sources[0]
		if source.EstimatedSize > 0 {
			gbSize := source.EstimatedSize / (1024 * 1024 * 1024)
			return time.Duration(gbSize) * time.Minute
		}
	}

	// Default estimation
	return 10 * time.Minute
}

// ExecuteTyped implements the balance operation with typed parameters
func (t *TypedTask) ExecuteTyped(params *worker_pb.TaskParams) error {
	// Extract basic parameters
	t.volumeID = params.VolumeId
	t.collection = params.Collection

	// Ensure sources and targets are present (should be guaranteed by validation)
	if len(params.Sources) == 0 {
		return fmt.Errorf("at least one source is required for balance task (ExecuteTyped)")
	}
	if len(params.Targets) == 0 {
		return fmt.Errorf("at least one target is required for balance task (ExecuteTyped)")
	}

	// Extract source and target information
	t.sourceServer = params.Sources[0].Node
	t.estimatedSize = params.Sources[0].EstimatedSize
	t.destNode = params.Targets[0].Node

	// Extract balance-specific parameters
	balanceParams := params.GetBalanceParams()
	if balanceParams != nil {
		t.forceMove = balanceParams.ForceMove
		t.timeoutSeconds = balanceParams.TimeoutSeconds
	}

	glog.Infof("Starting typed balance task for volume %d: %s -> %s (collection: %s, size: %d bytes)",
		t.volumeID, t.sourceServer, t.destNode, t.collection, t.estimatedSize)

	// Simulate balance operation with progress updates
	steps := []struct {
		name     string
		duration time.Duration
		progress float64
	}{
		{"Analyzing cluster state", 2 * time.Second, 15},
		{"Verifying destination capacity", 1 * time.Second, 25},
		{"Starting volume migration", 1 * time.Second, 35},
		{"Moving volume data", 6 * time.Second, 75},
		{"Updating cluster metadata", 2 * time.Second, 95},
		{"Verifying balance completion", 1 * time.Second, 100},
	}

	for _, step := range steps {
		if t.IsCancelled() {
			return fmt.Errorf("balance task cancelled during: %s", step.name)
		}

		glog.V(1).Infof("Balance task step: %s", step.name)
		t.SetProgress(step.progress)

		// Simulate work
		time.Sleep(step.duration)
	}

	glog.Infof("Typed balance task completed successfully for volume %d: %s -> %s",
		t.volumeID, t.sourceServer, t.destNode)
	return nil
}

// Register the typed task in the global registry
func init() {
	types.RegisterGlobalTypedTask(types.TaskTypeBalance, NewTypedTask)
	glog.V(1).Infof("Registered typed balance task")
}
weed/worker/tasks/balance/monitoring.go
@@ -1,138 +0,0 @@
package balance

import (
	"sync"
	"time"
)

// BalanceMetrics contains balance-specific monitoring data
type BalanceMetrics struct {
	// Execution metrics
	VolumesBalanced      int64     `json:"volumes_balanced"`
	TotalDataTransferred int64     `json:"total_data_transferred"`
	AverageImbalance     float64   `json:"average_imbalance"`
	LastBalanceTime      time.Time `json:"last_balance_time"`

	// Performance metrics
	AverageTransferSpeed float64 `json:"average_transfer_speed_mbps"`
	TotalExecutionTime   int64   `json:"total_execution_time_seconds"`
	SuccessfulOperations int64   `json:"successful_operations"`
	FailedOperations     int64   `json:"failed_operations"`

	// Current task metrics
	CurrentImbalanceScore float64 `json:"current_imbalance_score"`
	PlannedDestinations   int     `json:"planned_destinations"`

	mutex sync.RWMutex
}

// NewBalanceMetrics creates a new balance metrics instance
func NewBalanceMetrics() *BalanceMetrics {
	return &BalanceMetrics{
		LastBalanceTime: time.Now(),
	}
}

// RecordVolumeBalanced records a successful volume balance operation
func (m *BalanceMetrics) RecordVolumeBalanced(volumeSize int64, transferTime time.Duration) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.VolumesBalanced++
	m.TotalDataTransferred += volumeSize
	m.SuccessfulOperations++
	m.LastBalanceTime = time.Now()
	m.TotalExecutionTime += int64(transferTime.Seconds())

	// Calculate average transfer speed (MB/s)
	if transferTime > 0 {
		speedMBps := float64(volumeSize) / (1024 * 1024) / transferTime.Seconds()
		if m.AverageTransferSpeed == 0 {
			m.AverageTransferSpeed = speedMBps
		} else {
			// Exponential moving average
			m.AverageTransferSpeed = 0.8*m.AverageTransferSpeed + 0.2*speedMBps
		}
	}
}

// RecordFailure records a failed balance operation
func (m *BalanceMetrics) RecordFailure() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.FailedOperations++
}

// UpdateImbalanceScore updates the current cluster imbalance score
func (m *BalanceMetrics) UpdateImbalanceScore(score float64) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.CurrentImbalanceScore = score

	// Update average imbalance with exponential moving average
	if m.AverageImbalance == 0 {
		m.AverageImbalance = score
	} else {
		m.AverageImbalance = 0.9*m.AverageImbalance + 0.1*score
	}
}

// SetPlannedDestinations sets the number of planned destinations
func (m *BalanceMetrics) SetPlannedDestinations(count int) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.PlannedDestinations = count
}

// GetMetrics returns a copy of the current metrics (without the mutex)
func (m *BalanceMetrics) GetMetrics() BalanceMetrics {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	// Create a copy without the mutex to avoid copying lock value
	return BalanceMetrics{
		VolumesBalanced:       m.VolumesBalanced,
		TotalDataTransferred:  m.TotalDataTransferred,
		AverageImbalance:      m.AverageImbalance,
		LastBalanceTime:       m.LastBalanceTime,
		AverageTransferSpeed:  m.AverageTransferSpeed,
		TotalExecutionTime:    m.TotalExecutionTime,
		SuccessfulOperations:  m.SuccessfulOperations,
		FailedOperations:      m.FailedOperations,
		CurrentImbalanceScore: m.CurrentImbalanceScore,
		PlannedDestinations:   m.PlannedDestinations,
	}
}

// GetSuccessRate returns the success rate as a percentage
func (m *BalanceMetrics) GetSuccessRate() float64 {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	total := m.SuccessfulOperations + m.FailedOperations
	if total == 0 {
		return 100.0
	}
	return float64(m.SuccessfulOperations) / float64(total) * 100.0
}

// Reset resets all metrics to zero
func (m *BalanceMetrics) Reset() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	*m = BalanceMetrics{
		LastBalanceTime: time.Now(),
	}
}

// Global metrics instance for balance tasks
var globalBalanceMetrics = NewBalanceMetrics()

// GetGlobalBalanceMetrics returns the global balance metrics instance
func GetGlobalBalanceMetrics() *BalanceMetrics {
	return globalBalanceMetrics
}
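
Usage sketch (illustrative, not part of the removed file; the numbers are made up): callers would record and read these metrics roughly as follows.

    metrics := GetGlobalBalanceMetrics()
    metrics.RecordVolumeBalanced(32<<30, 5*time.Minute) // e.g. 32 GiB moved in 5 minutes
    glog.Infof("balance success rate: %.1f%%", metrics.GetSuccessRate())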
weed/worker/tasks/balance/register.go
@@ -1,86 +0,0 @@
package balance

import (
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// Global variable to hold the task definition for configuration updates
var globalTaskDef *base.TaskDefinition

// Auto-register this task when the package is imported
func init() {
	RegisterBalanceTask()

	// Register config updater
	tasks.AutoRegisterConfigUpdater(types.TaskTypeBalance, UpdateConfigFromPersistence)
}

// RegisterBalanceTask registers the balance task with the new architecture
func RegisterBalanceTask() {
	// Create configuration instance
	config := NewDefaultConfig()

	// Create complete task definition
	taskDef := &base.TaskDefinition{
		Type:         types.TaskTypeBalance,
		Name:         "balance",
		DisplayName:  "Volume Balance",
		Description:  "Balances volume distribution across servers",
		Icon:         "fas fa-balance-scale text-warning",
		Capabilities: []string{"balance", "distribution"},

		Config:     config,
		ConfigSpec: GetConfigSpec(),
		CreateTask: func(params *worker_pb.TaskParams) (types.Task, error) {
			if params == nil {
				return nil, fmt.Errorf("task parameters are required")
			}
			if len(params.Sources) == 0 {
				return nil, fmt.Errorf("at least one source is required for balance task")
			}
			return NewBalanceTask(
				fmt.Sprintf("balance-%d", params.VolumeId),
				params.Sources[0].Node, // Use first source node
				params.VolumeId,
				params.Collection,
			), nil
		},
		DetectionFunc:  Detection,
		ScanInterval:   30 * time.Minute,
		SchedulingFunc: Scheduling,
		MaxConcurrent:  1,
		RepeatInterval: 2 * time.Hour,
	}

	// Store task definition globally for configuration updates
	globalTaskDef = taskDef

	// Register everything with a single function call!
	base.RegisterTask(taskDef)
}

// UpdateConfigFromPersistence updates the balance configuration from persistence
func UpdateConfigFromPersistence(configPersistence interface{}) error {
	if globalTaskDef == nil {
		return fmt.Errorf("balance task not registered")
	}

	// Load configuration from persistence
	newConfig := LoadConfigFromPersistence(configPersistence)
	if newConfig == nil {
		return fmt.Errorf("failed to load configuration from persistence")
	}

	// Update the task definition's config
	globalTaskDef.Config = newConfig

	glog.V(1).Infof("Updated balance task configuration from persistence")
	return nil
}
weed/worker/tasks/balance/scheduling.go
@@ -1,37 +0,0 @@
package balance

import (
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// Scheduling implements the scheduling logic for balance tasks
func Scheduling(task *types.TaskInput, runningTasks []*types.TaskInput, availableWorkers []*types.WorkerData, config base.TaskConfig) bool {
	balanceConfig := config.(*Config)

	// Count running balance tasks
	runningBalanceCount := 0
	for _, runningTask := range runningTasks {
		if runningTask.Type == types.TaskTypeBalance {
			runningBalanceCount++
		}
	}

	// Check concurrency limit
	if runningBalanceCount >= balanceConfig.MaxConcurrent {
		return false
	}

	// Check if we have available workers
	availableWorkerCount := 0
	for _, worker := range availableWorkers {
		for _, capability := range worker.Capabilities {
			if capability == types.TaskTypeBalance {
				availableWorkerCount++
				break
			}
		}
	}

	return availableWorkerCount > 0
}
weed/worker/tasks/vacuum/config.go
@@ -1,190 +0,0 @@
package vacuum

import (
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/admin/config"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
)

// Config extends BaseConfig with vacuum-specific settings
type Config struct {
	base.BaseConfig
	GarbageThreshold    float64 `json:"garbage_threshold"`
	MinVolumeAgeSeconds int     `json:"min_volume_age_seconds"`
	MinIntervalSeconds  int     `json:"min_interval_seconds"`
}

// NewDefaultConfig creates a new default vacuum configuration
func NewDefaultConfig() *Config {
	return &Config{
		BaseConfig: base.BaseConfig{
			Enabled:             true,
			ScanIntervalSeconds: 2 * 60 * 60, // 2 hours
			MaxConcurrent:       2,
		},
		GarbageThreshold:    0.3,              // 30%
		MinVolumeAgeSeconds: 24 * 60 * 60,     // 24 hours
		MinIntervalSeconds:  7 * 24 * 60 * 60, // 7 days
	}
}

// ToTaskPolicy converts configuration to a TaskPolicy protobuf message
func (c *Config) ToTaskPolicy() *worker_pb.TaskPolicy {
	return &worker_pb.TaskPolicy{
		Enabled:               c.Enabled,
		MaxConcurrent:         int32(c.MaxConcurrent),
		RepeatIntervalSeconds: int32(c.ScanIntervalSeconds),
		CheckIntervalSeconds:  int32(c.ScanIntervalSeconds),
		TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
			VacuumConfig: &worker_pb.VacuumTaskConfig{
				GarbageThreshold:   float64(c.GarbageThreshold),
				MinVolumeAgeHours:  int32(c.MinVolumeAgeSeconds / 3600), // Convert seconds to hours
				MinIntervalSeconds: int32(c.MinIntervalSeconds),
			},
		},
	}
}

// FromTaskPolicy loads configuration from a TaskPolicy protobuf message
func (c *Config) FromTaskPolicy(policy *worker_pb.TaskPolicy) error {
	if policy == nil {
		return fmt.Errorf("policy is nil")
	}

	// Set general TaskPolicy fields
	c.Enabled = policy.Enabled
	c.MaxConcurrent = int(policy.MaxConcurrent)
	c.ScanIntervalSeconds = int(policy.RepeatIntervalSeconds) // Direct seconds-to-seconds mapping

	// Set vacuum-specific fields from the task config
	if vacuumConfig := policy.GetVacuumConfig(); vacuumConfig != nil {
		c.GarbageThreshold = float64(vacuumConfig.GarbageThreshold)
		c.MinVolumeAgeSeconds = int(vacuumConfig.MinVolumeAgeHours * 3600) // Convert hours to seconds
		c.MinIntervalSeconds = int(vacuumConfig.MinIntervalSeconds)
	}

	return nil
}

// LoadConfigFromPersistence loads configuration from the persistence layer if available
func LoadConfigFromPersistence(configPersistence interface{}) *Config {
	config := NewDefaultConfig()

	// Try to load from persistence if available
	if persistence, ok := configPersistence.(interface {
		LoadVacuumTaskPolicy() (*worker_pb.TaskPolicy, error)
	}); ok {
		if policy, err := persistence.LoadVacuumTaskPolicy(); err == nil && policy != nil {
			if err := config.FromTaskPolicy(policy); err == nil {
				glog.V(1).Infof("Loaded vacuum configuration from persistence")
				return config
			}
		}
	}

	glog.V(1).Infof("Using default vacuum configuration")
	return config
}

// GetConfigSpec returns the configuration schema for vacuum tasks
func GetConfigSpec() base.ConfigSpec {
	return base.ConfigSpec{
		Fields: []*config.Field{
			{
				Name:         "enabled",
				JSONName:     "enabled",
				Type:         config.FieldTypeBool,
				DefaultValue: true,
				Required:     false,
				DisplayName:  "Enable Vacuum Tasks",
				Description:  "Whether vacuum tasks should be automatically created",
				HelpText:     "Toggle this to enable or disable automatic vacuum task generation",
				InputType:    "checkbox",
				CSSClasses:   "form-check-input",
			},
			{
				Name:         "scan_interval_seconds",
				JSONName:     "scan_interval_seconds",
				Type:         config.FieldTypeInterval,
				DefaultValue: 2 * 60 * 60,
				MinValue:     10 * 60,
				MaxValue:     24 * 60 * 60,
				Required:     true,
				DisplayName:  "Scan Interval",
				Description:  "How often to scan for volumes needing vacuum",
				HelpText:     "The system will check for volumes that need vacuuming at this interval",
				Placeholder:  "2",
				Unit:         config.UnitHours,
				InputType:    "interval",
				CSSClasses:   "form-control",
			},
			{
				Name:         "max_concurrent",
				JSONName:     "max_concurrent",
				Type:         config.FieldTypeInt,
				DefaultValue: 2,
				MinValue:     1,
				MaxValue:     10,
				Required:     true,
				DisplayName:  "Max Concurrent Tasks",
				Description:  "Maximum number of vacuum tasks that can run simultaneously",
				HelpText:     "Limits the number of vacuum operations running at the same time to control system load",
				Placeholder:  "2 (default)",
				Unit:         config.UnitCount,
				InputType:    "number",
				CSSClasses:   "form-control",
			},
			{
				Name:         "garbage_threshold",
				JSONName:     "garbage_threshold",
				Type:         config.FieldTypeFloat,
				DefaultValue: 0.3,
				MinValue:     0.0,
				MaxValue:     1.0,
				Required:     true,
				DisplayName:  "Garbage Percentage Threshold",
				Description:  "Trigger vacuum when garbage ratio exceeds this percentage",
				HelpText:     "Volumes with more deleted content than this threshold will be vacuumed",
				Placeholder:  "0.30 (30%)",
				Unit:         config.UnitNone,
				InputType:    "number",
				CSSClasses:   "form-control",
			},
			{
				Name:         "min_volume_age_seconds",
				JSONName:     "min_volume_age_seconds",
				Type:         config.FieldTypeInterval,
				DefaultValue: 24 * 60 * 60,
				MinValue:     1 * 60 * 60,
				MaxValue:     7 * 24 * 60 * 60,
				Required:     true,
				DisplayName:  "Minimum Volume Age",
				Description:  "Only vacuum volumes older than this duration",
				HelpText:     "Prevents vacuuming of recently created volumes that may still be actively written to",
				Placeholder:  "24",
				Unit:         config.UnitHours,
				InputType:    "interval",
				CSSClasses:   "form-control",
			},
			{
				Name:         "min_interval_seconds",
				JSONName:     "min_interval_seconds",
				Type:         config.FieldTypeInterval,
				DefaultValue: 7 * 24 * 60 * 60,
				MinValue:     1 * 24 * 60 * 60,
				MaxValue:     30 * 24 * 60 * 60,
				Required:     true,
				DisplayName:  "Minimum Interval",
				Description:  "Minimum time between vacuum operations on the same volume",
				HelpText:     "Prevents excessive vacuuming of the same volume by enforcing a minimum wait time",
				Placeholder:  "7",
				Unit:         config.UnitDays,
				InputType:    "interval",
				CSSClasses:   "form-control",
			},
		},
	}
}
@ -1,133 +0,0 @@ |
|||||
package vacuum |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// Detection implements the detection logic for vacuum tasks
|
|
||||
func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterInfo, config base.TaskConfig) ([]*types.TaskDetectionResult, error) { |
|
||||
if !config.IsEnabled() { |
|
||||
return nil, nil |
|
||||
} |
|
||||
|
|
||||
vacuumConfig := config.(*Config) |
|
||||
var results []*types.TaskDetectionResult |
|
||||
minVolumeAge := time.Duration(vacuumConfig.MinVolumeAgeSeconds) * time.Second |
|
||||
|
|
||||
debugCount := 0 |
|
||||
skippedDueToGarbage := 0 |
|
||||
skippedDueToAge := 0 |
|
||||
|
|
||||
for _, metric := range metrics { |
|
||||
// Check if volume needs vacuum
|
|
||||
if metric.GarbageRatio >= vacuumConfig.GarbageThreshold && metric.Age >= minVolumeAge { |
|
||||
priority := types.TaskPriorityNormal |
|
||||
if metric.GarbageRatio > 0.6 { |
|
||||
priority = types.TaskPriorityHigh |
|
||||
} |
|
||||
|
|
||||
// Generate task ID for future ActiveTopology integration
|
|
||||
taskID := fmt.Sprintf("vacuum_vol_%d_%d", metric.VolumeID, time.Now().Unix()) |
|
||||
|
|
||||
result := &types.TaskDetectionResult{ |
|
||||
				TaskID:     taskID, // For future ActiveTopology integration
				TaskType:   types.TaskTypeVacuum,
				VolumeID:   metric.VolumeID,
				Server:     metric.Server,
				Collection: metric.Collection,
				Priority:   priority,
				Reason:     "Volume has excessive garbage requiring vacuum",
				ScheduleAt: time.Now(),
			}

			// Create typed parameters for vacuum task
			result.TypedParams = createVacuumTaskParams(result, metric, vacuumConfig, clusterInfo)
			results = append(results, result)
		} else {
			// Debug why volume was not selected
			if debugCount < 5 { // Limit debug output to first 5 volumes
				if metric.GarbageRatio < vacuumConfig.GarbageThreshold {
					skippedDueToGarbage++
				}
				if metric.Age < minVolumeAge {
					skippedDueToAge++
				}
			}
			debugCount++
		}
	}

	// Log debug summary if no tasks were created
	if len(results) == 0 && len(metrics) > 0 {
		totalVolumes := len(metrics)
		glog.Infof("VACUUM: No tasks created for %d volumes. Threshold=%.2f%%, MinAge=%s. Skipped: %d (garbage<threshold), %d (age<minimum)",
			totalVolumes, vacuumConfig.GarbageThreshold*100, minVolumeAge, skippedDueToGarbage, skippedDueToAge)

		// Show details for first few volumes
		for i, metric := range metrics {
			if i >= 3 { // Limit to first 3 volumes
				break
			}
			glog.Infof("VACUUM: Volume %d: garbage=%.2f%% (need ≥%.2f%%), age=%s (need ≥%s)",
				metric.VolumeID, metric.GarbageRatio*100, vacuumConfig.GarbageThreshold*100,
				metric.Age.Truncate(time.Minute), minVolumeAge.Truncate(time.Minute))
		}
	}

	return results, nil
}

// createVacuumTaskParams creates typed parameters for vacuum tasks
// This function is moved from MaintenanceIntegration.createVacuumTaskParams to the detection logic
func createVacuumTaskParams(task *types.TaskDetectionResult, metric *types.VolumeHealthMetrics, vacuumConfig *Config, clusterInfo *types.ClusterInfo) *worker_pb.TaskParams {
	// Use configured values or defaults
	garbageThreshold := 0.3                    // Default 30%
	verifyChecksum := true                     // Default to verify
	batchSize := int32(1000)                   // Default batch size
	workingDir := "/tmp/seaweedfs_vacuum_work" // Default working directory

	if vacuumConfig != nil {
		garbageThreshold = vacuumConfig.GarbageThreshold
		// Note: VacuumTaskConfig has GarbageThreshold, MinVolumeAgeHours, MinIntervalSeconds
		// Other fields like VerifyChecksum, BatchSize, WorkingDir would need to be added
		// to the protobuf definition if they should be configurable
	}

	// Use DC and rack information directly from VolumeHealthMetrics
	sourceDC, sourceRack := metric.DataCenter, metric.Rack

	// Create typed protobuf parameters with unified sources
	return &worker_pb.TaskParams{
		TaskId:     task.TaskID, // Link to ActiveTopology pending task (if integrated)
		VolumeId:   task.VolumeID,
		Collection: task.Collection,
		VolumeSize: metric.Size, // Store original volume size for tracking changes

		// Unified sources array
		Sources: []*worker_pb.TaskSource{
			{
				Node:          task.Server,
				VolumeId:      task.VolumeID,
				EstimatedSize: metric.Size,
				DataCenter:    sourceDC,
				Rack:          sourceRack,
			},
		},

		TaskParams: &worker_pb.TaskParams_VacuumParams{
			VacuumParams: &worker_pb.VacuumTaskParams{
				GarbageThreshold: garbageThreshold,
				ForceVacuum:      false,
				BatchSize:        batchSize,
				WorkingDir:       workingDir,
				VerifyChecksum:   verifyChecksum,
			},
		},
	}
}
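The selection branch above reduces to a two-part predicate: enough garbage and enough age. A minimal standalone sketch of that rule (the needsVacuum helper below is hypothetical, not part of the codebase):

package main

import (
	"fmt"
	"time"
)

// needsVacuum mirrors the detection filter: a volume qualifies only when its
// garbage ratio meets the configured threshold and it is old enough.
func needsVacuum(garbageRatio, threshold float64, age, minAge time.Duration) bool {
	return garbageRatio >= threshold && age >= minAge
}

func main() {
	// 45% garbage, 26h old, threshold 30%, minimum age 24h -> eligible
	fmt.Println(needsVacuum(0.45, 0.30, 26*time.Hour, 24*time.Hour)) // true
	// 10% garbage -> skipped (counted as skippedDueToGarbage in the debug summary)
	fmt.Println(needsVacuum(0.10, 0.30, 26*time.Hour, 24*time.Hour)) // false
}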
@ -1,151 +0,0 @@
package vacuum

import (
	"sync"
	"time"
)

// VacuumMetrics contains vacuum-specific monitoring data
type VacuumMetrics struct {
	// Execution metrics
	VolumesVacuumed       int64     `json:"volumes_vacuumed"`
	TotalSpaceReclaimed   int64     `json:"total_space_reclaimed"`
	TotalFilesProcessed   int64     `json:"total_files_processed"`
	TotalGarbageCollected int64     `json:"total_garbage_collected"`
	LastVacuumTime        time.Time `json:"last_vacuum_time"`

	// Performance metrics
	AverageVacuumTime    int64   `json:"average_vacuum_time_seconds"`
	AverageGarbageRatio  float64 `json:"average_garbage_ratio"`
	SuccessfulOperations int64   `json:"successful_operations"`
	FailedOperations     int64   `json:"failed_operations"`

	// Current task metrics
	CurrentGarbageRatio  float64 `json:"current_garbage_ratio"`
	VolumesPendingVacuum int     `json:"volumes_pending_vacuum"`

	mutex sync.RWMutex
}

// NewVacuumMetrics creates a new vacuum metrics instance
func NewVacuumMetrics() *VacuumMetrics {
	return &VacuumMetrics{
		LastVacuumTime: time.Now(),
	}
}

// RecordVolumeVacuumed records a successful volume vacuum operation
func (m *VacuumMetrics) RecordVolumeVacuumed(spaceReclaimed int64, filesProcessed int64, garbageCollected int64, vacuumTime time.Duration, garbageRatio float64) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.VolumesVacuumed++
	m.TotalSpaceReclaimed += spaceReclaimed
	m.TotalFilesProcessed += filesProcessed
	m.TotalGarbageCollected += garbageCollected
	m.SuccessfulOperations++
	m.LastVacuumTime = time.Now()

	// Update average vacuum time
	if m.AverageVacuumTime == 0 {
		m.AverageVacuumTime = int64(vacuumTime.Seconds())
	} else {
		// Exponential moving average
		newTime := int64(vacuumTime.Seconds())
		m.AverageVacuumTime = (m.AverageVacuumTime*4 + newTime) / 5
	}

	// Update average garbage ratio
	if m.AverageGarbageRatio == 0 {
		m.AverageGarbageRatio = garbageRatio
	} else {
		// Exponential moving average
		m.AverageGarbageRatio = 0.8*m.AverageGarbageRatio + 0.2*garbageRatio
	}
}

// RecordFailure records a failed vacuum operation
func (m *VacuumMetrics) RecordFailure() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.FailedOperations++
}

// UpdateCurrentGarbageRatio updates the current volume's garbage ratio
func (m *VacuumMetrics) UpdateCurrentGarbageRatio(ratio float64) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.CurrentGarbageRatio = ratio
}

// SetVolumesPendingVacuum sets the number of volumes pending vacuum
func (m *VacuumMetrics) SetVolumesPendingVacuum(count int) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.VolumesPendingVacuum = count
}

// GetMetrics returns a copy of the current metrics (without the mutex)
func (m *VacuumMetrics) GetMetrics() VacuumMetrics {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	// Create a copy without the mutex to avoid copying the lock value
	return VacuumMetrics{
		VolumesVacuumed:       m.VolumesVacuumed,
		TotalSpaceReclaimed:   m.TotalSpaceReclaimed,
		TotalFilesProcessed:   m.TotalFilesProcessed,
		TotalGarbageCollected: m.TotalGarbageCollected,
		LastVacuumTime:        m.LastVacuumTime,
		AverageVacuumTime:     m.AverageVacuumTime,
		AverageGarbageRatio:   m.AverageGarbageRatio,
		SuccessfulOperations:  m.SuccessfulOperations,
		FailedOperations:      m.FailedOperations,
		CurrentGarbageRatio:   m.CurrentGarbageRatio,
		VolumesPendingVacuum:  m.VolumesPendingVacuum,
	}
}

// GetSuccessRate returns the success rate as a percentage
func (m *VacuumMetrics) GetSuccessRate() float64 {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	total := m.SuccessfulOperations + m.FailedOperations
	if total == 0 {
		return 100.0
	}
	return float64(m.SuccessfulOperations) / float64(total) * 100.0
}

// GetAverageSpaceReclaimed returns the average space reclaimed per volume
func (m *VacuumMetrics) GetAverageSpaceReclaimed() float64 {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	if m.VolumesVacuumed == 0 {
		return 0
	}
	return float64(m.TotalSpaceReclaimed) / float64(m.VolumesVacuumed)
}

// Reset resets all metrics to zero
func (m *VacuumMetrics) Reset() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	*m = VacuumMetrics{
		LastVacuumTime: time.Now(),
	}
}

// Global metrics instance for vacuum tasks
var globalVacuumMetrics = NewVacuumMetrics()

// GetGlobalVacuumMetrics returns the global vacuum metrics instance
func GetGlobalVacuumMetrics() *VacuumMetrics {
	return globalVacuumMetrics
}
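Both running averages in RecordVolumeVacuumed are exponential moving averages that weight the newest sample at 20%: (old*4 + new)/5 is just the integer form of 0.8*old + 0.2*new. A standalone sketch of the same update rule, with a hypothetical ema helper:

package main

import "fmt"

// ema applies the update rule used for AverageVacuumTime and AverageGarbageRatio:
// keep 80% of the previous value and take 20% of the new sample.
func ema(previous, sample float64) float64 {
	if previous == 0 {
		return sample // the first observation seeds the average
	}
	return 0.8*previous + 0.2*sample
}

func main() {
	avg := 0.0
	for _, ratio := range []float64{0.40, 0.35, 0.50} {
		avg = ema(avg, ratio)
	}
	fmt.Printf("average garbage ratio: %.3f\n", avg) // 0.412
}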
@ -1,86 +0,0 @@
package vacuum

import (
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// Global variable to hold the task definition for configuration updates
var globalTaskDef *base.TaskDefinition

// Auto-register this task when the package is imported
func init() {
	RegisterVacuumTask()

	// Register config updater
	tasks.AutoRegisterConfigUpdater(types.TaskTypeVacuum, UpdateConfigFromPersistence)
}

// RegisterVacuumTask registers the vacuum task with the new architecture
func RegisterVacuumTask() {
	// Create configuration instance
	config := NewDefaultConfig()

	// Create complete task definition
	taskDef := &base.TaskDefinition{
		Type:         types.TaskTypeVacuum,
		Name:         "vacuum",
		DisplayName:  "Volume Vacuum",
		Description:  "Reclaims disk space by removing deleted files from volumes",
		Icon:         "fas fa-broom text-primary",
		Capabilities: []string{"vacuum", "storage"},

		Config:     config,
		ConfigSpec: GetConfigSpec(),
		CreateTask: func(params *worker_pb.TaskParams) (types.Task, error) {
			if params == nil {
				return nil, fmt.Errorf("task parameters are required")
			}
			if len(params.Sources) == 0 {
				return nil, fmt.Errorf("at least one source is required for vacuum task")
			}
			return NewVacuumTask(
				fmt.Sprintf("vacuum-%d", params.VolumeId),
				params.Sources[0].Node, // Use first source node
				params.VolumeId,
				params.Collection,
			), nil
		},
		DetectionFunc:  Detection,
		ScanInterval:   2 * time.Hour,
		SchedulingFunc: Scheduling,
		MaxConcurrent:  2,
		RepeatInterval: 7 * 24 * time.Hour,
	}

	// Store task definition globally for configuration updates
	globalTaskDef = taskDef

	// Register everything with a single function call
	base.RegisterTask(taskDef)
}

// UpdateConfigFromPersistence updates the vacuum configuration from persistence
func UpdateConfigFromPersistence(configPersistence interface{}) error {
	if globalTaskDef == nil {
		return fmt.Errorf("vacuum task not registered")
	}

	// Load configuration from persistence
	newConfig := LoadConfigFromPersistence(configPersistence)
	if newConfig == nil {
		return fmt.Errorf("failed to load configuration from persistence")
	}

	// Update the task definition's config
	globalTaskDef.Config = newConfig

	glog.V(1).Infof("Updated vacuum task configuration from persistence")
	return nil
}
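The registration above relies on package init side effects: blank-importing the package is enough to add the task to the registry. A simplified standalone sketch of that pattern (the registry and TaskDefinition below are stand-ins, not the real base package types):

package main

import "fmt"

// TaskDefinition is a stripped-down stand-in for the real definition struct.
type TaskDefinition struct {
	Name          string
	MaxConcurrent int
}

// registry collects every task definition registered at init time.
var registry = map[string]*TaskDefinition{}

// RegisterTask stores a definition under its name, mirroring the single-call registration.
func RegisterTask(def *TaskDefinition) { registry[def.Name] = def }

// init runs when the package is imported, so a blank import is enough to register.
func init() {
	RegisterTask(&TaskDefinition{Name: "vacuum", MaxConcurrent: 2})
}

func main() {
	for name, def := range registry {
		fmt.Printf("registered task %q (max concurrent %d)\n", name, def.MaxConcurrent)
	}
}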
@ -1,37 +0,0 @@
package vacuum

import (
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// Scheduling implements the scheduling logic for vacuum tasks
func Scheduling(task *types.TaskInput, runningTasks []*types.TaskInput, availableWorkers []*types.WorkerData, config base.TaskConfig) bool {
	vacuumConfig := config.(*Config)

	// Count running vacuum tasks
	runningVacuumCount := 0
	for _, runningTask := range runningTasks {
		if runningTask.Type == types.TaskTypeVacuum {
			runningVacuumCount++
		}
	}

	// Check concurrency limit
	if runningVacuumCount >= vacuumConfig.MaxConcurrent {
		return false
	}

	// Check for available workers with vacuum capability
	for _, worker := range availableWorkers {
		if worker.CurrentLoad < worker.MaxConcurrent {
			for _, capability := range worker.Capabilities {
				if capability == types.TaskTypeVacuum {
					return true
				}
			}
		}
	}

	return false
}
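The scheduling decision is two gates: stay under the configured concurrency limit, then find at least one worker with spare capacity that advertises the vacuum capability. A standalone sketch with simplified stand-in types (not the real types package):

package main

import "fmt"

type worker struct {
	currentLoad, maxConcurrent int
	capabilities               []string
}

// canSchedule mirrors Scheduling: concurrency gate first, then a capability scan.
func canSchedule(running, maxConcurrent int, workers []worker) bool {
	if running >= maxConcurrent {
		return false
	}
	for _, w := range workers {
		if w.currentLoad < w.maxConcurrent {
			for _, c := range w.capabilities {
				if c == "vacuum" {
					return true
				}
			}
		}
	}
	return false
}

func main() {
	workers := []worker{{currentLoad: 1, maxConcurrent: 2, capabilities: []string{"vacuum", "storage"}}}
	fmt.Println(canSchedule(1, 2, workers)) // true: under the limit and a capable worker is free
	fmt.Println(canSchedule(2, 2, workers)) // false: concurrency limit reached
}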
@ -1,244 +0,0 @@
package vacuum

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/operation"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
	"github.com/seaweedfs/seaweedfs/weed/worker/types/base"
	"google.golang.org/grpc"
)

// VacuumTask implements the Task interface
type VacuumTask struct {
	*base.BaseTask
	server           string
	volumeID         uint32
	collection       string
	garbageThreshold float64
	progress         float64
}

// NewVacuumTask creates a new unified vacuum task instance
func NewVacuumTask(id string, server string, volumeID uint32, collection string) *VacuumTask {
	return &VacuumTask{
		BaseTask:         base.NewBaseTask(id, types.TaskTypeVacuum),
		server:           server,
		volumeID:         volumeID,
		collection:       collection,
		garbageThreshold: 0.3, // Default 30% threshold
	}
}

// Execute implements the UnifiedTask interface
func (t *VacuumTask) Execute(ctx context.Context, params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}

	vacuumParams := params.GetVacuumParams()
	if vacuumParams == nil {
		return fmt.Errorf("vacuum parameters are required")
	}

	t.garbageThreshold = vacuumParams.GarbageThreshold

	t.GetLogger().WithFields(map[string]interface{}{
		"volume_id":         t.volumeID,
		"server":            t.server,
		"collection":        t.collection,
		"garbage_threshold": t.garbageThreshold,
	}).Info("Starting vacuum task")

	// Step 1: Check volume status and garbage ratio
	t.ReportProgress(10.0)
	t.GetLogger().Info("Checking volume status")
	eligible, currentGarbageRatio, err := t.checkVacuumEligibility()
	if err != nil {
		return fmt.Errorf("failed to check vacuum eligibility: %v", err)
	}

	if !eligible {
		t.GetLogger().WithFields(map[string]interface{}{
			"current_garbage_ratio": currentGarbageRatio,
			"required_threshold":    t.garbageThreshold,
		}).Info("Volume does not meet vacuum criteria, skipping")
		t.ReportProgress(100.0)
		return nil
	}

	// Step 2: Perform vacuum operation
	t.ReportProgress(50.0)
	t.GetLogger().WithFields(map[string]interface{}{
		"garbage_ratio": currentGarbageRatio,
		"threshold":     t.garbageThreshold,
	}).Info("Performing vacuum operation")

	if err := t.performVacuum(); err != nil {
		return fmt.Errorf("failed to perform vacuum: %v", err)
	}

	// Step 3: Verify vacuum results
	t.ReportProgress(90.0)
	t.GetLogger().Info("Verifying vacuum results")
	if err := t.verifyVacuumResults(); err != nil {
		glog.Warningf("Vacuum verification failed: %v", err)
		// Don't fail the task - the vacuum operation itself succeeded
	}

	t.ReportProgress(100.0)
	glog.Infof("Vacuum task completed successfully: volume %d from %s (garbage ratio was %.2f%%)",
		t.volumeID, t.server, currentGarbageRatio*100)
	return nil
}

// Validate implements the UnifiedTask interface
func (t *VacuumTask) Validate(params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}

	vacuumParams := params.GetVacuumParams()
	if vacuumParams == nil {
		return fmt.Errorf("vacuum parameters are required")
	}

	if params.VolumeId != t.volumeID {
		return fmt.Errorf("volume ID mismatch: expected %d, got %d", t.volumeID, params.VolumeId)
	}

	// Validate that at least one source matches our server
	found := false
	for _, source := range params.Sources {
		if source.Node == t.server {
			found = true
			break
		}
	}
	if !found {
		return fmt.Errorf("no source matches expected server %s", t.server)
	}

	if vacuumParams.GarbageThreshold < 0 || vacuumParams.GarbageThreshold > 1.0 {
		return fmt.Errorf("invalid garbage threshold: %f (must be between 0.0 and 1.0)", vacuumParams.GarbageThreshold)
	}

	return nil
}

// EstimateTime implements the UnifiedTask interface
func (t *VacuumTask) EstimateTime(params *worker_pb.TaskParams) time.Duration {
	// Basic estimate based on simulated steps
	return 14 * time.Second // Sum of all step durations
}

// GetProgress returns the current progress
func (t *VacuumTask) GetProgress() float64 {
	return t.progress
}

// Helper methods for real vacuum operations

// checkVacuumEligibility checks if the volume meets the vacuum criteria
func (t *VacuumTask) checkVacuumEligibility() (bool, float64, error) {
	var garbageRatio float64

	err := operation.WithVolumeServerClient(false, pb.ServerAddress(t.server), grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			resp, err := client.VacuumVolumeCheck(context.Background(), &volume_server_pb.VacuumVolumeCheckRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("failed to check volume vacuum status: %v", err)
			}

			garbageRatio = resp.GarbageRatio

			return nil
		})

	if err != nil {
		return false, 0, err
	}

	eligible := garbageRatio >= t.garbageThreshold
	glog.V(1).Infof("Volume %d garbage ratio: %.2f%%, threshold: %.2f%%, eligible: %v",
		t.volumeID, garbageRatio*100, t.garbageThreshold*100, eligible)

	return eligible, garbageRatio, nil
}

// performVacuum executes the actual vacuum operation
func (t *VacuumTask) performVacuum() error {
	return operation.WithVolumeServerClient(false, pb.ServerAddress(t.server), grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			// Step 1: Compact the volume
			t.GetLogger().Info("Compacting volume")
			stream, err := client.VacuumVolumeCompact(context.Background(), &volume_server_pb.VacuumVolumeCompactRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("vacuum compact failed: %v", err)
			}

			// Read compact progress
			for {
				resp, recvErr := stream.Recv()
				if recvErr != nil {
					if recvErr == io.EOF {
						break
					}
					return fmt.Errorf("vacuum compact stream error: %v", recvErr)
				}
				glog.V(2).Infof("Volume %d compact progress: %d bytes processed", t.volumeID, resp.ProcessedBytes)
			}

			// Step 2: Commit the vacuum
			t.GetLogger().Info("Committing vacuum operation")
			_, err = client.VacuumVolumeCommit(context.Background(), &volume_server_pb.VacuumVolumeCommitRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("vacuum commit failed: %v", err)
			}

			// Step 3: Clean up old files
			t.GetLogger().Info("Cleaning up vacuum files")
			_, err = client.VacuumVolumeCleanup(context.Background(), &volume_server_pb.VacuumVolumeCleanupRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("vacuum cleanup failed: %v", err)
			}

			glog.V(1).Infof("Volume %d vacuum operation completed successfully", t.volumeID)
			return nil
		})
}

// verifyVacuumResults checks the volume status after vacuum
func (t *VacuumTask) verifyVacuumResults() error {
	return operation.WithVolumeServerClient(false, pb.ServerAddress(t.server), grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			resp, err := client.VacuumVolumeCheck(context.Background(), &volume_server_pb.VacuumVolumeCheckRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("failed to verify vacuum results: %v", err)
			}

			postVacuumGarbageRatio := resp.GarbageRatio

			glog.V(1).Infof("Volume %d post-vacuum garbage ratio: %.2f%%",
				t.volumeID, postVacuumGarbageRatio*100)

			return nil
		})
}
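The compact step drains a server-streaming gRPC call by looping on Recv until io.EOF. A standalone sketch of that drain loop against a hypothetical stream interface (the real client stream comes from volume_server_pb; this one exists only for illustration):

package main

import (
	"errors"
	"fmt"
	"io"
)

// progress is a stand-in for the compact progress message.
type progress struct{ ProcessedBytes int64 }

// progressStream mimics the Recv contract of a server-streaming gRPC client.
type progressStream interface {
	Recv() (*progress, error)
}

// drain reads progress updates until the stream reports io.EOF, returning any other error.
func drain(stream progressStream) error {
	for {
		resp, err := stream.Recv()
		if err != nil {
			if errors.Is(err, io.EOF) {
				return nil // stream finished cleanly
			}
			return fmt.Errorf("stream error: %w", err)
		}
		fmt.Printf("compact progress: %d bytes processed\n", resp.ProcessedBytes)
	}
}

// fakeStream serves a fixed set of updates, then io.EOF.
type fakeStream struct{ updates []*progress }

func (f *fakeStream) Recv() (*progress, error) {
	if len(f.updates) == 0 {
		return nil, io.EOF
	}
	next := f.updates[0]
	f.updates = f.updates[1:]
	return next, nil
}

func main() {
	_ = drain(&fakeStream{updates: []*progress{{1024}, {4096}}})
}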