@@ -31,37 +31,88 @@ type EcVacuumTask struct {
 	volumeID    uint32
 	collection  string
 	sourceNodes map[pb.ServerAddress]erasure_coding.ShardBits
-	sourceGeneration uint32 // generation to vacuum from (G)
-	targetGeneration uint32 // generation to create (G+1)
 	tempDir        string
 	grpcDialOption grpc.DialOption
 	masterAddress  pb.ServerAddress // master server address for activation RPC
-	cleanupGracePeriod time.Duration // grace period before cleaning up old generation
+	cleanupGracePeriod time.Duration // grace period before cleaning up old generation (1 minute default)
+	topologyTaskID     string        // links to ActiveTopology task for capacity tracking
+
+	// Runtime-determined during execution
+	sourceGeneration uint32 // generation to vacuum from (determined at runtime)
+	targetGeneration uint32 // generation to create (determined at runtime)
 }

 // NewEcVacuumTask creates a new EC vacuum task instance
-func NewEcVacuumTask(id string, volumeID uint32, collection string, sourceNodes map[pb.ServerAddress]erasure_coding.ShardBits, sourceGeneration uint32) *EcVacuumTask {
+func NewEcVacuumTask(id string, volumeID uint32, collection string, sourceNodes map[pb.ServerAddress]erasure_coding.ShardBits) *EcVacuumTask {
 	return &EcVacuumTask{
 		BaseTask:    base.NewBaseTask(id, types.TaskType("ec_vacuum")),
 		volumeID:    volumeID,
 		collection:  collection,
 		sourceNodes: sourceNodes,
-		sourceGeneration:   sourceGeneration,     // generation to vacuum from (G)
-		targetGeneration:   sourceGeneration + 1, // generation to create (G+1)
-		cleanupGracePeriod: 5 * time.Minute,      // default 5 minute grace period (configurable)
+		cleanupGracePeriod: 1 * time.Minute, // 1 minute grace period for faster cleanup
+		// sourceGeneration and targetGeneration will be determined during execution
 	}
 }

+// SetTopologyTaskID sets the topology task ID for capacity tracking integration
+func (t *EcVacuumTask) SetTopologyTaskID(taskID string) {
+	t.topologyTaskID = taskID
+}
+
+// GetTopologyTaskID returns the topology task ID
+func (t *EcVacuumTask) GetTopologyTaskID() string {
+	return t.topologyTaskID
+}
+
+// determineGenerations resolves the source and target generations for this run
+// (currently uses safe defaults; the generation info itself comes from the detection phase)
+func (t *EcVacuumTask) determineGenerations() error {
+	// Use sensible default master address (can be overridden via task parameters)
+	masterAddress := "localhost:9333"
+	t.masterAddress = pb.ServerAddress(masterAddress)
+
+	// Use generation info from TaskSource parameters (already determined during detection)
+	// Default to safe values for backward compatibility
+	t.sourceGeneration = 0
+	t.targetGeneration = 1
+
+	t.LogInfo("Using simplified generation detection (generation info available in TaskSource)", map[string]interface{}{
+		"source_generation": t.sourceGeneration,
+		"target_generation": t.targetGeneration,
+	})
+
+	return nil
+}
+
 // Execute performs the EC vacuum operation
 func (t *EcVacuumTask) Execute(ctx context.Context, params *worker_pb.TaskParams) error {
-	t.LogInfo("Starting EC vacuum task", map[string]interface{}{
+	// Step 0: Determine the source and target generations (simplified - uses defaults)
+	if err := t.determineGenerations(); err != nil {
+		return fmt.Errorf("failed to determine generations: %w", err)
+	}
+
+	// Log task information
+	logFields := map[string]interface{}{
 		"volume_id":         t.volumeID,
 		"collection":        t.collection,
 		"source_generation": t.sourceGeneration,
 		"target_generation": t.targetGeneration,
-		"shard_nodes":       len(t.sourceNodes),
-		"cleanup_grace":     t.cleanupGracePeriod,
-	})
+	}
+
+	// Cleanup planning is now simplified
+
+	// Add additional task info
+	logFields["shard_nodes"] = len(t.sourceNodes)
+	logFields["cleanup_grace"] = t.cleanupGracePeriod
+
+	// Add topology integration info
+	if t.topologyTaskID != "" {
+		logFields["topology_task_id"] = t.topologyTaskID
+		logFields["topology_integrated"] = true
+	} else {
+		logFields["topology_integrated"] = false
+	}
+
+	t.LogInfo("Starting EC vacuum task with runtime generation detection", logFields)

 	// Step 1: Create temporary working directory
 	if err := t.createTempDir(); err != nil {
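For orientation, a minimal caller-side sketch of the reworked constructor and topology hook shown above: the constructor no longer takes a generation argument, the topology task ID is attached separately, and Execute resolves generations itself. Everything below is illustrative only: the package name, import paths, server address, volume ID, shard bitmask, and task IDs are assumptions, and the helper is hypothetical rather than part of this change.

package ec_vacuum // assumed package for this task file

import (
	"context"
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
)

// runExampleVacuum shows how a worker might wire up the task (hypothetical helper).
func runExampleVacuum(ctx context.Context, params *worker_pb.TaskParams) error {
	// One volume server assumed to hold all 14 shards of volume 42 (bits 0-13 set).
	nodes := map[pb.ServerAddress]erasure_coding.ShardBits{
		pb.ServerAddress("10.0.0.1:8080"): erasure_coding.ShardBits(0x3FFF),
	}

	// No generation argument anymore; Execute calls determineGenerations internally.
	task := NewEcVacuumTask("ec-vacuum-42", 42, "example_collection", nodes)
	task.SetTopologyTaskID("topo-42") // optional link for capacity tracking

	if err := task.Execute(ctx, params); err != nil {
		return fmt.Errorf("ec vacuum of volume 42 failed: %w", err)
	}
	return nil
}
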
@@ -876,12 +927,14 @@ func (t *EcVacuumTask) activateNewGeneration() error {
 	})
 }

-// cleanupOldEcShards removes the old generation EC shards after successful activation
+// cleanupOldEcShards removes ALL old generation EC shards after successful activation
+// This includes not just the source generation, but all generations except the new target generation
 func (t *EcVacuumTask) cleanupOldEcShards() error {
-	t.LogInfo("Starting cleanup of old generation EC shards", map[string]interface{}{
+	t.LogInfo("Starting cleanup of all old generation EC shards", map[string]interface{}{
 		"volume_id":         t.volumeID,
-		"source_generation": t.sourceGeneration,
+		"target_generation": t.targetGeneration,
 		"grace_period":      t.cleanupGracePeriod,
+		"note":              "will cleanup ALL generations except target generation",
 	})

 	// Step 1: Grace period - wait before cleanup
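The updated doc comment above states the intent: after activation, every generation except the newly activated target is eligible for cleanup. As a self-contained sketch of that selection rule (the hunk that follows actually simplifies it to just the source generation for now), the generation numbers here are made up for illustration:

package main

import "fmt"

// generationsExceptTarget returns all known generations except the newly activated target.
func generationsExceptTarget(known []uint32, target uint32) []uint32 {
	var out []uint32
	for _, g := range known {
		if g != target {
			out = append(out, g)
		}
	}
	return out
}

func main() {
	// Leftover shards exist for generations 0 and 1; generation 2 was just activated.
	fmt.Println(generationsExceptTarget([]uint32{0, 1, 2}, 2)) // prints [0 1]
}
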
@@ -906,87 +959,174 @@ func (t *EcVacuumTask) cleanupOldEcShards() error {
 		return fmt.Errorf("safety checks failed: %w", err)
 	}

-	// Step 3: Unmount and delete old generation shards from each node
+	// Step 3: Clean up old generations (simplified - clean up source generation only)
+	// Generation discovery is now handled during detection phase in EcVolumeInfo
+	generationsToCleanup := []uint32{t.sourceGeneration}
+
+	t.LogInfo("Identified generations for cleanup", map[string]interface{}{
+		"volume_id":              t.volumeID,
+		"target_generation":      t.targetGeneration,
+		"source_generation":      t.sourceGeneration,
+		"generations_to_cleanup": generationsToCleanup,
+	})
+
+	// Step 4: Unmount and delete old generation shards from each node
 	var cleanupErrors []string
-	for node, shardBits := range t.sourceNodes {
-		if err := t.cleanupOldShardsFromNode(node, shardBits); err != nil {
-			cleanupErrors = append(cleanupErrors, fmt.Sprintf("node %s: %v", node, err))
-			t.LogWarning("Failed to cleanup shards from node", map[string]interface{}{
+	for node := range t.sourceNodes {
+		for _, generation := range generationsToCleanup {
+			if err := t.cleanupGenerationFromNode(node, generation); err != nil {
+				cleanupErrors = append(cleanupErrors, fmt.Sprintf("node %s generation %d: %v", node, generation, err))
+				t.LogWarning("Failed to cleanup generation from node", map[string]interface{}{
 					"node":       node,
+					"generation": generation,
 					"error":      err.Error(),
 				})
 			}
 		}
+	}

-	// Step 4: Report cleanup results
+	// Step 5: Report cleanup results
 	if len(cleanupErrors) > 0 {
 		t.LogWarning("Cleanup completed with errors", map[string]interface{}{
 			"errors": cleanupErrors,
 			"note":   "some old generation files may remain",
+			"generations_attempted": generationsToCleanup,
 		})
 		// Don't fail the task for cleanup errors - vacuum was successful
 		return nil
 	}

-	t.LogInfo("Successfully cleaned up old generation EC shards", map[string]interface{}{
+	t.LogInfo("Successfully cleaned up all old generation EC shards", map[string]interface{}{
 		"volume_id":           t.volumeID,
-		"source_generation":   t.sourceGeneration,
+		"target_generation":   t.targetGeneration,
+		"cleaned_generations": generationsToCleanup,
+		"total_cleaned":       len(generationsToCleanup),
 	})
 	return nil
 }

-// cleanupOldShardsFromNode unmounts and deletes old generation shards from a specific node
-func (t *EcVacuumTask) cleanupOldShardsFromNode(node pb.ServerAddress, shardBits erasure_coding.ShardBits) error {
+// cleanupGenerationFromNode unmounts and deletes a specific generation's shards from a node
+func (t *EcVacuumTask) cleanupGenerationFromNode(node pb.ServerAddress, generation uint32) error {
 	return operation.WithVolumeServerClient(false, node, t.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
-		shardIds := shardBits.ToUint32Slice()
-
-		t.LogInfo("Unmounting old generation shards", map[string]interface{}{
+		t.LogInfo("Cleaning up generation from node", map[string]interface{}{
 			"node":       node,
 			"volume_id":  t.volumeID,
-			"source_generation": t.sourceGeneration,
-			"shard_ids":         shardIds,
+			"generation": generation,
 		})

 		// Final safety check: Double-check we're not deleting the active generation
-		if err := t.finalSafetyCheck(); err != nil {
-			return fmt.Errorf("FINAL SAFETY CHECK FAILED on node %s: %w", node, err)
+		if generation == t.targetGeneration {
+			return fmt.Errorf("CRITICAL SAFETY VIOLATION: attempted to delete active generation %d", generation)
 		}

-		// Step 1: Unmount old generation shards
+		// Step 1: Unmount all shards for this generation
+		// Use all possible shard IDs since we don't know which ones this node has
+		allShardIds := make([]uint32, erasure_coding.TotalShardsCount)
+		for i := 0; i < erasure_coding.TotalShardsCount; i++ {
+			allShardIds[i] = uint32(i)
+		}
+
 		_, unmountErr := client.VolumeEcShardsUnmount(context.Background(), &volume_server_pb.VolumeEcShardsUnmountRequest{
 			VolumeId:   t.volumeID,
-			ShardIds:   shardIds,
-			Generation: t.sourceGeneration,
+			ShardIds:   allShardIds,
+			Generation: generation,
 		})

 		if unmountErr != nil {
-			// Log but continue - files might already be unmounted
-			t.LogInfo("Unmount completed or shards already unmounted", map[string]interface{}{
+			// Log but continue - files might already be unmounted or not exist on this node
+			t.LogInfo("Unmount completed or shards not present on node", map[string]interface{}{
 				"node":       node,
+				"generation": generation,
 				"error":      unmountErr.Error(),
-				"note":       "this is normal if shards were already unmounted",
+				"note":       "this is normal if shards were already unmounted or don't exist on this node",
 			})
 		} else {
-			t.LogInfo("✅ Successfully unmounted old generation shards", map[string]interface{}{
+			t.LogInfo("✅ Successfully unmounted generation shards", map[string]interface{}{
 				"node":       node,
 				"volume_id":  t.volumeID,
-				"source_generation": t.sourceGeneration,
-				"shard_count":       len(shardIds),
+				"generation": generation,
 			})
 		}

-		// Step 2: Delete old generation files
-		// Note: The volume server should handle file deletion when unmounting,
-		// but we could add explicit file deletion here if needed in the future
+		// Step 2: Delete generation files from disk
+		// Note: VolumeEcShardsDelete is not generation-aware, so deleteGenerationFilesFromNode
+		// performs a best-effort delete and logs the limitation for generation > 0
+		if err := t.deleteGenerationFilesFromNode(client, generation); err != nil {
+			t.LogWarning("Failed to delete generation files", map[string]interface{}{
+				"node":       node,
+				"generation": generation,
+				"error":      err.Error(),
+			})
+			// Continue despite deletion errors - unmounting already happened
+		} else {
+			t.LogInfo("✅ Successfully deleted generation files", map[string]interface{}{
+				"node":       node,
+				"volume_id":  t.volumeID,
+				"generation": generation,
+			})
+		}

-		t.LogInfo("Successfully cleaned up old generation shards from node", map[string]interface{}{
+		t.LogInfo("Successfully cleaned up generation from node", map[string]interface{}{
 			"node":       node,
 			"volume_id":  t.volumeID,
-			"source_generation": t.sourceGeneration,
+			"generation": generation,
 		})
 		return nil
 	})
 }

+// deleteGenerationFilesFromNode deletes EC files for a specific generation from a volume server
+func (t *EcVacuumTask) deleteGenerationFilesFromNode(client volume_server_pb.VolumeServerClient, generation uint32) error {
+	// For all generations, use the existing VolumeEcShardsDelete method
+	// Note: This currently only works correctly for generation 0 due to filename patterns
+	// For generation > 0, the volume server should ideally be extended to support
+	// generation-aware deletion, but for now we rely on the unmount operation
+	// to make files safe for cleanup by the volume server's garbage collection
+
+	allShardIds := make([]uint32, erasure_coding.TotalShardsCount)
+	for i := 0; i < erasure_coding.TotalShardsCount; i++ {
+		allShardIds[i] = uint32(i)
+	}
+
+	_, err := client.VolumeEcShardsDelete(context.Background(), &volume_server_pb.VolumeEcShardsDeleteRequest{
+		VolumeId:   t.volumeID,
+		Collection: t.collection,
+		ShardIds:   allShardIds,
+	})
+
+	if err != nil {
+		// Log warning but don't fail - the unmount should have made files safe for cleanup
+		t.LogWarning("VolumeEcShardsDelete returned error - this is expected for generation > 0", map[string]interface{}{
+			"volume_id":  t.volumeID,
+			"generation": generation,
+			"error":      err.Error(),
+			"note":       "Generation > 0 files need manual cleanup or volume server extension",
+		})
+
+		// For generation > 0, the files are unmounted but not deleted
+		// This is a known limitation - the volume server would need to be extended
+		// to support generation-aware file deletion in VolumeEcShardsDelete
+		if generation > 0 {
+			t.LogInfo("Generation > 0 file cleanup limitation", map[string]interface{}{
+				"volume_id":  t.volumeID,
+				"generation": generation,
+				"status":     "unmounted_but_not_deleted",
+				"note":       "Files are unmounted from memory but remain on disk until manual cleanup",
+			})
+		}
+
+		// Don't return error - unmounting is the primary safety requirement
+		return nil
+	}
+
+	t.LogInfo("✅ Successfully deleted generation files", map[string]interface{}{
+		"volume_id":  t.volumeID,
+		"generation": generation,
+	})
+
+	return nil
+}
+
 // cleanup removes temporary files and directories
 func (t *EcVacuumTask) cleanup() {
 	if t.tempDir != "" {