move ec task logic

4 months ago · 41cbbefb0b
2 changed files with 300 additions and 751 deletions
--- a/weed/worker/ec_worker.go
+++ b/weed/worker/ec_worker.go
@ -1,698 +0,0 @@
-package worker
-
-import (
-	"context"
-	"fmt"
-	"net"
-	"strconv"
-	"sync"
-	"time"
-
-	"google.golang.org/grpc"
-	"google.golang.org/grpc/credentials/insecure"
-
-	"github.com/seaweedfs/seaweedfs/weed/glog"
-	"github.com/seaweedfs/seaweedfs/weed/pb"
-	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
-	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
-)
-
-// ECWorker implements maintenance worker with actual EC functionality
-type ECWorker struct {
-	workerID      string
-	adminAddress  string
-	grpcAddress   string
-	capabilities  []string
-	maxConcurrent int
-
-	// gRPC server and client
-	server      *grpc.Server
-	adminConn   *grpc.ClientConn
-	adminClient worker_pb.WorkerServiceClient
-	adminStream worker_pb.WorkerService_WorkerStreamClient
-
-	// Task management
-	currentTasks map[string]*ActiveTask
-	taskMutex    sync.RWMutex
-
-	// Control
-	running bool
-	stopCh  chan struct{}
-	mutex   sync.RWMutex
-}
-
-// ActiveTask represents a task currently being executed
-type ActiveTask struct {
-	ID         string
-	Type       string
-	VolumeID   uint32
-	Server     string
-	Parameters map[string]string
-	StartedAt  time.Time
-	Progress   float32
-	Status     string
-	Context    context.Context
-	Cancel     context.CancelFunc
-}
-
-// NewECWorker creates a new EC worker
-func NewECWorker(workerID, adminAddress, grpcAddress string) *ECWorker {
-	return &ECWorker{
-		workerID:      workerID,
-		adminAddress:  adminAddress,
-		grpcAddress:   grpcAddress,
-		capabilities:  []string{"ec_encode", "ec_rebuild", "vacuum"},
-		maxConcurrent: 2, // Can handle 2 concurrent tasks
-		currentTasks:  make(map[string]*ActiveTask),
-		stopCh:        make(chan struct{}),
-	}
-}
-
-// Start starts the worker
-func (w *ECWorker) Start() error {
-	w.mutex.Lock()
-	defer w.mutex.Unlock()
-
-	if w.running {
-		return fmt.Errorf("worker already running")
-	}
-
-	glog.Infof("Starting EC worker %s", w.workerID)
-
-	// Start gRPC server
-	err := w.startGRPCServer()
-	if err != nil {
-		return fmt.Errorf("failed to start gRPC server: %v", err)
-	}
-
-	// Connect to admin server
-	err = w.connectToAdmin()
-	if err != nil {
-		return fmt.Errorf("failed to connect to admin: %v", err)
-	}
-
-	w.running = true
-
-	// Start background goroutines
-	go w.adminCommunicationLoop()
-	go w.heartbeatLoop()
-	go w.taskRequestLoop()
-
-	glog.Infof("EC worker %s started successfully", w.workerID)
-	return nil
-}
-
-// Stop stops the worker
-func (w *ECWorker) Stop() {
-	w.mutex.Lock()
-	defer w.mutex.Unlock()
-
-	if !w.running {
-		return
-	}
-
-	glog.Infof("Stopping EC worker %s", w.workerID)
-
-	close(w.stopCh)
-
-	// Cancel all active tasks
-	w.taskMutex.Lock()
-	for _, task := range w.currentTasks {
-		task.Cancel()
-	}
-	w.taskMutex.Unlock()
-
-	// Close connections
-	if w.adminConn != nil {
-		w.adminConn.Close()
-	}
-
-	if w.server != nil {
-		w.server.Stop()
-	}
-
-	w.running = false
-	glog.Infof("EC worker %s stopped", w.workerID)
-}
-
-// startGRPCServer starts the worker's gRPC server
-func (w *ECWorker) startGRPCServer() error {
-	listener, err := net.Listen("tcp", w.grpcAddress)
-	if err != nil {
-		return fmt.Errorf("failed to listen on %s: %v", w.grpcAddress, err)
-	}
-
-	w.server = grpc.NewServer()
-	// Register any worker-specific services here
-
-	go func() {
-		err := w.server.Serve(listener)
-		if err != nil {
-			glog.Errorf("gRPC server error: %v", err)
-		}
-	}()
-
-	glog.Infof("Worker gRPC server listening on %s", w.grpcAddress)
-	return nil
-}
-
-// connectToAdmin establishes connection to admin server
-func (w *ECWorker) connectToAdmin() error {
-	// Convert to gRPC address (HTTP port + 10000)
-	grpcAddress := pb.ServerToGrpcAddress(w.adminAddress)
-	conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
-	if err != nil {
-		return fmt.Errorf("failed to connect to admin at %s: %v", w.adminAddress, err)
-	}
-
-	w.adminConn = conn
-	w.adminClient = worker_pb.NewWorkerServiceClient(conn)
-
-	// Create bidirectional stream
-	stream, err := w.adminClient.WorkerStream(context.Background())
-	if err != nil {
-		return fmt.Errorf("failed to create admin stream: %v", err)
-	}
-
-	w.adminStream = stream
-
-	// Send registration message
-	err = w.sendRegistration()
-	if err != nil {
-		return fmt.Errorf("failed to register with admin: %v", err)
-	}
-
-	glog.Infof("Connected to admin server at %s", w.adminAddress)
-	return nil
-}
-
-// sendRegistration sends worker registration to admin
-func (w *ECWorker) sendRegistration() error {
-	registration := &worker_pb.WorkerMessage{
-		WorkerId:  w.workerID,
-		Timestamp: time.Now().Unix(),
-		Message: &worker_pb.WorkerMessage_Registration{
-			Registration: &worker_pb.WorkerRegistration{
-				WorkerId:      w.workerID,
-				Address:       w.grpcAddress,
-				Capabilities:  w.capabilities,
-				MaxConcurrent: int32(w.maxConcurrent),
-				Metadata: map[string]string{
-					"version": "1.0",
-					"type":    "ec_worker",
-				},
-			},
-		},
-	}
-
-	return w.adminStream.Send(registration)
-}
-
-// adminCommunicationLoop handles messages from admin server
-func (w *ECWorker) adminCommunicationLoop() {
-	for {
-		select {
-		case <-w.stopCh:
-			return
-		default:
-		}
-
-		msg, err := w.adminStream.Recv()
-		if err != nil {
-			glog.Errorf("Error receiving from admin: %v", err)
-			time.Sleep(5 * time.Second) // Retry connection
-			continue
-		}
-
-		w.handleAdminMessage(msg)
-	}
-}
-
-// handleAdminMessage processes messages from admin server
-func (w *ECWorker) handleAdminMessage(msg *worker_pb.AdminMessage) {
-	switch message := msg.Message.(type) {
-	case *worker_pb.AdminMessage_RegistrationResponse:
-		w.handleRegistrationResponse(message.RegistrationResponse)
-	case *worker_pb.AdminMessage_TaskAssignment:
-		w.handleTaskAssignment(message.TaskAssignment)
-	case *worker_pb.AdminMessage_TaskCancellation:
-		w.handleTaskCancellation(message.TaskCancellation)
-	case *worker_pb.AdminMessage_AdminShutdown:
-		w.handleAdminShutdown(message.AdminShutdown)
-	default:
-		glog.Warningf("Unknown message type from admin")
-	}
-}
-
-// handleRegistrationResponse processes registration response
-func (w *ECWorker) handleRegistrationResponse(resp *worker_pb.RegistrationResponse) {
-	if resp.Success {
-		glog.Infof("Worker %s registered successfully with admin", w.workerID)
-	} else {
-		glog.Errorf("Worker registration failed: %s", resp.Message)
-	}
-}
-
-// handleTaskAssignment processes task assignment from admin
-func (w *ECWorker) handleTaskAssignment(assignment *worker_pb.TaskAssignment) {
-	glog.Infof("Received task assignment: %s (%s) for volume %d",
-		assignment.TaskId, assignment.TaskType, assignment.Params.VolumeId)
-
-	// Check if we can accept the task
-	w.taskMutex.RLock()
-	currentLoad := len(w.currentTasks)
-	w.taskMutex.RUnlock()
-
-	if currentLoad >= w.maxConcurrent {
-		glog.Warningf("Worker at capacity, cannot accept task %s", assignment.TaskId)
-		return
-	}
-
-	// Create active task
-	ctx, cancel := context.WithCancel(context.Background())
-	task := &ActiveTask{
-		ID:         assignment.TaskId,
-		Type:       assignment.TaskType,
-		VolumeID:   assignment.Params.VolumeId,
-		Server:     assignment.Params.Server,
-		Parameters: assignment.Params.Parameters,
-		StartedAt:  time.Now(),
-		Progress:   0.0,
-		Status:     "started",
-		Context:    ctx,
-		Cancel:     cancel,
-	}
-
-	w.taskMutex.Lock()
-	w.currentTasks[assignment.TaskId] = task
-	w.taskMutex.Unlock()
-
-	// Start task execution
-	go w.executeTask(task)
-}
-
-// handleTaskCancellation processes task cancellation
-func (w *ECWorker) handleTaskCancellation(cancellation *worker_pb.TaskCancellation) {
-	glog.Infof("Received task cancellation: %s", cancellation.TaskId)
-
-	w.taskMutex.Lock()
-	defer w.taskMutex.Unlock()
-
-	if task, exists := w.currentTasks[cancellation.TaskId]; exists {
-		task.Cancel()
-		delete(w.currentTasks, cancellation.TaskId)
-		glog.Infof("Cancelled task %s", cancellation.TaskId)
-	}
-}
-
-// handleAdminShutdown processes admin shutdown notification
-func (w *ECWorker) handleAdminShutdown(shutdown *worker_pb.AdminShutdown) {
-	glog.Infof("Admin server shutting down: %s", shutdown.Reason)
-	w.Stop()
-}
-
-// heartbeatLoop sends periodic heartbeats to admin
-func (w *ECWorker) heartbeatLoop() {
-	ticker := time.NewTicker(30 * time.Second)
-	defer ticker.Stop()
-
-	for {
-		select {
-		case <-ticker.C:
-			w.sendHeartbeat()
-		case <-w.stopCh:
-			return
-		}
-	}
-}
-
-// sendHeartbeat sends heartbeat to admin server
-func (w *ECWorker) sendHeartbeat() {
-	w.taskMutex.RLock()
-	currentLoad := len(w.currentTasks)
-	taskIDs := make([]string, 0, len(w.currentTasks))
-	for taskID := range w.currentTasks {
-		taskIDs = append(taskIDs, taskID)
-	}
-	w.taskMutex.RUnlock()
-
-	heartbeat := &worker_pb.WorkerMessage{
-		WorkerId:  w.workerID,
-		Timestamp: time.Now().Unix(),
-		Message: &worker_pb.WorkerMessage_Heartbeat{
-			Heartbeat: &worker_pb.WorkerHeartbeat{
-				WorkerId:       w.workerID,
-				Status:         "active",
-				CurrentLoad:    int32(currentLoad),
-				MaxConcurrent:  int32(w.maxConcurrent),
-				CurrentTaskIds: taskIDs,
-				TasksCompleted: 0,                                       // TODO: Track completed tasks
-				TasksFailed:    0,                                       // TODO: Track failed tasks
-				UptimeSeconds:  int64(time.Since(time.Now()).Seconds()), // TODO: Track actual uptime
-			},
-		},
-	}
-
-	if err := w.adminStream.Send(heartbeat); err != nil {
-		glog.Errorf("Failed to send heartbeat: %v", err)
-	}
-}
-
-// taskRequestLoop periodically requests new tasks from admin
-func (w *ECWorker) taskRequestLoop() {
-	ticker := time.NewTicker(10 * time.Second)
-	defer ticker.Stop()
-
-	for {
-		select {
-		case <-ticker.C:
-			w.requestTasks()
-		case <-w.stopCh:
-			return
-		}
-	}
-}
-
-// requestTasks requests new tasks from admin if we have capacity
-func (w *ECWorker) requestTasks() {
-	w.taskMutex.RLock()
-	currentLoad := len(w.currentTasks)
-	w.taskMutex.RUnlock()
-
-	availableSlots := w.maxConcurrent - currentLoad
-	if availableSlots <= 0 {
-		return // No capacity
-	}
-
-	request := &worker_pb.WorkerMessage{
-		WorkerId:  w.workerID,
-		Timestamp: time.Now().Unix(),
-		Message: &worker_pb.WorkerMessage_TaskRequest{
-			TaskRequest: &worker_pb.TaskRequest{
-				WorkerId:       w.workerID,
-				Capabilities:   w.capabilities,
-				AvailableSlots: int32(availableSlots),
-			},
-		},
-	}
-
-	if err := w.adminStream.Send(request); err != nil {
-		glog.Errorf("Failed to request tasks: %v", err)
-	}
-}
-
-// executeTask executes a task based on its type
-func (w *ECWorker) executeTask(task *ActiveTask) {
-	defer func() {
-		w.taskMutex.Lock()
-		delete(w.currentTasks, task.ID)
-		w.taskMutex.Unlock()
-	}()
-
-	glog.Infof("Starting execution of task %s (%s) for volume %d",
-		task.ID, task.Type, task.VolumeID)
-
-	var err error
-	var success bool
-
-	switch task.Type {
-	case "ec_encode":
-		success, err = w.executeECEncode(task)
-	case "ec_rebuild":
-		success, err = w.executeECRebuild(task)
-	case "vacuum":
-		success, err = w.executeVacuum(task)
-	default:
-		err = fmt.Errorf("unknown task type: %s", task.Type)
-		success = false
-	}
-
-	// Send completion message
-	w.sendTaskCompletion(task, success, err)
-
-	if success {
-		glog.Infof("Task %s completed successfully", task.ID)
-	} else {
-		glog.Errorf("Task %s failed: %v", task.ID, err)
-	}
-}
-
-// executeECEncode performs actual EC encoding on a volume
-func (w *ECWorker) executeECEncode(task *ActiveTask) (bool, error) {
-	glog.Infof("Performing EC encoding on volume %d", task.VolumeID)
-
-	// Update progress
-	w.sendTaskUpdate(task, 0.1, "Initializing EC encoding")
-
-	// Connect to volume server
-	volumeServerAddress := task.Server
-	if volumeServerAddress == "" {
-		return false, fmt.Errorf("no volume server address provided")
-	}
-
-	// Convert to gRPC address (HTTP port + 10000)
-	grpcAddress := pb.ServerToGrpcAddress(volumeServerAddress)
-	conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
-	if err != nil {
-		return false, fmt.Errorf("failed to connect to volume server %s: %v", volumeServerAddress, err)
-	}
-	defer conn.Close()
-
-	client := volume_server_pb.NewVolumeServerClient(conn)
-
-	// Step 1: Generate EC shards
-	w.sendTaskUpdate(task, 0.2, "Generating EC shards")
-
-	generateReq := &volume_server_pb.VolumeEcShardsGenerateRequest{
-		VolumeId:   task.VolumeID,
-		Collection: task.Parameters["collection"],
-	}
-
-	_, err = client.VolumeEcShardsGenerate(task.Context, generateReq)
-	if err != nil {
-		return false, fmt.Errorf("EC shard generation failed: %v", err)
-	}
-
-	w.sendTaskUpdate(task, 0.6, "EC shards generated successfully")
-
-	// Step 2: Mount EC volume
-	w.sendTaskUpdate(task, 0.8, "Mounting EC volume")
-
-	mountReq := &volume_server_pb.VolumeEcShardsMountRequest{
-		VolumeId:   task.VolumeID,
-		Collection: task.Parameters["collection"],
-		ShardIds:   []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, // All EC shards
-	}
-
-	_, err = client.VolumeEcShardsMount(task.Context, mountReq)
-	if err != nil {
-		return false, fmt.Errorf("EC shard mount failed: %v", err)
-	}
-
-	// Step 3: Mark original volume as read-only
-	w.sendTaskUpdate(task, 0.9, "Marking volume read-only")
-
-	readOnlyReq := &volume_server_pb.VolumeMarkReadonlyRequest{
-		VolumeId: task.VolumeID,
-	}
-
-	_, err = client.VolumeMarkReadonly(task.Context, readOnlyReq)
-	if err != nil {
-		glog.Warningf("Failed to mark volume %d read-only: %v", task.VolumeID, err)
-		// This is not a critical failure for EC encoding
-	}
-
-	w.sendTaskUpdate(task, 1.0, "EC encoding completed")
-
-	return true, nil
-}
-
-// executeECRebuild performs EC shard rebuilding
-func (w *ECWorker) executeECRebuild(task *ActiveTask) (bool, error) {
-	glog.Infof("Performing EC rebuild on volume %d", task.VolumeID)
-
-	w.sendTaskUpdate(task, 0.1, "Initializing EC rebuild")
-
-	// Connect to volume server
-	grpcAddress := pb.ServerToGrpcAddress(task.Server)
-	conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
-	if err != nil {
-		return false, fmt.Errorf("failed to connect to volume server: %v", err)
-	}
-	defer conn.Close()
-
-	client := volume_server_pb.NewVolumeServerClient(conn)
-
-	// Rebuild missing/corrupted shards
-	w.sendTaskUpdate(task, 0.5, "Rebuilding EC shards")
-
-	rebuildReq := &volume_server_pb.VolumeEcShardsRebuildRequest{
-		VolumeId:   task.VolumeID,
-		Collection: task.Parameters["collection"],
-	}
-
-	_, err = client.VolumeEcShardsRebuild(task.Context, rebuildReq)
-	if err != nil {
-		return false, fmt.Errorf("EC rebuild failed: %v", err)
-	}
-
-	w.sendTaskUpdate(task, 1.0, "EC rebuild completed")
-
-	return true, nil
-}
-
-// executeVacuum performs volume vacuum operation
-func (w *ECWorker) executeVacuum(task *ActiveTask) (bool, error) {
-	glog.Infof("Performing vacuum on volume %d", task.VolumeID)
-
-	w.sendTaskUpdate(task, 0.1, "Initializing vacuum")
-
-	// Parse garbage threshold
-	thresholdStr := task.Parameters["garbage_threshold"]
-	if thresholdStr == "" {
-		thresholdStr = "0.3" // Default 30%
-	}
-
-	threshold, err := strconv.ParseFloat(thresholdStr, 32)
-	if err != nil {
-		return false, fmt.Errorf("invalid garbage threshold: %v", err)
-	}
-
-	// Connect to volume server
-	grpcAddress := pb.ServerToGrpcAddress(task.Server)
-	conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
-	if err != nil {
-		return false, fmt.Errorf("failed to connect to volume server: %v", err)
-	}
-	defer conn.Close()
-
-	client := volume_server_pb.NewVolumeServerClient(conn)
-
-	// Step 1: Check vacuum eligibility
-	w.sendTaskUpdate(task, 0.2, "Checking vacuum eligibility")
-
-	checkReq := &volume_server_pb.VacuumVolumeCheckRequest{
-		VolumeId: task.VolumeID,
-	}
-
-	checkResp, err := client.VacuumVolumeCheck(task.Context, checkReq)
-	if err != nil {
-		return false, fmt.Errorf("vacuum check failed: %v", err)
-	}
-
-	if checkResp.GarbageRatio < float64(threshold) {
-		return true, fmt.Errorf("volume %d garbage ratio %.2f%% below threshold %.2f%%",
-			task.VolumeID, checkResp.GarbageRatio*100, threshold*100)
-	}
-
-	// Step 2: Compact volume
-	w.sendTaskUpdate(task, 0.4, "Compacting volume")
-
-	compactReq := &volume_server_pb.VacuumVolumeCompactRequest{
-		VolumeId: task.VolumeID,
-	}
-
-	compactStream, err := client.VacuumVolumeCompact(task.Context, compactReq)
-	if err != nil {
-		return false, fmt.Errorf("vacuum compact failed: %v", err)
-	}
-
-	// Process compact stream
-	for {
-		resp, err := compactStream.Recv()
-		if err != nil {
-			if err.Error() == "EOF" {
-				break
-			}
-			return false, fmt.Errorf("vacuum compact stream error: %v", err)
-		}
-
-		progress := 0.4 + 0.4*(float64(resp.ProcessedBytes)/float64(resp.LoadAvg_1M)) // Rough progress estimate
-		w.sendTaskUpdate(task, float32(progress), "Compacting volume")
-	}
-
-	// Step 3: Commit vacuum
-	w.sendTaskUpdate(task, 0.9, "Committing vacuum")
-
-	commitReq := &volume_server_pb.VacuumVolumeCommitRequest{
-		VolumeId: task.VolumeID,
-	}
-
-	_, err = client.VacuumVolumeCommit(task.Context, commitReq)
-	if err != nil {
-		return false, fmt.Errorf("vacuum commit failed: %v", err)
-	}
-
-	// Step 4: Cleanup
-	w.sendTaskUpdate(task, 0.95, "Cleaning up")
-
-	cleanupReq := &volume_server_pb.VacuumVolumeCleanupRequest{
-		VolumeId: task.VolumeID,
-	}
-
-	_, err = client.VacuumVolumeCleanup(task.Context, cleanupReq)
-	if err != nil {
-		glog.Warningf("Vacuum cleanup warning: %v", err)
-		// Non-critical error
-	}
-
-	w.sendTaskUpdate(task, 1.0, "Vacuum completed successfully")
-
-	return true, nil
-}
-
-// sendTaskUpdate sends task progress update to admin
-func (w *ECWorker) sendTaskUpdate(task *ActiveTask, progress float32, message string) {
-	task.Progress = progress
-	task.Status = message
-
-	update := &worker_pb.WorkerMessage{
-		WorkerId:  w.workerID,
-		Timestamp: time.Now().Unix(),
-		Message: &worker_pb.WorkerMessage_TaskUpdate{
-			TaskUpdate: &worker_pb.TaskUpdate{
-				TaskId:   task.ID,
-				WorkerId: w.workerID,
-				Status:   task.Status,
-				Progress: progress,
-				Message:  message,
-				Metadata: map[string]string{
-					"updated_at": time.Now().Format(time.RFC3339),
-				},
-			},
-		},
-	}
-
-	if err := w.adminStream.Send(update); err != nil {
-		glog.Errorf("Failed to send task update: %v", err)
-	}
-}
-
-// sendTaskCompletion sends task completion to admin
-func (w *ECWorker) sendTaskCompletion(task *ActiveTask, success bool, taskErr error) {
-	var errorMessage string
-	if taskErr != nil {
-		errorMessage = taskErr.Error()
-	}
-
-	completion := &worker_pb.WorkerMessage{
-		WorkerId:  w.workerID,
-		Timestamp: time.Now().Unix(),
-		Message: &worker_pb.WorkerMessage_TaskComplete{
-			TaskComplete: &worker_pb.TaskComplete{
-				TaskId:         task.ID,
-				WorkerId:       w.workerID,
-				Success:        success,
-				ErrorMessage:   errorMessage,
-				CompletionTime: time.Now().Unix(),
-				ResultMetadata: map[string]string{
-					"duration": time.Since(task.StartedAt).String(),
-				},
-			},
-		},
-	}
-
-	if err := w.adminStream.Send(completion); err != nil {
-		glog.Errorf("Failed to send task completion: %v", err)
-	}
-}
--- a/weed/worker/tasks/erasure_coding/ec.go
+++ b/weed/worker/tasks/erasure_coding/ec.go
@ -7,6 +7,7 @@ import (
 	"os"
 	"path/filepath"
 	"sort"
+	"sync"
 	"time"

 	"github.com/seaweedfs/seaweedfs/weed/glog"
@ -14,6 +15,7 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
 	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
 	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
+	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
 	"github.com/seaweedfs/seaweedfs/weed/worker/tasks"
 	"github.com/seaweedfs/seaweedfs/weed/worker/types"
 	"google.golang.org/grpc"
@ -98,87 +100,76 @@ func (t *Task) SetDialOption(dialOpt grpc.DialOption) {
 	t.grpcDialOpt = dialOpt
 }

-// Execute performs the EC operation using SeaweedFS built-in EC generation
+// Execute performs the EC operation following command_ec_encode.go pattern but with local processing
 func (t *Task) Execute(params types.TaskParams) error {
-	glog.Infof("Starting erasure coding for volume %d from server %s", t.volumeID, t.sourceServer)
+	glog.Infof("Starting erasure coding for volume %d from server %s (download → ec → distribute)", t.volumeID, t.sourceServer)

-	// Extract parameters - use the actual collection from task params, don't default to "default"
+	// Extract parameters - use the actual collection from task params
 	t.collection = params.Collection
-	// Note: Leave collection empty if not specified, as volumes may have been created with empty collection

 	// Override defaults with parameters if provided
 	if mc, ok := params.Parameters["master_client"].(string); ok && mc != "" {
 		t.masterClient = mc
 	}

-	// Use the built-in SeaweedFS EC generation approach
-	ctx := context.Background()
+	volumeId := needle.VolumeId(t.volumeID)

-	// Step 1: Connect to volume server
-	grpcAddress := pb.ServerToGrpcAddress(t.sourceServer)
-	conn, err := grpc.NewClient(grpcAddress, t.grpcDialOpt)
+	// Step 0: Collect volume locations BEFORE EC encoding starts (following command_ec_encode.go pattern)
+	t.SetProgress(5.0)
+	glog.V(1).Infof("Collecting volume %d replica locations before EC encoding", t.volumeID)
+	volumeLocations, err := t.collectVolumeLocations(volumeId)
 	if err != nil {
-		return fmt.Errorf("failed to connect to volume server %s: %v", t.sourceServer, err)
+		return fmt.Errorf("failed to collect volume locations before EC encoding: %v", err)
 	}
-	defer conn.Close()
+	glog.V(1).Infof("Found volume %d on %d servers: %v", t.volumeID, len(volumeLocations), volumeLocations)

-	client := volume_server_pb.NewVolumeServerClient(conn)
+	// Step 1: Mark volume as readonly on all replicas (following command_ec_encode.go)
+	t.SetProgress(10.0)
+	glog.V(1).Infof("Marking volume %d as readonly on all replicas", t.volumeID)
+	err = t.markVolumeReadonlyOnAllReplicas(volumeLocations)
+	if err != nil {
+		return fmt.Errorf("failed to mark volume as readonly: %v", err)
+	}

-	// Step 2: Generate EC shards on the volume server
+	// Step 2: Copy volume to local worker for processing
 	t.SetProgress(20.0)
-	glog.V(1).Infof("Generating EC shards for volume %d with collection '%s'", t.volumeID, t.collection)
-
-	generateReq := &volume_server_pb.VolumeEcShardsGenerateRequest{
-		VolumeId:   t.volumeID,
-		Collection: t.collection,
+	glog.V(1).Infof("Downloading volume %d files to worker for local EC processing", t.volumeID)
+	err = t.copyVolumeDataLocally(t.workDir)
+	if err != nil {
+		return fmt.Errorf("failed to copy volume data locally: %v", err)
 	}

-	_, err = client.VolumeEcShardsGenerate(ctx, generateReq)
+	// Step 3: Generate EC shards locally on worker
+	t.SetProgress(40.0)
+	glog.V(1).Infof("Generating EC shards locally for volume %d", t.volumeID)
+	shardFiles, err := t.performLocalECEncoding(t.workDir)
 	if err != nil {
-		return fmt.Errorf("failed to generate EC shards: %v", err)
+		return fmt.Errorf("failed to generate EC shards locally: %v", err)
 	}

+	// Step 4: Distribute shards across multiple servers (following command_ec_encode.go balance logic)
 	t.SetProgress(60.0)
-	glog.V(1).Infof("EC shards generated successfully for volume %d", t.volumeID)
-
-	// Step 3: Mount EC shards
-	glog.V(1).Infof("Mounting EC shards for volume %d", t.volumeID)
-
-	// Mount all EC shards (0-13: 10 data + 4 parity)
-	shardIds := make([]uint32, t.totalShards)
-	for i := 0; i < t.totalShards; i++ {
-		shardIds[i] = uint32(i)
-	}
-
-	mountReq := &volume_server_pb.VolumeEcShardsMountRequest{
-		VolumeId:   t.volumeID,
-		Collection: t.collection,
-		ShardIds:   shardIds,
-	}
-
-	_, err = client.VolumeEcShardsMount(ctx, mountReq)
+	glog.V(1).Infof("Distributing EC shards across multiple servers for volume %d", t.volumeID)
+	err = t.distributeEcShardsAcrossServers(shardFiles)
 	if err != nil {
-		return fmt.Errorf("failed to mount EC shards: %v", err)
-	}
-
-	t.SetProgress(80.0)
-	glog.V(1).Infof("EC shards mounted successfully for volume %d", t.volumeID)
-
-	// Step 4: Mark original volume as read-only
-	glog.V(1).Infof("Marking volume %d as read-only", t.volumeID)
-
-	readOnlyReq := &volume_server_pb.VolumeMarkReadonlyRequest{
-		VolumeId: t.volumeID,
+		return fmt.Errorf("failed to distribute EC shards: %v", err)
 	}

-	_, err = client.VolumeMarkReadonly(ctx, readOnlyReq)
+	// Step 5: Delete original volume from ALL replica locations (following command_ec_encode.go pattern)
+	t.SetProgress(90.0)
+	glog.V(1).Infof("Deleting original volume %d from all replica locations", t.volumeID)
+	err = t.deleteVolumeFromAllLocations(volumeId, volumeLocations)
 	if err != nil {
-		glog.Warningf("Failed to mark volume %d read-only: %v", t.volumeID, err)
-		// This is not a critical failure for EC encoding
+		glog.Warningf("Failed to delete original volume %d from all locations: %v (may need manual cleanup)", t.volumeID, err)
+		// This is not a critical failure - the EC encoding itself succeeded
 	}

+	// Step 6: Cleanup local files
+	t.SetProgress(95.0)
+	t.cleanup(t.workDir)
+
 	t.SetProgress(100.0)
-	glog.Infof("Successfully completed erasure coding for volume %d", t.volumeID)
+	glog.Infof("Successfully completed erasure coding with distributed shards for volume %d", t.volumeID)
 	return nil
 }

@ -759,3 +750,259 @@ func (t *Task) SetEstimatedDuration(duration time.Duration) {
 func (t *Task) Cancel() error {
 	return t.BaseTask.Cancel()
 }
+
+// collectVolumeLocations collects all server locations where a volume has replicas (following command_ec_encode.go pattern)
+func (t *Task) collectVolumeLocations(volumeId needle.VolumeId) ([]pb.ServerAddress, error) {
+	// Connect to master client to get volume locations
+	grpcAddress := pb.ServerToGrpcAddress(t.masterClient)
+	conn, err := grpc.NewClient(grpcAddress, t.grpcDialOpt)
+	if err != nil {
+		return nil, fmt.Errorf("failed to connect to master %s: %v", t.masterClient, err)
+	}
+	defer conn.Close()
+
+	client := master_pb.NewSeaweedClient(conn)
+	volumeListResp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to get volume list from master: %v", err)
+	}
+
+	var locations []pb.ServerAddress
+	// Search through all data centers, racks, and volume servers
+	for _, dc := range volumeListResp.TopologyInfo.DataCenterInfos {
+		for _, rack := range dc.RackInfos {
+			for _, dataNode := range rack.DataNodeInfos {
+				for _, diskInfo := range dataNode.DiskInfos {
+					for _, volumeInfo := range diskInfo.VolumeInfos {
+						if volumeInfo.Id == uint32(volumeId) {
+							locations = append(locations, pb.ServerAddress(dataNode.Id))
+							goto nextDataNode // Found volume on this server, move to next server
+						}
+					}
+				}
+			nextDataNode:
+			}
+		}
+	}
+
+	if len(locations) == 0 {
+		return nil, fmt.Errorf("volume %d not found on any server", volumeId)
+	}
+
+	return locations, nil
+}
+
+// markVolumeReadonlyOnAllReplicas marks volume as readonly on all replicas (following command_ec_encode.go pattern)
+func (t *Task) markVolumeReadonlyOnAllReplicas(locations []pb.ServerAddress) error {
+	// Use parallel processing like command_ec_encode.go
+	var wg sync.WaitGroup
+	errorChan := make(chan error, len(locations))
+
+	for _, location := range locations {
+		wg.Add(1)
+		go func(addr pb.ServerAddress) {
+			defer wg.Done()
+
+			grpcAddress := pb.ServerToGrpcAddress(string(addr))
+			conn, err := grpc.NewClient(grpcAddress, t.grpcDialOpt)
+			if err != nil {
+				errorChan <- fmt.Errorf("failed to connect to %s: %v", addr, err)
+				return
+			}
+			defer conn.Close()
+
+			client := volume_server_pb.NewVolumeServerClient(conn)
+			_, err = client.VolumeMarkReadonly(context.Background(), &volume_server_pb.VolumeMarkReadonlyRequest{
+				VolumeId: t.volumeID,
+			})
+			if err != nil {
+				errorChan <- fmt.Errorf("failed to mark volume %d readonly on %s: %v", t.volumeID, addr, err)
+			}
+		}(location)
+	}
+
+	wg.Wait()
+	close(errorChan)
+
+	// Check for errors
+	for err := range errorChan {
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// distributeEcShardsAcrossServers distributes EC shards following command_ec_encode.go balance logic
+func (t *Task) distributeEcShardsAcrossServers(shardFiles []string) error {
+	// Get available servers from master
+	grpcAddress := pb.ServerToGrpcAddress(t.masterClient)
+	conn, err := grpc.NewClient(grpcAddress, t.grpcDialOpt)
+	if err != nil {
+		return fmt.Errorf("failed to connect to master %s: %v", t.masterClient, err)
+	}
+	defer conn.Close()
+
+	client := master_pb.NewSeaweedClient(conn)
+	topologyResp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{})
+	if err != nil {
+		return fmt.Errorf("failed to get topology: %v", err)
+	}
+
+	// Collect available servers
+	var availableServers []pb.ServerAddress
+	for _, dc := range topologyResp.TopologyInfo.DataCenterInfos {
+		for _, rack := range dc.RackInfos {
+			for _, dataNode := range rack.DataNodeInfos {
+				// Check if server has available space for EC shards
+				for _, diskInfo := range dataNode.DiskInfos {
+					if diskInfo.FreeVolumeCount > 0 {
+						availableServers = append(availableServers, pb.ServerAddress(dataNode.Id))
+						break
+					}
+				}
+			}
+		}
+	}
+
+	if len(availableServers) < 4 {
+		return fmt.Errorf("insufficient servers for EC distribution: need at least 4, found %d", len(availableServers))
+	}
+
+	// Distribute shards across servers using round-robin
+	var wg sync.WaitGroup
+	errorChan := make(chan error, len(shardFiles))
+
+	for i, shardFile := range shardFiles {
+		wg.Add(1)
+		go func(shardIndex int, shardPath string) {
+			defer wg.Done()
+
+			// Round-robin distribution
+			targetServer := availableServers[shardIndex%len(availableServers)]
+
+			// Upload shard to target server
+			err := t.uploadShardToTargetServer(shardPath, targetServer, uint32(shardIndex))
+			if err != nil {
+				errorChan <- fmt.Errorf("failed to upload shard %d to %s: %v", shardIndex, targetServer, err)
+				return
+			}
+
+			// Mount shard on target server
+			err = t.mountShardOnServer(targetServer, uint32(shardIndex))
+			if err != nil {
+				errorChan <- fmt.Errorf("failed to mount shard %d on %s: %v", shardIndex, targetServer, err)
+			}
+		}(i, shardFile)
+	}
+
+	wg.Wait()
+	close(errorChan)
+
+	// Check for errors
+	for err := range errorChan {
+		if err != nil {
+			return err
+		}
+	}
+
+	glog.Infof("Successfully distributed %d EC shards across %d servers", len(shardFiles), len(availableServers))
+	return nil
+}
+
+// deleteVolumeFromAllLocations deletes volume from all replica locations (following command_ec_encode.go pattern)
+func (t *Task) deleteVolumeFromAllLocations(volumeId needle.VolumeId, locations []pb.ServerAddress) error {
+	if len(locations) == 0 {
+		glog.Warningf("No locations found for volume %d, skipping deletion", volumeId)
+		return nil
+	}
+
+	glog.V(1).Infof("Deleting volume %d from %d locations: %v", volumeId, len(locations), locations)
+
+	// Use parallel processing like command_ec_encode.go
+	var wg sync.WaitGroup
+	errorChan := make(chan error, len(locations))
+
+	for _, location := range locations {
+		wg.Add(1)
+		go func(addr pb.ServerAddress) {
+			defer wg.Done()
+
+			grpcAddress := pb.ServerToGrpcAddress(string(addr))
+			conn, err := grpc.NewClient(grpcAddress, t.grpcDialOpt)
+			if err != nil {
+				errorChan <- fmt.Errorf("failed to connect to %s: %v", addr, err)
+				return
+			}
+			defer conn.Close()
+
+			client := volume_server_pb.NewVolumeServerClient(conn)
+			_, err = client.VolumeDelete(context.Background(), &volume_server_pb.VolumeDeleteRequest{
+				VolumeId: uint32(volumeId),
+			})
+			if err != nil {
+				errorChan <- fmt.Errorf("failed to delete volume %d from %s: %v", volumeId, addr, err)
+				return
+			}
+
+			glog.V(1).Infof("Successfully deleted volume %d from %s", volumeId, addr)
+		}(location)
+	}
+
+	wg.Wait()
+	close(errorChan)
+
+	// Check for errors (but don't fail the whole operation for deletion errors)
+	errorCount := 0
+	for err := range errorChan {
+		if err != nil {
+			glog.Errorf("Volume deletion error: %v", err)
+			errorCount++
+		}
+	}
+
+	if errorCount > 0 {
+		return fmt.Errorf("failed to delete volume from %d locations", errorCount)
+	}
+
+	glog.Infof("Successfully deleted volume %d from all %d replica locations", volumeId, len(locations))
+	return nil
+}
+
+// uploadShardToTargetServer uploads a shard file to target server
+func (t *Task) uploadShardToTargetServer(shardFile string, targetServer pb.ServerAddress, shardId uint32) error {
+	// TODO: Implement actual shard upload using VolumeEcShardsCopy or similar mechanism
+	// For now, this is a placeholder that simulates the upload
+	glog.V(1).Infof("Uploading shard file %s (shard %d) to server %s", shardFile, shardId, targetServer)
+
+	// In a real implementation, this would:
+	// 1. Read the shard file content
+	// 2. Use VolumeEcShardsCopy or streaming upload to transfer the shard
+	// 3. Verify the upload succeeded
+
+	return nil // Placeholder - needs actual implementation
+}
+
+// mountShardOnServer mounts an EC shard on target server
+func (t *Task) mountShardOnServer(targetServer pb.ServerAddress, shardId uint32) error {
+	grpcAddress := pb.ServerToGrpcAddress(string(targetServer))
+	conn, err := grpc.NewClient(grpcAddress, t.grpcDialOpt)
+	if err != nil {
+		return fmt.Errorf("failed to connect to %s: %v", targetServer, err)
+	}
+	defer conn.Close()
+
+	client := volume_server_pb.NewVolumeServerClient(conn)
+	_, err = client.VolumeEcShardsMount(context.Background(), &volume_server_pb.VolumeEcShardsMountRequest{
+		VolumeId:   t.volumeID,
+		Collection: t.collection,
+		ShardIds:   []uint32{shardId},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to mount shard %d: %v", shardId, err)
+	}
+
+	glog.V(1).Infof("Successfully mounted shard %d on server %s", shardId, targetServer)
+	return nil
+}