diff --git a/weed/admin/task/admin_server.go b/weed/admin/task/admin_server.go deleted file mode 100644 index f5e2eaa62..000000000 --- a/weed/admin/task/admin_server.go +++ /dev/null @@ -1,699 +0,0 @@ -package task - -import ( - "fmt" - "math/rand" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TaskHistory represents task execution history -type TaskHistory struct { - entries []TaskHistoryEntry - mutex sync.RWMutex -} - -// TaskHistoryEntry represents a single task history entry -type TaskHistoryEntry struct { - TaskID string - TaskType types.TaskType - VolumeID uint32 - WorkerID string - Status types.TaskStatus - StartedAt time.Time - CompletedAt time.Time - Duration time.Duration - ErrorMessage string -} - -// NewTaskHistory creates a new task history -func NewTaskHistory() *TaskHistory { - return &TaskHistory{ - entries: make([]TaskHistoryEntry, 0), - } -} - -// AddEntry adds a new task history entry -func (th *TaskHistory) AddEntry(entry TaskHistoryEntry) { - th.mutex.Lock() - defer th.mutex.Unlock() - - th.entries = append(th.entries, entry) - - // Keep only the last 1000 entries - if len(th.entries) > 1000 { - th.entries = th.entries[len(th.entries)-1000:] - } -} - -// GetRecentEntries returns the most recent entries -func (th *TaskHistory) GetRecentEntries(limit int) []*TaskHistoryEntry { - th.mutex.RLock() - defer th.mutex.RUnlock() - - start := len(th.entries) - limit - if start < 0 { - start = 0 - } - - result := make([]*TaskHistoryEntry, len(th.entries)-start) - for i, entry := range th.entries[start:] { - entryCopy := entry - result[i] = &entryCopy - } - - return result -} - -// AdminServer manages task distribution and worker coordination -type AdminServer struct { - ID string - Config *AdminConfig - masterClient *wdclient.MasterClient - volumeStateManager *VolumeStateManager - workerRegistry *WorkerRegistry - taskQueue 
*PriorityTaskQueue - taskScheduler *TaskScheduler - taskHistory *TaskHistory - failureHandler *FailureHandler - masterSync *MasterSynchronizer - workerComm *WorkerCommunicationManager - running bool - stopCh chan struct{} - mutex sync.RWMutex - - // Task tracking - activeTasks map[string]*InProgressTask - tasksMutex sync.RWMutex -} - -// AdminConfig holds configuration for the admin server -type AdminConfig struct { - ScanInterval time.Duration - WorkerTimeout time.Duration - TaskTimeout time.Duration - MaxRetries int - ReconcileInterval time.Duration - EnableFailureRecovery bool - MaxConcurrentTasks int -} - -// NewAdminServer creates a new admin server instance -func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer { - adminServer := &AdminServer{ - ID: generateAdminServerID(), - Config: config, - masterClient: masterClient, - volumeStateManager: NewVolumeStateManager(masterClient), - workerRegistry: NewWorkerRegistry(), - taskQueue: NewPriorityTaskQueue(), - taskHistory: NewTaskHistory(), - failureHandler: NewFailureHandler(config), - activeTasks: make(map[string]*InProgressTask), - stopCh: make(chan struct{}), - } - - // Initialize components that depend on admin server - adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue) - adminServer.masterSync = NewMasterSynchronizer(masterClient, adminServer.volumeStateManager, adminServer) - adminServer.workerComm = NewWorkerCommunicationManager(adminServer) - - glog.Infof("Created admin server %s", adminServer.ID) - return adminServer -} - -// Start starts the admin server -func (as *AdminServer) Start() error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if as.running { - return nil - } - - glog.Infof("Starting admin server %s", as.ID) - - // Start components - as.masterSync.Start() - as.workerComm.Start() - - // Start background loops - go as.taskAssignmentLoop() - go as.taskMonitoringLoop() - go as.reconciliationLoop() - go 
as.metricsLoop() - - as.running = true - glog.Infof("Admin server %s started successfully", as.ID) - - return nil -} - -// Stop stops the admin server -func (as *AdminServer) Stop() { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return - } - - glog.Infof("Stopping admin server %s", as.ID) - - close(as.stopCh) - - // Stop components - as.masterSync.Stop() - as.workerComm.Stop() - - as.running = false - glog.Infof("Admin server %s stopped", as.ID) -} - -// RegisterWorker registers a new worker -func (as *AdminServer) RegisterWorker(worker *types.Worker) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - return as.workerRegistry.RegisterWorker(worker) -} - -// UnregisterWorker removes a worker -func (as *AdminServer) UnregisterWorker(workerID string) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - // Reschedule any tasks assigned to this worker - for taskID, task := range as.activeTasks { - if task.WorkerID == workerID { - glog.Warningf("Rescheduling task %s due to worker %s unregistration", taskID, workerID) - as.ReassignTask(taskID, "worker unregistration") - delete(as.activeTasks, taskID) - } - } - - return as.workerRegistry.UnregisterWorker(workerID) -} - -// UpdateWorkerHeartbeat updates worker heartbeat -func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - return as.workerRegistry.UpdateWorkerHeartbeat(workerID, status) -} - -// RequestTask handles task requests from workers -func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { - as.mutex.RLock() - defer as.mutex.RUnlock() - - if !as.running { - return nil, fmt.Errorf("admin server is not running") - } - - worker, exists := as.workerRegistry.GetWorker(workerID) - if !exists { - return nil, fmt.Errorf("worker %s not registered", workerID) - } - - // Check if 
worker has capacity - if worker.CurrentLoad >= worker.MaxConcurrent { - return nil, nil // No capacity - } - - // Get next task for this worker - task := as.taskScheduler.GetNextTask(workerID, capabilities) - if task == nil { - return nil, nil // No suitable tasks - } - - // Check if volume can be assigned (using comprehensive state management) - if !as.canAssignTask(task, workerID) { - return nil, nil // Cannot assign due to capacity or state constraints - } - - // Assign task to worker - inProgressTask := &InProgressTask{ - Task: task, - WorkerID: workerID, - StartedAt: time.Now(), - LastUpdate: time.Now(), - Progress: 0.0, - EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)), - } - - as.activeTasks[task.ID] = inProgressTask - worker.CurrentLoad++ - - // Register task impact with state manager - impact := as.createTaskImpact(task) - as.volumeStateManager.RegisterTaskImpact(task.ID, impact) - inProgressTask.VolumeReserved = true - - glog.V(1).Infof("Assigned task %s to worker %s", task.ID, workerID) - return task, nil -} - -// UpdateTaskProgress updates task progress -func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error { - as.tasksMutex.Lock() - defer as.tasksMutex.Unlock() - - inProgressTask, exists := as.activeTasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - inProgressTask.Progress = progress - inProgressTask.LastUpdate = time.Now() - - glog.V(2).Infof("Task %s progress: %.1f%%", taskID, progress) - return nil -} - -// CompleteTask marks a task as completed -func (as *AdminServer) CompleteTask(taskID string, success bool, errorMsg string) error { - as.tasksMutex.Lock() - defer as.tasksMutex.Unlock() - - inProgressTask, exists := as.activeTasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - // Remove from active tasks - delete(as.activeTasks, taskID) - - // Update worker load - if worker, exists := as.workerRegistry.GetWorker(inProgressTask.WorkerID); 
exists { - worker.CurrentLoad-- - } - - // Unregister task impact - as.volumeStateManager.UnregisterTaskImpact(taskID) - - // Record in task history - status := types.TaskStatusCompleted - if !success { - status = types.TaskStatusFailed - } - - as.taskHistory.AddEntry(TaskHistoryEntry{ - TaskID: taskID, - TaskType: inProgressTask.Task.Type, - VolumeID: inProgressTask.Task.VolumeID, - WorkerID: inProgressTask.WorkerID, - Status: status, - StartedAt: inProgressTask.StartedAt, - CompletedAt: time.Now(), - Duration: time.Since(inProgressTask.StartedAt), - ErrorMessage: errorMsg, - }) - - glog.Infof("Task %s completed: success=%v", taskID, success) - return nil -} - -// QueueTask adds a new task to the task queue -func (as *AdminServer) QueueTask(task *types.Task) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - // Validate the task - if task == nil { - return fmt.Errorf("task cannot be nil") - } - - if task.ID == "" { - task.ID = generateTaskID() - } - - // Set creation timestamp if not set - if task.CreatedAt.IsZero() { - task.CreatedAt = time.Now() - } - - // Check if task for this volume is already queued or in progress - if as.isVolumeAlreadyQueued(task.VolumeID, task.Type) { - glog.V(2).Infof("Task for volume %d already queued or in progress, skipping", task.VolumeID) - return nil - } - - // Add to task queue - as.taskQueue.Push(task) - - glog.V(1).Infof("Queued task %s (%s) for volume %d with priority %v", - task.ID, task.Type, task.VolumeID, task.Priority) - - return nil -} - -// Helper methods - -// canAssignTask checks if a task can be assigned to a worker -func (as *AdminServer) canAssignTask(task *types.Task, workerID string) bool { - worker, exists := as.workerRegistry.GetWorker(workerID) - if !exists { - return false - } - - // Check worker capacity - if worker.CurrentLoad >= worker.MaxConcurrent { - return false - } - - // Check if worker has required capability - 
hasCapability := false - for _, cap := range worker.Capabilities { - if cap == task.Type { - hasCapability = true - break - } - } - if !hasCapability { - return false - } - - return true -} - -// createTaskImpact creates a TaskImpact for the given task -func (as *AdminServer) createTaskImpact(task *types.Task) *TaskImpact { - impact := &TaskImpact{ - TaskID: task.ID, - VolumeID: task.VolumeID, - TaskType: task.Type, - StartedAt: time.Now(), - EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)), - CapacityDelta: make(map[string]int64), - VolumeChanges: &VolumeChanges{}, - ShardChanges: make(map[int]*ShardChange), - } - - // Set task-specific impacts - switch task.Type { - case types.TaskTypeErasureCoding: - impact.VolumeChanges.WillBecomeReadOnly = true - impact.EstimatedEnd = time.Now().Add(2 * time.Hour) // EC takes longer - - // EC encoding requires temporary space - if server, ok := task.Parameters["server"]; ok { - if serverStr, ok := server.(string); ok { - volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID) - if volumeState != nil && volumeState.CurrentState != nil { - // Estimate 2x volume size needed temporarily - impact.CapacityDelta[serverStr] = int64(volumeState.CurrentState.Size * 2) - } - } - } - - case types.TaskTypeVacuum: - // Vacuum reduces volume size - if server, ok := task.Parameters["server"]; ok { - if serverStr, ok := server.(string); ok { - // Estimate 30% space reclamation - volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID) - if volumeState != nil && volumeState.CurrentState != nil { - impact.CapacityDelta[serverStr] = -int64(float64(volumeState.CurrentState.Size) * 0.3) - } - } - } - } - - return impact -} - -// estimateTaskDuration estimates how long a task will take -func (as *AdminServer) estimateTaskDuration(task *types.Task) time.Duration { - switch task.Type { - case types.TaskTypeErasureCoding: - return 2 * time.Hour - case types.TaskTypeVacuum: - return 30 * time.Minute - default: - return 1 
* time.Hour - } -} - -// isVolumeAlreadyQueued checks if a task for the volume is already queued or in progress -func (as *AdminServer) isVolumeAlreadyQueued(volumeID uint32, taskType types.TaskType) bool { - // Check active tasks - as.tasksMutex.RLock() - for _, inProgressTask := range as.activeTasks { - if inProgressTask.Task.VolumeID == volumeID && inProgressTask.Task.Type == taskType { - as.tasksMutex.RUnlock() - return true - } - } - as.tasksMutex.RUnlock() - - // Check queued tasks - return as.taskQueue.HasTask(volumeID, taskType) -} - -// Background loops - -// taskAssignmentLoop handles automatic task assignment to workers -func (as *AdminServer) taskAssignmentLoop() { - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - as.processTaskAssignments() - case <-as.stopCh: - return - } - } -} - -// processTaskAssignments attempts to assign pending tasks to available workers -func (as *AdminServer) processTaskAssignments() { - // Get available workers - workers := as.workerRegistry.GetAvailableWorkers() - if len(workers) == 0 { - return // No workers available - } - - // For each worker with available capacity, try to assign a task - for _, worker := range workers { - if worker.CurrentLoad < worker.MaxConcurrent { - task := as.taskScheduler.GetNextTask(worker.ID, worker.Capabilities) - if task != nil { - // Try to assign task directly - _, err := as.RequestTask(worker.ID, worker.Capabilities) - if err != nil { - glog.Errorf("Failed to assign task to worker %s: %v", worker.ID, err) - } - } - } - } -} - -// taskMonitoringLoop monitors task progress and handles timeouts -func (as *AdminServer) taskMonitoringLoop() { - ticker := time.NewTicker(1 * time.Minute) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - as.checkTaskTimeouts() - case <-as.stopCh: - return - } - } -} - -// checkTaskTimeouts checks for tasks that have timed out -func (as *AdminServer) checkTaskTimeouts() { - as.tasksMutex.Lock() 
- defer as.tasksMutex.Unlock() - - now := time.Now() - timeout := 2 * time.Hour // Default task timeout - - for taskID, inProgressTask := range as.activeTasks { - if now.Sub(inProgressTask.LastUpdate) > timeout { - glog.Warningf("Task %s timed out (last update: %v)", taskID, inProgressTask.LastUpdate) - as.ReassignTask(taskID, "task timeout") - } - } -} - -// ReassignTask reassigns a task due to worker failure -func (as *AdminServer) ReassignTask(taskID, reason string) { - as.tasksMutex.Lock() - defer as.tasksMutex.Unlock() - - inProgressTask, exists := as.activeTasks[taskID] - if !exists { - return - } - - glog.Infof("Reassigning task %s due to: %s", taskID, reason) - - // Reset task status - inProgressTask.Task.Status = types.TaskStatusPending - - // Unregister current task impact - as.volumeStateManager.UnregisterTaskImpact(taskID) - - // Remove from active tasks - delete(as.activeTasks, taskID) - - // Put back in queue with higher priority - inProgressTask.Task.Priority = types.TaskPriorityHigh - as.taskQueue.Push(inProgressTask.Task) -} - -// reconciliationLoop periodically reconciles state with master -func (as *AdminServer) reconciliationLoop() { - ticker := time.NewTicker(5 * time.Minute) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - as.performReconciliation() - case <-as.stopCh: - return - } - } -} - -// performReconciliation reconciles admin state with master -func (as *AdminServer) performReconciliation() { - glog.V(1).Infof("Starting state reconciliation") - - // Sync with master - err := as.volumeStateManager.SyncWithMaster() - if err != nil { - glog.Errorf("Failed to sync with master during reconciliation: %v", err) - return - } - - glog.V(1).Infof("State reconciliation completed") -} - -// metricsLoop periodically logs metrics and statistics -func (as *AdminServer) metricsLoop() { - ticker := time.NewTicker(1 * time.Minute) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - as.logMetrics() - case <-as.stopCh: - return - 
} - } -} - -// logMetrics logs current system metrics -func (as *AdminServer) logMetrics() { - as.tasksMutex.RLock() - activeTasks := len(as.activeTasks) - as.tasksMutex.RUnlock() - - queuedTasks := as.taskQueue.Size() - activeWorkers := len(as.workerRegistry.GetAvailableWorkers()) - - glog.V(1).Infof("Admin server metrics: active_tasks=%d, queued_tasks=%d, active_workers=%d", - activeTasks, queuedTasks, activeWorkers) -} - -// GetAvailableWorkers returns workers capable of handling the specified task type -func (as *AdminServer) GetAvailableWorkers(taskType string) []*types.Worker { - workers := as.workerRegistry.GetAvailableWorkers() - var available []*types.Worker - - for _, worker := range workers { - if worker.CurrentLoad < worker.MaxConcurrent { - for _, cap := range worker.Capabilities { - if string(cap) == taskType { - available = append(available, worker) - break - } - } - } - } - - return available -} - -// GetSystemStats returns current system statistics -func (as *AdminServer) GetSystemStats() *SystemStats { - as.tasksMutex.RLock() - activeTasks := len(as.activeTasks) - as.tasksMutex.RUnlock() - - queuedTasks := as.taskQueue.Size() - activeWorkers := len(as.workerRegistry.GetAvailableWorkers()) - - return &SystemStats{ - ActiveTasks: activeTasks, - QueuedTasks: queuedTasks, - ActiveWorkers: activeWorkers, - TotalWorkers: len(as.workerRegistry.GetAvailableWorkers()), - Uptime: time.Since(time.Now()), // This should be tracked properly - } -} - -// Getter methods for testing -func (as *AdminServer) GetQueuedTaskCount() int { - return as.taskQueue.Size() -} - -func (as *AdminServer) GetActiveTaskCount() int { - as.tasksMutex.RLock() - defer as.tasksMutex.RUnlock() - return len(as.activeTasks) -} - -func (as *AdminServer) GetTaskHistory() []*TaskHistoryEntry { - return as.taskHistory.GetRecentEntries(100) -} - -func (as *AdminServer) GetVolumeStateManager() *VolumeStateManager { - return as.volumeStateManager -} - -func (as *AdminServer) GetWorkerRegistry() 
*WorkerRegistry { - return as.workerRegistry -} - -// generateTaskID generates a unique task ID -func generateTaskID() string { - return fmt.Sprintf("task_%d_%d", time.Now().UnixNano(), rand.Intn(10000)) -} - -// generateAdminServerID generates a unique admin server ID -func generateAdminServerID() string { - return fmt.Sprintf("admin-%d", time.Now().Unix()) -} - -// SystemStats represents system statistics -type SystemStats struct { - ActiveTasks int - QueuedTasks int - ActiveWorkers int - TotalWorkers int - Uptime time.Duration - LastMasterSync time.Time -} diff --git a/weed/admin/task/admin_server_test.go b/weed/admin/task/admin_server_test.go deleted file mode 100644 index 3862cf48d..000000000 --- a/weed/admin/task/admin_server_test.go +++ /dev/null @@ -1,524 +0,0 @@ -package task - -import ( - "fmt" - "testing" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -func TestAdminServer_TaskAssignmentWithStateManagement(t *testing.T) { - // Test the core functionality: accurate task assignment based on comprehensive state - adminServer := NewAdminServer(DefaultAdminConfig(), nil) - - // Initialize components - adminServer.workerRegistry = NewWorkerRegistry() - adminServer.taskQueue = NewPriorityTaskQueue() - adminServer.volumeStateManager = NewVolumeStateManager(nil) - adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue) - adminServer.running = true // Mark as running for test - - // Setup test worker - worker := &types.Worker{ - ID: "test_worker_1", - Address: "server1:8080", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - } - adminServer.workerRegistry.RegisterWorker(worker) - - // Setup volume state - volumeID := uint32(1) - adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ - ID: volumeID, - Size: 28 * 1024 * 1024 * 1024, // 28GB - good for EC - 
Server: "server1", - }, - InProgressTasks: []*TaskImpact{}, - PlannedChanges: []*PlannedOperation{}, - } - - // Setup server capacity - adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{ - Server: "server1", - TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB - UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used - PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used - } - - // Create EC task - task := &types.Task{ - ID: "ec_task_1", - Type: types.TaskTypeErasureCoding, - VolumeID: volumeID, - Server: "server1", - Priority: types.TaskPriorityNormal, - } - - // Test task assignment - adminServer.taskQueue.Push(task) - - assignedTask, err := adminServer.RequestTask("test_worker_1", []types.TaskType{types.TaskTypeErasureCoding}) - if err != nil { - t.Errorf("Task assignment failed: %v", err) - } - - if assignedTask == nil { - t.Fatal("Expected task to be assigned, got nil") - } - - if assignedTask.ID != "ec_task_1" { - t.Errorf("Expected task ec_task_1, got %s", assignedTask.ID) - } - - // Verify state manager was updated - if len(adminServer.volumeStateManager.inProgressTasks) != 1 { - t.Errorf("Expected 1 in-progress task in state manager, got %d", len(adminServer.volumeStateManager.inProgressTasks)) - } - - // Verify capacity reservation - capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1") - if capacity.ReservedCapacity <= 0 { - t.Error("Expected capacity to be reserved for EC task") - } - - t.Log("✅ Task assignment with state management test passed") -} - -func TestAdminServer_CanAssignTask(t *testing.T) { - adminServer := NewAdminServer(DefaultAdminConfig(), nil) - adminServer.volumeStateManager = NewVolumeStateManager(nil) - adminServer.inProgressTasks = make(map[string]*InProgressTask) - - // Setup volume state - volumeID := uint32(1) - adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ - ID: volumeID, - Size: 25 * 1024 * 1024 * 1024, // 
25GB - }, - } - - // Setup server capacity - limited space - serverID := "server1" - adminServer.volumeStateManager.capacityCache[serverID] = &CapacityInfo{ - Server: serverID, - TotalCapacity: 30 * 1024 * 1024 * 1024, // 30GB total - UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used - PredictedUsage: 20 * 1024 * 1024 * 1024, // 10GB available - } - - worker := &types.Worker{ - ID: "worker1", - Address: serverID, - } - - tests := []struct { - name string - taskType types.TaskType - expected bool - desc string - }{ - { - name: "EC task fits", - taskType: types.TaskTypeErasureCoding, - expected: false, // 25GB * 1.4 = 35GB needed, but only 10GB available - desc: "EC task should not fit due to insufficient capacity", - }, - { - name: "Vacuum task fits", - taskType: types.TaskTypeVacuum, - expected: true, - desc: "Vacuum task should fit (no capacity increase)", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - task := &types.Task{ - ID: "test_task", - Type: tt.taskType, - VolumeID: volumeID, - Server: serverID, - } - - result := adminServer.canAssignTask(task, worker) - if result != tt.expected { - t.Errorf("canAssignTask() = %v, want %v. 
%s", result, tt.expected, tt.desc) - } - }) - } -} - -func TestAdminServer_CreateTaskImpact(t *testing.T) { - adminServer := NewAdminServer(DefaultAdminConfig(), nil) - adminServer.volumeStateManager = NewVolumeStateManager(nil) - - // Setup volume state for EC task - volumeID := uint32(1) - adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ - ID: volumeID, - Size: 25 * 1024 * 1024 * 1024, // 25GB - }, - } - - task := &types.Task{ - ID: "ec_task_1", - Type: types.TaskTypeErasureCoding, - VolumeID: volumeID, - Server: "server1", - } - - impact := adminServer.createTaskImpact(task, "worker1") - - // Verify impact structure - if impact.TaskID != "ec_task_1" { - t.Errorf("Expected task ID ec_task_1, got %s", impact.TaskID) - } - - if impact.TaskType != types.TaskTypeErasureCoding { - t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, impact.TaskType) - } - - // Verify volume changes for EC task - if !impact.VolumeChanges.WillBecomeReadOnly { - t.Error("Expected volume to become read-only after EC") - } - - // Verify capacity delta (EC should require ~40% more space) - expectedCapacity := int64(float64(25*1024*1024*1024) * 1.4) // ~35GB - actualCapacity := impact.CapacityDelta["server1"] - if actualCapacity != expectedCapacity { - t.Errorf("Expected capacity delta %d, got %d", expectedCapacity, actualCapacity) - } - - // Verify shard changes (should plan 14 shards) - if len(impact.ShardChanges) != 14 { - t.Errorf("Expected 14 shard changes, got %d", len(impact.ShardChanges)) - } - - for i := 0; i < 14; i++ { - shardChange := impact.ShardChanges[i] - if shardChange == nil { - t.Errorf("Missing shard change for shard %d", i) - continue - } - - if !shardChange.WillBeCreated { - t.Errorf("Shard %d should be marked for creation", i) - } - } - - t.Log("✅ Task impact creation test passed") -} - -func TestAdminServer_TaskCompletionStateCleanup(t *testing.T) { - adminServer := 
NewAdminServer(DefaultAdminConfig(), nil) - adminServer.workerRegistry = NewWorkerRegistry() - adminServer.volumeStateManager = NewVolumeStateManager(nil) - adminServer.inProgressTasks = make(map[string]*InProgressTask) - - // Setup worker - worker := &types.Worker{ - ID: "worker1", - CurrentLoad: 1, // Has 1 task assigned - } - adminServer.workerRegistry.RegisterWorker(worker) - - // Setup in-progress task - task := &types.Task{ - ID: "test_task_1", - Type: types.TaskTypeVacuum, - VolumeID: 1, - } - - inProgressTask := &InProgressTask{ - Task: task, - WorkerID: "worker1", - VolumeReserved: true, - } - adminServer.inProgressTasks["test_task_1"] = inProgressTask - - // Register impact in state manager - impact := &TaskImpact{ - TaskID: "test_task_1", - VolumeID: 1, - CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings - } - adminServer.volumeStateManager.RegisterTaskImpact("test_task_1", impact) - - // Complete the task - err := adminServer.CompleteTask("test_task_1", true, "") - if err != nil { - t.Errorf("Task completion failed: %v", err) - } - - // Verify cleanup - if len(adminServer.inProgressTasks) != 0 { - t.Errorf("Expected 0 in-progress tasks after completion, got %d", len(adminServer.inProgressTasks)) - } - - // Verify worker load updated - updatedWorker, _ := adminServer.workerRegistry.GetWorker("worker1") - if updatedWorker.CurrentLoad != 0 { - t.Errorf("Expected worker load 0 after task completion, got %d", updatedWorker.CurrentLoad) - } - - // Verify state manager cleaned up - if len(adminServer.volumeStateManager.inProgressTasks) != 0 { - t.Errorf("Expected 0 tasks in state manager after completion, got %d", len(adminServer.volumeStateManager.inProgressTasks)) - } - - t.Log("✅ Task completion state cleanup test passed") -} - -func TestAdminServer_PreventDuplicateTaskAssignment(t *testing.T) { - adminServer := NewAdminServer(DefaultAdminConfig(), nil) - adminServer.workerRegistry = NewWorkerRegistry() - adminServer.taskQueue 
= NewPriorityTaskQueue() - adminServer.volumeStateManager = NewVolumeStateManager(nil) - adminServer.inProgressTasks = make(map[string]*InProgressTask) - - // Setup worker - worker := &types.Worker{ - ID: "worker1", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - } - adminServer.workerRegistry.RegisterWorker(worker) - - // Setup volume state - volumeID := uint32(1) - adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024}, - } - - // Create first task and assign it - task1 := &types.Task{ - ID: "vacuum_task_1", - Type: types.TaskTypeVacuum, - VolumeID: volumeID, - Priority: types.TaskPriorityNormal, - } - - adminServer.taskQueue.Push(task1) - assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil || assignedTask1 == nil { - t.Fatal("First task assignment failed") - } - - // Try to assign another vacuum task for the same volume - task2 := &types.Task{ - ID: "vacuum_task_2", - Type: types.TaskTypeVacuum, - VolumeID: volumeID, // Same volume! 
- Priority: types.TaskPriorityNormal, - } - - adminServer.taskQueue.Push(task2) - assignedTask2, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum}) - - // Should not assign duplicate task - if assignedTask2 != nil { - t.Error("Should not assign duplicate vacuum task for same volume") - } - - t.Log("✅ Duplicate task prevention test passed") -} - -func TestAdminServer_SystemStats(t *testing.T) { - adminServer := NewAdminServer(DefaultAdminConfig(), nil) - adminServer.workerRegistry = NewWorkerRegistry() - adminServer.taskQueue = NewPriorityTaskQueue() - adminServer.volumeStateManager = NewVolumeStateManager(nil) - adminServer.inProgressTasks = make(map[string]*InProgressTask) - adminServer.running = true - - // Add some test data - worker := &types.Worker{ID: "worker1", Status: "active"} - adminServer.workerRegistry.RegisterWorker(worker) - - task := &types.Task{ID: "task1", Type: types.TaskTypeErasureCoding} - adminServer.taskQueue.Push(task) - - inProgressTask := &InProgressTask{ - Task: &types.Task{ID: "task2", Type: types.TaskTypeVacuum}, - } - adminServer.inProgressTasks["task2"] = inProgressTask - - // Get system stats - stats := adminServer.GetSystemStats() - - // Verify stats structure - if !stats["running"].(bool) { - t.Error("Expected running to be true") - } - - if stats["in_progress_tasks"].(int) != 1 { - t.Errorf("Expected 1 in-progress task, got %d", stats["in_progress_tasks"].(int)) - } - - if stats["queued_tasks"].(int) != 1 { - t.Errorf("Expected 1 queued task, got %d", stats["queued_tasks"].(int)) - } - - // Check task breakdown - tasksByType := stats["tasks_by_type"].(map[types.TaskType]int) - if tasksByType[types.TaskTypeVacuum] != 1 { - t.Errorf("Expected 1 vacuum task, got %d", tasksByType[types.TaskTypeVacuum]) - } - - t.Log("✅ System stats test passed") -} - -func TestAdminServer_VolumeStateIntegration(t *testing.T) { - // Integration test: Verify admin server correctly uses volume state for decisions - 
adminServer := NewAdminServer(DefaultAdminConfig(), nil) - adminServer.workerRegistry = NewWorkerRegistry() - adminServer.taskQueue = NewPriorityTaskQueue() - adminServer.volumeStateManager = NewVolumeStateManager(nil) - adminServer.inProgressTasks = make(map[string]*InProgressTask) - - // Setup worker - worker := &types.Worker{ - ID: "worker1", - Address: "server1", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - } - adminServer.workerRegistry.RegisterWorker(worker) - - // Setup volume and capacity that would normally allow EC - volumeID := uint32(1) - adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ - ID: volumeID, - Size: 25 * 1024 * 1024 * 1024, // 25GB - Server: "server1", - }, - } - - adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{ - Server: "server1", - TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB - UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used - PredictedUsage: 20 * 1024 * 1024 * 1024, // 80GB available - } - - // Create EC task - task := &types.Task{ - ID: "ec_task_1", - Type: types.TaskTypeErasureCoding, - VolumeID: volumeID, - Server: "server1", - } - - adminServer.taskQueue.Push(task) - - // First assignment should work - assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeErasureCoding}) - if err != nil || assignedTask1 == nil { - t.Fatal("First EC task assignment should succeed") - } - - // Verify capacity is now reserved - capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1") - if capacity.ReservedCapacity <= 0 { - t.Error("Expected capacity to be reserved for first EC task") - } - - // Try to assign another large EC task - should fail due to capacity - volumeID2 := uint32(2) - adminServer.volumeStateManager.volumes[volumeID2] = &VolumeState{ - VolumeID: volumeID2, - CurrentState: &VolumeInfo{ - ID: volumeID2, - Size: 30 
* 1024 * 1024 * 1024, // 30GB - would need 42GB for EC - Server: "server1", - }, - } - - task2 := &types.Task{ - ID: "ec_task_2", - Type: types.TaskTypeErasureCoding, - VolumeID: volumeID2, - Server: "server1", - } - - adminServer.taskQueue.Push(task2) - - // Add another worker to test capacity-based rejection - worker2 := &types.Worker{ - ID: "worker2", - Address: "server1", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - } - adminServer.workerRegistry.RegisterWorker(worker2) - - assignedTask2, err := adminServer.RequestTask("worker2", []types.TaskType{types.TaskTypeErasureCoding}) - - // Should not assign due to insufficient capacity - if assignedTask2 != nil { - t.Error("Should not assign second EC task due to insufficient server capacity") - } - - t.Log("✅ Volume state integration test passed") - t.Log("✅ Admin server correctly uses comprehensive state for task assignment decisions") -} - -// Benchmark for task assignment performance -func BenchmarkAdminServer_RequestTask(b *testing.B) { - adminServer := NewAdminServer(DefaultAdminConfig(), nil) - adminServer.workerRegistry = NewWorkerRegistry() - adminServer.taskQueue = NewPriorityTaskQueue() - adminServer.volumeStateManager = NewVolumeStateManager(nil) - adminServer.inProgressTasks = make(map[string]*InProgressTask) - - // Setup worker - worker := &types.Worker{ - ID: "bench_worker", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 1000, // High limit for benchmark - Status: "active", - CurrentLoad: 0, - } - adminServer.workerRegistry.RegisterWorker(worker) - - // Setup many tasks - for i := 0; i < 1000; i++ { - volumeID := uint32(i + 1) - adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024}, - } - - task := &types.Task{ - ID: fmt.Sprintf("task_%d", i), - Type: types.TaskTypeVacuum, - VolumeID: volumeID, - } - 
adminServer.taskQueue.Push(task) - } - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - adminServer.RequestTask("bench_worker", []types.TaskType{types.TaskTypeVacuum}) - } -} diff --git a/weed/admin/task/compilation_stubs.go b/weed/admin/task/compilation_stubs.go deleted file mode 100644 index 2c90361dd..000000000 --- a/weed/admin/task/compilation_stubs.go +++ /dev/null @@ -1,90 +0,0 @@ -package task - -import ( - "time" - - "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// Compilation stubs for missing types and functions - -// Task is an alias for types.Task for backward compatibility -type Task = types.Task - -// TaskType is an alias for types.TaskType for backward compatibility -type TaskType = types.TaskType - -// TaskStatus is an alias for types.TaskStatus for backward compatibility -type TaskStatus = types.TaskStatus - -// TaskPriority is an alias for types.TaskPriority for backward compatibility -type TaskPriority = types.TaskPriority - -// Additional type aliases for compilation -var ( - TaskStatusCompleted = types.TaskStatusCompleted - TaskStatusFailed = types.TaskStatusFailed -) - -// Worker represents a worker node -type Worker struct { - ID string - Address string - Capabilities []string - Status string - LastSeen time.Time -} - -// convertAdminToWorkerMessage converts AdminMessage to WorkerMessage for stream compatibility -func convertAdminToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage { - // This is a workaround for the stream type mismatch - // In a real implementation, this would need proper message conversion - return &worker_pb.WorkerMessage{ - WorkerId: msg.AdminId, - Timestamp: msg.Timestamp, - // Add basic message conversion logic here - } -} - -// WorkerRegistry stub methods -func (wr *WorkerRegistry) UpdateWorkerStatus(workerID string, status interface{}) { - // Stub implementation -} - -// AdminServer stub methods -func (as *AdminServer) 
AssignTaskToWorker(workerID string) *Task { - // Stub implementation - return nil -} - -// DefaultAdminConfig returns default admin server configuration -func DefaultAdminConfig() *AdminConfig { - return &AdminConfig{ - ScanInterval: 30 * time.Minute, - WorkerTimeout: 5 * time.Minute, - TaskTimeout: 10 * time.Minute, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 10, - } -} - -// SyncWithMasterData is a stub for the volume state manager -func (vsm *VolumeStateManager) SyncWithMasterData(volumes map[uint32]*VolumeInfo, ecShards map[uint32]map[int]*ShardInfo, serverCapacity map[string]*CapacityInfo) error { - // Stub implementation - would normally sync the data - return nil -} - -// GetAllVolumeStates is a stub for the volume state manager -func (vsm *VolumeStateManager) GetAllVolumeStates() map[uint32]*VolumeState { - // Stub implementation - return empty map - return make(map[uint32]*VolumeState) -} - -// DetectInconsistencies is a stub for the volume state manager -func (vsm *VolumeStateManager) DetectInconsistencies() []StateInconsistency { - // Stub implementation - return empty slice - return []StateInconsistency{} -} diff --git a/weed/admin/task/ec_integration_test.go b/weed/admin/task/ec_integration_test.go deleted file mode 100644 index d614495c0..000000000 --- a/weed/admin/task/ec_integration_test.go +++ /dev/null @@ -1,309 +0,0 @@ -package task - -import ( - "os" - "path/filepath" - "testing" - "time" - - ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestECIntegration tests the EC implementation with the admin server -func TestECIntegration(t *testing.T) { - t.Logf("Starting EC integration test") - - // Step 1: Create admin server - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 30 * time.Minute, // EC takes longer - MaxRetries: 3, - 
ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 2, // Limit concurrency for EC tasks - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Step 2: Register an EC-capable worker - worker := &types.Worker{ - ID: "ec-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register EC worker: %v", err) - } - t.Logf("Successfully registered EC worker %s", worker.ID) - - // Step 3: Create an EC task - ecTask := &types.Task{ - ID: "ec-task-1", - Type: types.TaskTypeErasureCoding, - VolumeID: 12345, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityHigh, - Parameters: map[string]interface{}{ - "volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB - "master_client": "localhost:9333", - "work_dir": "/tmp/seaweedfs_ec_work", - "collection": "test", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(ecTask) - if err != nil { - t.Fatalf("Failed to queue EC task: %v", err) - } - t.Logf("Successfully queued EC task %s for volume %d", ecTask.ID, ecTask.VolumeID) - - // Step 4: Worker requests the task - assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) - if err != nil { - t.Fatalf("Failed to request EC task: %v", err) - } - - if assignedTask != nil { - t.Logf("EC worker got task: %s (%s) for volume %d", - assignedTask.ID, assignedTask.Type, assignedTask.VolumeID) - - // Step 5: Simulate EC task execution phases - t.Logf("Simulating EC task execution phases") - - // Phase 1: Copying volume data - err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0) - if err != nil 
{ - t.Errorf("Failed to update progress (copying): %v", err) - } - t.Logf("Phase 1: Volume data copied to local disk") - - // Phase 2: Marking read-only - err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) - if err != nil { - t.Errorf("Failed to update progress (read-only): %v", err) - } - t.Logf("Phase 2: Source volume marked as read-only") - - // Phase 3: Local EC encoding - err = adminServer.UpdateTaskProgress(assignedTask.ID, 60.0) - if err != nil { - t.Errorf("Failed to update progress (encoding): %v", err) - } - t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)") - - // Phase 4: Calculating optimal placement - err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0) - if err != nil { - t.Errorf("Failed to update progress (placement): %v", err) - } - t.Logf("Phase 4: Optimal shard placement calculated with affinity") - - // Phase 5: Distributing shards - err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0) - if err != nil { - t.Errorf("Failed to update progress (distribution): %v", err) - } - t.Logf("Phase 5: Shards distributed across servers with rack diversity") - - // Phase 6: Verification and cleanup - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress (completion): %v", err) - } - t.Logf("Phase 6: Verification and cleanup completed") - - // Step 6: Complete the task - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete EC task: %v", err) - } - t.Logf("Successfully completed EC task %s", assignedTask.ID) - } else { - t.Logf("No EC task was assigned (expected in test environment)") - } - - // Step 7: Verify task completion - stats := adminServer.GetSystemStats() - t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - - history := adminServer.GetTaskHistory() - t.Logf("Task history 
contains %d completed tasks", len(history)) - - if len(history) > 0 { - lastEntry := history[len(history)-1] - t.Logf("Last completed task: %s (%s) - Duration: %v", - lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration) - - if lastEntry.TaskType == types.TaskTypeErasureCoding { - t.Logf("EC task completed successfully") - } - } - - t.Logf("EC integration test completed successfully") -} - -// TestECTaskValidation tests the EC task validation -func TestECTaskValidation(t *testing.T) { - t.Logf("Testing EC task validation") - - // Create a temporary work directory - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test") - err := os.MkdirAll(workDir, 0755) - if err != nil { - t.Fatalf("Failed to create work directory: %v", err) - } - defer os.RemoveAll(workDir) - - // Create EC task - ecTask := ec_task.NewTaskWithParams( - "localhost:8080", // source server - 12345, // volume ID - "localhost:9333", // master client - workDir, // work directory - ) - - // Test validation with valid parameters - validParams := types.TaskParams{ - VolumeID: 12345, - Server: "localhost:8080", - Collection: "test", - Parameters: map[string]interface{}{ - "volume_size": int64(32 * 1024 * 1024 * 1024), - }, - } - - err = ecTask.Validate(validParams) - if err != nil { - t.Errorf("Valid parameters should pass validation: %v", err) - } - - // Test validation with invalid parameters - invalidParams := types.TaskParams{ - VolumeID: 0, // Invalid volume ID - Server: "", // Empty server - } - - err = ecTask.Validate(invalidParams) - if err == nil { - t.Errorf("Invalid parameters should fail validation") - } - - // Test time estimation - estimatedTime := ecTask.EstimateTime(validParams) - t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime) - - if estimatedTime < 20*time.Minute { - t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime) - } - - t.Logf("EC task validation completed successfully") -} - -// TestECFeatures tests specific EC features -func 
TestECFeatures(t *testing.T) { - t.Logf("Testing EC features") - - // Create temporary work directory - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test") - err := os.MkdirAll(workDir, 0755) - if err != nil { - t.Fatalf("Failed to create work directory: %v", err) - } - defer os.RemoveAll(workDir) - - ecTask := ec_task.NewTaskWithParams( - "localhost:8080", - 54321, - "localhost:9333", - workDir, - ) - - // Test step tracking - t.Logf("Testing step tracking functionality") - - currentStep := ecTask.GetCurrentStep() - t.Logf("Initial current step: %s", currentStep) - - progress := ecTask.GetProgress() - t.Logf("Initial progress: %.1f%%", progress) - - // Test parameter extraction - params := types.TaskParams{ - VolumeID: 54321, - Server: "localhost:8080", - Collection: "features_test", - Parameters: map[string]interface{}{ - "volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB - "data_shards": 10, - "parity_shards": 4, - "affinity_zones": []string{"zone-a", "zone-b", "zone-c"}, - }, - } - - estimatedTime := ecTask.EstimateTime(params) - expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB - - t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime) - - if estimatedTime < expectedMinTime { - t.Errorf("Time estimate seems too low for 64GB volume") - } - - t.Logf("EC features test completed successfully") -} - -// TestECTaskComparison tests EC implementation features -func TestECTaskComparison(t *testing.T) { - t.Logf("Testing EC implementation features") - - // EC task estimation - params := types.TaskParams{ - VolumeID: 11111, - Server: "localhost:8080", - Parameters: map[string]interface{}{ - "volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB - }, - } - - // Create task - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison") - defer os.RemoveAll(workDir) - - ecTask := ec_task.NewTaskWithParams( - "localhost:8080", - 22222, - "localhost:9333", - workDir, - ) - estimatedTime 
:= ecTask.EstimateTime(params) - - t.Logf("EC task estimated time: %v", estimatedTime) - - // Test feature capabilities - t.Logf("EC implementation features:") - t.Logf(" - Local volume data copying with progress tracking") - t.Logf(" - Local Reed-Solomon encoding (10+4 shards)") - t.Logf(" - Intelligent shard placement with rack awareness") - t.Logf(" - Load balancing across available servers") - t.Logf(" - Backup server selection for redundancy") - t.Logf(" - Detailed step-by-step progress tracking") - t.Logf(" - Comprehensive error handling and recovery") - - t.Logf("EC implementation test completed successfully") -} diff --git a/weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go b/weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go deleted file mode 100644 index 37132d858..000000000 --- a/weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go +++ /dev/null @@ -1,324 +0,0 @@ -package task - -import ( - "os" - "path/filepath" - "testing" - "time" - - ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestEnhancedECIntegration tests the enhanced EC implementation with the admin server -func TestEnhancedECIntegration(t *testing.T) { - t.Logf("Starting enhanced EC integration test") - - // Step 1: Create admin server - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 30 * time.Minute, // EC takes longer - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 2, // Limit concurrency for EC tasks - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Step 2: Register an EC-capable worker - worker := &types.Worker{ - ID: "ec-worker-1", - Address: "localhost:9001", - Capabilities: 
[]types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register EC worker: %v", err) - } - t.Logf("Successfully registered EC worker %s", worker.ID) - - // Step 3: Create an EC task - ecTask := &types.Task{ - ID: "enhanced-ec-task-1", - Type: types.TaskTypeErasureCoding, - VolumeID: 12345, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityHigh, - Parameters: map[string]interface{}{ - "volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB - "master_client": "localhost:9333", - "work_dir": "/tmp/seaweedfs_ec_work", - "collection": "test", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(ecTask) - if err != nil { - t.Fatalf("Failed to queue EC task: %v", err) - } - t.Logf("Successfully queued enhanced EC task %s for volume %d", ecTask.ID, ecTask.VolumeID) - - // Step 4: Worker requests the task - assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) - if err != nil { - t.Fatalf("Failed to request EC task: %v", err) - } - - if assignedTask != nil { - t.Logf("EC worker got task: %s (%s) for volume %d", - assignedTask.ID, assignedTask.Type, assignedTask.VolumeID) - - // Step 5: Simulate enhanced EC task execution progress - t.Logf("Simulating enhanced EC task execution phases") - - // Phase 1: Copying volume data - err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0) - if err != nil { - t.Errorf("Failed to update progress (copying): %v", err) - } - t.Logf("Phase 1: Volume data copied to local disk") - - // Phase 2: Marking read-only - err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) - if err != nil { - t.Errorf("Failed to update progress (read-only): %v", err) - } - t.Logf("Phase 2: Source volume marked as read-only") - - // Phase 3: Local EC encoding - err = 
adminServer.UpdateTaskProgress(assignedTask.ID, 60.0) - if err != nil { - t.Errorf("Failed to update progress (encoding): %v", err) - } - t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)") - - // Phase 4: Calculating optimal placement - err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0) - if err != nil { - t.Errorf("Failed to update progress (placement): %v", err) - } - t.Logf("Phase 4: Optimal shard placement calculated with affinity") - - // Phase 5: Distributing shards - err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0) - if err != nil { - t.Errorf("Failed to update progress (distribution): %v", err) - } - t.Logf("Phase 5: Shards distributed across servers with rack diversity") - - // Phase 6: Verification and cleanup - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress (completion): %v", err) - } - t.Logf("Phase 6: Verification and cleanup completed") - - // Step 6: Complete the task - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete EC task: %v", err) - } - t.Logf("Successfully completed enhanced EC task %s", assignedTask.ID) - } else { - t.Logf("No EC task was assigned (expected in test environment)") - } - - // Step 7: Verify task completion - stats := adminServer.GetSystemStats() - t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d completed tasks", len(history)) - - if len(history) > 0 { - lastEntry := history[len(history)-1] - t.Logf("Last completed task: %s (%s) - Duration: %v", - lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration) - - if lastEntry.TaskType == types.TaskTypeErasureCoding { - t.Logf("Enhanced EC task completed successfully") - } - } - - t.Logf("Enhanced EC integration test 
completed successfully") -} - -// TestEnhancedECTaskValidation tests the enhanced EC task validation -func TestEnhancedECTaskValidation(t *testing.T) { - t.Logf("Testing enhanced EC task validation") - - // Create a temporary work directory - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test") - err := os.MkdirAll(workDir, 0755) - if err != nil { - t.Fatalf("Failed to create work directory: %v", err) - } - defer os.RemoveAll(workDir) - - // Create enhanced EC task - enhancedTask := ec_task.NewEnhancedECTask( - "localhost:8080", // source server - 12345, // volume ID - "localhost:9333", // master client - workDir, // work directory - ) - - // Test validation with valid parameters - validParams := types.TaskParams{ - VolumeID: 12345, - Server: "localhost:8080", - Collection: "test", - Parameters: map[string]interface{}{ - "volume_size": int64(32 * 1024 * 1024 * 1024), - }, - } - - err = enhancedTask.Validate(validParams) - if err != nil { - t.Errorf("Valid parameters should pass validation: %v", err) - } - - // Test validation with invalid parameters - invalidParams := types.TaskParams{ - VolumeID: 0, // Invalid volume ID - Server: "", // Empty server - } - - err = enhancedTask.Validate(invalidParams) - if err == nil { - t.Errorf("Invalid parameters should fail validation") - } - - // Test time estimation - estimatedTime := enhancedTask.EstimateTime(validParams) - t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime) - - if estimatedTime < 20*time.Minute { - t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime) - } - - t.Logf("Enhanced EC task validation completed successfully") -} - -// TestEnhancedECFeatures tests specific enhanced EC features -func TestEnhancedECFeatures(t *testing.T) { - t.Logf("Testing enhanced EC features") - - // Create temporary work directory - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test") - err := os.MkdirAll(workDir, 0755) - if err != nil { - t.Fatalf("Failed to create 
work directory: %v", err) - } - defer os.RemoveAll(workDir) - - enhancedTask := ec_task.NewEnhancedECTask( - "localhost:8080", - 54321, - "localhost:9333", - workDir, - ) - - // Test step tracking - t.Logf("Testing step tracking functionality") - - currentStep := enhancedTask.GetCurrentStep() - t.Logf("Initial current step: %s", currentStep) - - progress := enhancedTask.GetProgress() - t.Logf("Initial progress: %.1f%%", progress) - - // Test parameter extraction - params := types.TaskParams{ - VolumeID: 54321, - Server: "localhost:8080", - Collection: "enhanced_test", - Parameters: map[string]interface{}{ - "volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB - "data_shards": 10, - "parity_shards": 4, - "affinity_zones": []string{"zone-a", "zone-b", "zone-c"}, - }, - } - - estimatedTime := enhancedTask.EstimateTime(params) - expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB - - t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime) - - if estimatedTime < expectedMinTime { - t.Errorf("Time estimate seems too low for 64GB volume") - } - - t.Logf("Enhanced EC features test completed successfully") -} - -// TestECTaskComparison compares basic vs enhanced EC implementations -func TestECTaskComparison(t *testing.T) { - t.Logf("Comparing basic vs enhanced EC implementations") - - // Basic EC task estimation - basicParams := types.TaskParams{ - VolumeID: 11111, - Server: "localhost:8080", - Parameters: map[string]interface{}{ - "volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB - }, - } - - // Create basic task (existing implementation) - basicTask := ec_task.NewTask("localhost:8080", 11111) - basicTime := basicTask.EstimateTime(basicParams) - - // Create enhanced task - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison") - defer os.RemoveAll(workDir) - - enhancedTask := ec_task.NewEnhancedECTask( - "localhost:8080", - 22222, - "localhost:9333", - workDir, - ) - enhancedTime := 
enhancedTask.EstimateTime(basicParams) - - t.Logf("Basic EC task estimated time: %v", basicTime) - t.Logf("Enhanced EC task estimated time: %v", enhancedTime) - - // Enhanced should take longer due to additional processing - if enhancedTime <= basicTime { - t.Logf("Note: Enhanced EC might take longer due to local processing and smart distribution") - } - - // Test feature differences - t.Logf("Basic EC features:") - t.Logf(" - Direct volume server EC generation") - t.Logf(" - Simple shard mounting") - t.Logf(" - No custom placement logic") - - t.Logf("Enhanced EC features:") - t.Logf(" - Local volume data copying") - t.Logf(" - Local Reed-Solomon encoding") - t.Logf(" - Intelligent shard placement with affinity") - t.Logf(" - Rack diversity for data shards") - t.Logf(" - Load balancing across servers") - t.Logf(" - Backup server selection") - t.Logf(" - Detailed progress tracking") - - t.Logf("EC task comparison completed successfully") -} diff --git a/weed/admin/task/ec_test_standalone/go.mod b/weed/admin/task/ec_test_standalone/go.mod deleted file mode 100644 index 8c09ecf5c..000000000 --- a/weed/admin/task/ec_test_standalone/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module ec_test - -go 1.24.1 diff --git a/weed/admin/task/ec_test_standalone/minimal_admin_server.go b/weed/admin/task/ec_test_standalone/minimal_admin_server.go deleted file mode 100644 index d7dbfcd96..000000000 --- a/weed/admin/task/ec_test_standalone/minimal_admin_server.go +++ /dev/null @@ -1,324 +0,0 @@ -package task - -import ( - "fmt" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// MinimalAdminConfig contains configuration for the minimal admin server -type MinimalAdminConfig struct { - ScanInterval time.Duration - WorkerTimeout time.Duration - TaskTimeout time.Duration - MaxRetries int - ReconcileInterval time.Duration - EnableFailureRecovery bool - MaxConcurrentTasks int -} - -// MinimalAdminServer manages workers 
and tasks with a simple implementation -type MinimalAdminServer struct { - config *MinimalAdminConfig - masterClient *wdclient.MasterClient - running bool - mutex sync.RWMutex - - // Task management - tasks map[string]*types.Task - taskQueue []*types.Task - activeTasks map[string]*types.Task - - // Worker management - workers map[string]*types.Worker - workerStatus map[string]*types.WorkerStatus - - // Task history - taskHistory []MinimalTaskHistoryEntry -} - -// MinimalTaskHistoryEntry represents a single task history entry -type MinimalTaskHistoryEntry struct { - TaskID string - TaskType types.TaskType - VolumeID uint32 - WorkerID string - Status types.TaskStatus - StartedAt time.Time - CompletedAt time.Time - Duration time.Duration - ErrorMessage string -} - -// MinimalSystemStats represents system statistics -type MinimalSystemStats struct { - ActiveTasks int - QueuedTasks int - ActiveWorkers int - TotalTasks int -} - -// NewMinimalAdminServer creates a new minimal admin server -func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer { - return &MinimalAdminServer{ - config: config, - masterClient: masterClient, - tasks: make(map[string]*types.Task), - taskQueue: make([]*types.Task, 0), - activeTasks: make(map[string]*types.Task), - workers: make(map[string]*types.Worker), - workerStatus: make(map[string]*types.WorkerStatus), - taskHistory: make([]MinimalTaskHistoryEntry, 0), - } -} - -// Start starts the minimal admin server -func (as *MinimalAdminServer) Start() error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if as.running { - return fmt.Errorf("admin server is already running") - } - - as.running = true - return nil -} - -// Stop stops the minimal admin server -func (as *MinimalAdminServer) Stop() error { - as.mutex.Lock() - defer as.mutex.Unlock() - - as.running = false - return nil -} - -// RegisterWorker registers a new worker -func (as *MinimalAdminServer) RegisterWorker(worker 
*types.Worker) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - as.workers[worker.ID] = worker - as.workerStatus[worker.ID] = &types.WorkerStatus{ - Status: "active", - CurrentLoad: 0, - } - - return nil -} - -// QueueTask adds a new task to the task queue -func (as *MinimalAdminServer) QueueTask(task *types.Task) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - if task.ID == "" { - task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano()) - } - - task.Status = types.TaskStatusPending - task.CreatedAt = time.Now() - - as.tasks[task.ID] = task - as.taskQueue = append(as.taskQueue, task) - - return nil -} - -// RequestTask requests a task for a worker -func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return nil, fmt.Errorf("admin server is not running") - } - - // Check if worker exists - worker, exists := as.workers[workerID] - if !exists { - return nil, fmt.Errorf("worker %s not found", workerID) - } - - // Check if worker has capacity - status := as.workerStatus[workerID] - if status.CurrentLoad >= worker.MaxConcurrent { - return nil, nil // No capacity - } - - // Find a suitable task - for i, task := range as.taskQueue { - if task.Status != types.TaskStatusPending { - continue - } - - // Check if worker can handle this task type - canHandle := false - for _, capability := range capabilities { - if task.Type == capability { - canHandle = true - break - } - } - - if canHandle { - // Assign task to worker - task.Status = types.TaskStatusInProgress - task.WorkerID = workerID - now := time.Now() - task.StartedAt = &now - - // Move task from queue to active tasks - as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...) 
- as.activeTasks[task.ID] = task - - // Update worker load - status.CurrentLoad++ - - return task, nil - } - } - - return nil, nil // No suitable task found -} - -// UpdateTaskProgress updates task progress -func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - task, exists := as.tasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - task.Progress = progress - - return nil -} - -// CompleteTask marks a task as completed -func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - task, exists := as.tasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - // Update task status - if success { - task.Status = types.TaskStatusCompleted - } else { - task.Status = types.TaskStatusFailed - task.Error = errorMessage - } - - now := time.Now() - task.CompletedAt = &now - - // Remove from active tasks - delete(as.activeTasks, taskID) - - // Update worker load - if task.WorkerID != "" { - if status, exists := as.workerStatus[task.WorkerID]; exists { - status.CurrentLoad-- - } - } - - // Add to history - var duration time.Duration - if task.StartedAt != nil { - duration = now.Sub(*task.StartedAt) - } - - entry := MinimalTaskHistoryEntry{ - TaskID: task.ID, - TaskType: task.Type, - VolumeID: task.VolumeID, - WorkerID: task.WorkerID, - Status: task.Status, - StartedAt: *task.StartedAt, - CompletedAt: now, - Duration: duration, - ErrorMessage: errorMessage, - } - as.taskHistory = append(as.taskHistory, entry) - - return nil -} - -// UpdateWorkerHeartbeat updates worker heartbeat -func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - worker, exists := as.workers[workerID] - if !exists { - return fmt.Errorf("worker %s not found", workerID) - } - - 
worker.LastHeartbeat = time.Now() - as.workerStatus[workerID] = status - - return nil -} - -// GetSystemStats returns system statistics -func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats { - as.mutex.RLock() - defer as.mutex.RUnlock() - - activeWorkers := 0 - for _, status := range as.workerStatus { - if status.Status == "active" { - activeWorkers++ - } - } - - return &MinimalSystemStats{ - ActiveTasks: len(as.activeTasks), - QueuedTasks: len(as.taskQueue), - ActiveWorkers: activeWorkers, - TotalTasks: len(as.tasks), - } -} - -// GetQueuedTaskCount returns the number of queued tasks -func (as *MinimalAdminServer) GetQueuedTaskCount() int { - as.mutex.RLock() - defer as.mutex.RUnlock() - return len(as.taskQueue) -} - -// GetActiveTaskCount returns the number of active tasks -func (as *MinimalAdminServer) GetActiveTaskCount() int { - as.mutex.RLock() - defer as.mutex.RUnlock() - return len(as.activeTasks) -} - -// GetTaskHistory returns task history -func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry { - as.mutex.RLock() - defer as.mutex.RUnlock() - - // Return a copy of the history - history := make([]MinimalTaskHistoryEntry, len(as.taskHistory)) - copy(history, as.taskHistory) - return history -} diff --git a/weed/admin/task/ec_test_standalone/minimal_integration_test.go b/weed/admin/task/ec_test_standalone/minimal_integration_test.go deleted file mode 100644 index c690456ef..000000000 --- a/weed/admin/task/ec_test_standalone/minimal_integration_test.go +++ /dev/null @@ -1,434 +0,0 @@ -package task - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation -func TestMinimalIntegration(t *testing.T) { - t.Logf("Starting minimal integration test") - - // Step 1: Create a minimal admin server configuration - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 
30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - // Step 2: Create minimal admin server with nil master client (for testing) - adminServer := NewMinimalAdminServer(config, nil) - - // Step 3: Start admin server - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Step 4: Test worker registration - t.Logf("Testing worker registration") - - worker := &types.Worker{ - ID: "test-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - t.Logf("Successfully registered worker %s", worker.ID) - - // Step 5: Test task queueing - t.Logf("Testing task queueing") - - task := &types.Task{ - ID: "test-task-1", - Type: types.TaskTypeVacuum, - VolumeID: 1001, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(task) - if err != nil { - t.Fatalf("Failed to queue task: %v", err) - } - t.Logf("Successfully queued task %s", task.ID) - - // Step 6: Test task request by worker - t.Logf("Testing task request") - - assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Fatalf("Failed to request task: %v", err) - } - - if assignedTask != nil { - t.Logf("Successfully assigned task %s to worker", assignedTask.ID) - - // Step 7: Test task progress updates - t.Logf("Testing task progress updates") - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) - if err != nil { - t.Errorf("Failed to 
update task progress to 25%%: %v", err) - } - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) - if err != nil { - t.Errorf("Failed to update task progress to 50%%: %v", err) - } - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0) - if err != nil { - t.Errorf("Failed to update task progress to 75%%: %v", err) - } - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update task progress to 100%%: %v", err) - } - - // Step 8: Test task completion - t.Logf("Testing task completion") - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - t.Logf("Successfully completed task %s", assignedTask.ID) - } else { - t.Logf("No task was assigned (queue might be empty)") - } - - // Step 9: Test basic metrics - t.Logf("Testing basic metrics") - - stats := adminServer.GetSystemStats() - if stats != nil { - t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - } - - queuedCount := adminServer.GetQueuedTaskCount() - activeCount := adminServer.GetActiveTaskCount() - t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) - - // Step 10: Test task history - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d entries", len(history)) - - if len(history) > 0 { - lastEntry := history[len(history)-1] - t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v", - lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration) - } - - t.Logf("Minimal integration test completed successfully") -} - -// TestMinimalWorkerHeartbeat tests worker heartbeat functionality -func TestMinimalWorkerHeartbeat(t *testing.T) { - t.Logf("Testing minimal worker heartbeat") - - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - 
TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Register a worker - worker := &types.Worker{ - ID: "heartbeat-worker", - Address: "localhost:9002", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - - // Test heartbeat update - status := &types.WorkerStatus{ - Status: "active", - CurrentLoad: 0, - } - - err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) - if err != nil { - t.Errorf("Failed to update worker heartbeat: %v", err) - } - - t.Logf("Minimal worker heartbeat test completed successfully") -} - -// TestMinimalTaskQueueOperations tests task queue operations -func TestMinimalTaskQueueOperations(t *testing.T) { - t.Logf("Testing minimal task queue operations") - - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Test queuing multiple tasks - taskCount := 3 - for i := 0; i < taskCount; i++ { - task := &types.Task{ - ID: fmt.Sprintf("queue-test-task-%d", i), - Type: types.TaskTypeVacuum, - VolumeID: uint32(2000 + i), - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": 
"0.3", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(task) - if err != nil { - t.Errorf("Failed to queue task %d: %v", i, err) - } - } - - // Check queue size - queuedCount := adminServer.GetQueuedTaskCount() - if queuedCount != taskCount { - t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount) - } - - t.Logf("Minimal task queue operations test completed successfully") -} - -// TestMinimalFullWorkflow tests the complete workflow from task creation to completion -func TestMinimalFullWorkflow(t *testing.T) { - t.Logf("Testing minimal full workflow") - - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Register multiple workers with different capabilities - workers := []*types.Worker{ - { - ID: "vacuum-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - { - ID: "ec-worker-1", - Address: "localhost:9002", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - { - ID: "multi-worker-1", - Address: "localhost:9003", - Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}, - MaxConcurrent: 3, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - } - - for _, worker := range workers { - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker %s: %v", worker.ID, err) - } - t.Logf("Registered worker %s with capabilities %v", worker.ID, 
worker.Capabilities) - } - - // Create multiple tasks of different types - tasks := []*types.Task{ - { - ID: "vacuum-task-1", - Type: types.TaskTypeVacuum, - VolumeID: 3001, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.4", - }, - CreatedAt: time.Now(), - }, - { - ID: "ec-task-1", - Type: types.TaskTypeErasureCoding, - VolumeID: 3002, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityHigh, - Parameters: map[string]interface{}{ - "shard_count": "14", - }, - CreatedAt: time.Now(), - }, - { - ID: "vacuum-task-2", - Type: types.TaskTypeVacuum, - VolumeID: 3003, - Server: "localhost:8081", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityLow, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.5", - }, - CreatedAt: time.Now(), - }, - } - - for _, task := range tasks { - err = adminServer.QueueTask(task) - if err != nil { - t.Fatalf("Failed to queue task %s: %v", task.ID, err) - } - t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID) - } - - // Test task assignment to different workers - t.Logf("Testing task assignments") - - // Vacuum worker should get vacuum tasks - assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Errorf("Failed to request task for vacuum worker: %v", err) - } else if assignedTask != nil { - t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - } - - // EC worker should get EC tasks - assignedTask, err = adminServer.RequestTask("ec-worker-1", 
[]types.TaskType{types.TaskTypeErasureCoding}) - if err != nil { - t.Errorf("Failed to request task for EC worker: %v", err) - } else if assignedTask != nil { - t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - } - - // Multi-capability worker should be able to get any remaining task - assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) - if err != nil { - t.Errorf("Failed to request task for multi worker: %v", err) - } else if assignedTask != nil { - t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - } - - // Check final statistics - stats := adminServer.GetSystemStats() - t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d completed tasks", len(history)) - - for _, entry := range history { - t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v", - entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration) - } - - t.Logf("Minimal full workflow test completed successfully") -} diff --git a/weed/admin/task/ec_worker_test.go b/weed/admin/task/ec_worker_test.go deleted file mode 100644 index 75286c08f..000000000 --- a/weed/admin/task/ec_worker_test.go +++ /dev/null @@ 
-1,488 +0,0 @@ -package task - -import ( - "os" - "path/filepath" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestECWorkerIntegration tests the complete EC worker functionality -func TestECWorkerIntegration(t *testing.T) { - t.Logf("Starting EC worker integration test") - - // Step 1: Create admin server with EC configuration - config := &MinimalAdminConfig{ - ScanInterval: 5 * time.Second, - WorkerTimeout: 60 * time.Second, - TaskTimeout: 45 * time.Minute, // EC takes longer - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 1, // One at a time for EC - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - t.Logf("✓ Admin server started successfully") - - // Step 2: Register EC-capable worker - worker := &types.Worker{ - ID: "ec-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register EC worker: %v", err) - } - t.Logf("✓ EC worker registered: %s", worker.ID) - - // Step 3: Create work directory for EC processing - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test") - err = os.MkdirAll(workDir, 0755) - if err != nil { - t.Fatalf("Failed to create work directory: %v", err) - } - defer os.RemoveAll(workDir) - t.Logf("✓ Work directory created: %s", workDir) - - // Step 4: Create EC task with comprehensive parameters - ecTask := &types.Task{ - ID: "ec-test-task-1", - Type: types.TaskTypeErasureCoding, - VolumeID: 54321, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityHigh, - Parameters: 
map[string]interface{}{ - "volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB volume - "master_client": "localhost:9333", - "work_dir": workDir, - "collection": "test", - "data_shards": 10, - "parity_shards": 4, - "rack_aware": true, - "load_balance": true, - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(ecTask) - if err != nil { - t.Fatalf("Failed to queue EC task: %v", err) - } - t.Logf("✓ EC task queued: %s for volume %d", ecTask.ID, ecTask.VolumeID) - - // Step 5: Worker requests and receives the EC task - assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) - if err != nil { - t.Fatalf("Failed to request EC task: %v", err) - } - - if assignedTask == nil { - t.Fatalf("No EC task was assigned") - } - - t.Logf("✓ EC task assigned: %s (%s) for volume %d", - assignedTask.ID, assignedTask.Type, assignedTask.VolumeID) - - // Step 6: Test EC task creation and validation - t.Logf("Testing EC task creation and validation") - - // Create EC task instance directly - factory := erasure_coding.NewFactory() - taskParams := types.TaskParams{ - VolumeID: assignedTask.VolumeID, - Server: assignedTask.Server, - Collection: "test", - Parameters: assignedTask.Parameters, - } - - taskInstance, err := factory.Create(taskParams) - if err != nil { - t.Fatalf("Failed to create EC task instance: %v", err) - } - t.Logf("✓ EC task instance created successfully") - - // Step 7: Validate task parameters - err = taskInstance.Validate(taskParams) - if err != nil { - t.Errorf("EC task validation failed: %v", err) - } else { - t.Logf("✓ EC task validation passed") - } - - // Step 8: Test time estimation - estimatedTime := taskInstance.EstimateTime(taskParams) - expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB for 64GB - - t.Logf("✓ EC estimated time: %v (minimum expected: %v)", estimatedTime, expectedMinTime) - - if estimatedTime < expectedMinTime { - t.Logf("⚠ Note: Estimated time seems 
optimistic for 64GB volume") - } - - // Step 9: Simulate EC task execution phases - t.Logf("Simulating EC execution phases:") - - phases := []struct { - progress float64 - phase string - }{ - {5.0, "Initializing EC processing"}, - {15.0, "Volume data copied to local disk with progress tracking"}, - {25.0, "Source volume marked as read-only"}, - {45.0, "Local Reed-Solomon encoding (10+4 shards) completed"}, - {60.0, "Created 14 EC shards with verification"}, - {70.0, "Optimal shard placement calculated with rack awareness"}, - {85.0, "Intelligent shard distribution with load balancing"}, - {95.0, "Shard placement verified across multiple racks"}, - {100.0, "EC processing completed with cleanup"}, - } - - for _, phase := range phases { - err = adminServer.UpdateTaskProgress(assignedTask.ID, phase.progress) - if err != nil { - t.Errorf("Failed to update task progress to %.1f%%: %v", phase.progress, err) - } else { - t.Logf(" %.1f%% - %s", phase.progress, phase.phase) - } - time.Sleep(50 * time.Millisecond) // Simulate processing time - } - - // Step 10: Complete the EC task - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete EC task: %v", err) - } else { - t.Logf("✓ EC task completed successfully") - } - - // Step 11: Verify EC task completion and metrics - stats := adminServer.GetSystemStats() - t.Logf("✓ Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - - history := adminServer.GetTaskHistory() - t.Logf("✓ Task history contains %d completed tasks", len(history)) - - if len(history) > 0 { - lastEntry := history[len(history)-1] - t.Logf("✓ Last completed task: %s (%s) - Duration: %v", - lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration) - - if lastEntry.TaskType == types.TaskTypeErasureCoding { - t.Logf("✅ EC task execution verified!") - } - } - - t.Logf("✅ EC worker integration test completed 
successfully") -} - -// TestECFeatureValidation tests specific EC features -func TestECFeatureValidation(t *testing.T) { - t.Logf("Testing EC feature validation") - - // Create work directory - workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test") - err := os.MkdirAll(workDir, 0755) - if err != nil { - t.Fatalf("Failed to create work directory: %v", err) - } - defer os.RemoveAll(workDir) - - // Test EC task features - ecTask := erasure_coding.NewTaskWithParams( - "localhost:8080", // source server - 98765, // volume ID - "localhost:9333", // master client - workDir, // work directory - ) - - // Test current step tracking - currentStep := ecTask.GetCurrentStep() - t.Logf("✓ Initial current step: '%s'", currentStep) - - initialProgress := ecTask.GetProgress() - t.Logf("✓ Initial progress: %.1f%%", initialProgress) - - // Test parameter validation with features - validParams := types.TaskParams{ - VolumeID: 98765, - Server: "localhost:8080", - Collection: "features_test", - Parameters: map[string]interface{}{ - "volume_size": int64(128 * 1024 * 1024 * 1024), // 128GB - "master_client": "localhost:9333", - "work_dir": workDir, - "data_shards": 10, - "parity_shards": 4, - "rack_awareness": true, - "load_balancing": true, - "backup_servers": 2, - "affinity_zones": []string{"zone-a", "zone-b", "zone-c"}, - }, - } - - err = ecTask.Validate(validParams) - if err != nil { - t.Errorf("Valid parameters should pass validation: %v", err) - } else { - t.Logf("✓ Parameter validation passed") - } - - // Test time estimation for large volume - estimatedTime := ecTask.EstimateTime(validParams) - expectedMinTime := time.Duration(128*2) * time.Minute // 2 minutes per GB - - t.Logf("✓ 128GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime) - - if estimatedTime < expectedMinTime { - t.Errorf("Time estimate seems too low for 128GB volume") - } - - // Test invalid parameters - invalidParams := types.TaskParams{ - VolumeID: 0, // Invalid - 
Server: "", // Invalid - } - - err = ecTask.Validate(invalidParams) - if err == nil { - t.Errorf("Invalid parameters should fail validation") - } else { - t.Logf("✓ Invalid parameter validation correctly failed: %v", err) - } - - t.Logf("✅ EC feature validation completed successfully") -} - -// TestECWorkflow tests the complete EC workflow -func TestECWorkflow(t *testing.T) { - t.Logf("Testing complete EC workflow") - - // Create admin server - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 60 * time.Minute, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 1, - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Register multiple workers with different capabilities - workers := []*types.Worker{ - { - ID: "ec-specialist-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - { - ID: "vacuum-worker-1", - Address: "localhost:9002", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - { - ID: "multi-capability-worker-1", - Address: "localhost:9003", - Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - } - - for _, worker := range workers { - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker %s: %v", worker.ID, err) - } - t.Logf("✓ Registered worker %s with capabilities %v", worker.ID, worker.Capabilities) - } - - // Create test work directory - workDir := filepath.Join(os.TempDir(), "seaweedfs_workflow_test") 
- err = os.MkdirAll(workDir, 0755) - if err != nil { - t.Fatalf("Failed to create work directory: %v", err) - } - defer os.RemoveAll(workDir) - - // Create multiple tasks of different types - tasks := []*types.Task{ - { - ID: "ec-workflow-1", - Type: types.TaskTypeErasureCoding, - VolumeID: 11111, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityHigh, - Parameters: map[string]interface{}{ - "volume_size": int64(50 * 1024 * 1024 * 1024), - "master_client": "localhost:9333", - "work_dir": workDir, - "collection": "workflow_test", - }, - CreatedAt: time.Now(), - }, - { - ID: "vacuum-workflow-1", - Type: types.TaskTypeVacuum, - VolumeID: 22222, - Server: "localhost:8081", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.4", - "volume_size": int64(20 * 1024 * 1024 * 1024), - }, - CreatedAt: time.Now(), - }, - { - ID: "ec-workflow-2", - Type: types.TaskTypeErasureCoding, - VolumeID: 33333, - Server: "localhost:8082", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "volume_size": int64(80 * 1024 * 1024 * 1024), - "master_client": "localhost:9333", - "work_dir": workDir, - "collection": "workflow_test", - }, - CreatedAt: time.Now(), - }, - } - - // Queue all tasks - for _, task := range tasks { - err = adminServer.QueueTask(task) - if err != nil { - t.Fatalf("Failed to queue task %s: %v", task.ID, err) - } - t.Logf("✓ Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID) - } - - // Test task assignment to appropriate workers - t.Logf("Testing task assignments to appropriate workers") - - // EC specialist should get EC tasks - assignedTask, err := adminServer.RequestTask("ec-specialist-1", []types.TaskType{types.TaskTypeErasureCoding}) - if err != nil { - t.Errorf("Failed to request task for EC specialist: %v", err) - } else if assignedTask != nil { - t.Logf("✓ EC 
specialist got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - t.Logf("✓ EC task completed by specialist") - } - - // Vacuum worker should get vacuum tasks - assignedTask, err = adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Errorf("Failed to request task for vacuum worker: %v", err) - } else if assignedTask != nil { - t.Logf("✓ Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - t.Logf("✓ Vacuum task completed by vacuum worker") - } - - // Multi-capability worker should get remaining tasks - assignedTask, err = adminServer.RequestTask("multi-capability-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) - if err != nil { - t.Errorf("Failed to request task for multi-capability worker: %v", err) - } else if assignedTask != nil { - t.Logf("✓ Multi-capability worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - t.Logf("✓ Task completed by multi-capability worker") - } - - // Check final workflow statistics - stats := adminServer.GetSystemStats() - t.Logf("✓ Final workflow stats: Active 
tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - - history := adminServer.GetTaskHistory() - t.Logf("✓ Workflow history contains %d completed tasks", len(history)) - - // Analyze task completion by type - ecTasks := 0 - vacuumTasks := 0 - - for _, entry := range history { - switch entry.TaskType { - case types.TaskTypeErasureCoding: - ecTasks++ - t.Logf(" EC: %s - Worker: %s, Duration: %v", - entry.TaskID, entry.WorkerID, entry.Duration) - case types.TaskTypeVacuum: - vacuumTasks++ - t.Logf(" Vacuum: %s - Worker: %s, Duration: %v", - entry.TaskID, entry.WorkerID, entry.Duration) - } - } - - t.Logf("✓ Completed tasks: %d EC, %d Vacuum", ecTasks, vacuumTasks) - t.Logf("✅ EC workflow test completed successfully") -} diff --git a/weed/admin/task/example_usage.go b/weed/admin/task/example_usage.go deleted file mode 100644 index 469fcfdc4..000000000 --- a/weed/admin/task/example_usage.go +++ /dev/null @@ -1,346 +0,0 @@ -package task - -import ( - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// ExampleUsage demonstrates how to use the task distribution system -func ExampleUsage() { - glog.Infof("=== SeaweedFS Task Distribution System Example ===") - - // Example 1: Setting up the Admin Server - setupAdminServerExample() - - // Example 2: Simulating Workers - simulateWorkersExample() - - // Example 3: Running Simulations - runSimulationsExample() - - // Example 4: Demonstrating Features - demonstrateFeaturesExample() -} - -// setupAdminServerExample shows how to set up the admin server -func setupAdminServerExample() { - glog.Infof("\n--- Example 1: Setting up Admin Server ---") - - // Create master client (in real usage, this would connect to actual master) - masterClient := &wdclient.MasterClient{} // Simplified for example - - // Create admin server 
configuration - config := &AdminConfig{ - ScanInterval: 30 * time.Minute, - WorkerTimeout: 5 * time.Minute, - TaskTimeout: 10 * time.Minute, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 10, - } - - // Create admin server - adminServer := NewAdminServer(config, masterClient) - - // Start the admin server - if err := adminServer.Start(); err != nil { - glog.Errorf("Failed to start admin server: %v", err) - return - } - - glog.Infof("✓ Admin server started with configuration:") - glog.Infof(" - Scan Interval: %v", config.ScanInterval) - glog.Infof(" - Worker Timeout: %v", config.WorkerTimeout) - glog.Infof(" - Max Concurrent Tasks: %d", config.MaxConcurrentTasks) - - // Simulate some operations - time.Sleep(2 * time.Second) - - // Stop the admin server - adminServer.Stop() - glog.Infof("✓ Admin server stopped gracefully") -} - -// simulateWorkersExample shows how workers would register and operate -func simulateWorkersExample() { - glog.Infof("\n--- Example 2: Worker Registration and Operation ---") - - // Create mock workers - workers := []*types.Worker{ - { - ID: "worker-ec-01", - Address: "192.168.1.100:8080", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - }, - { - ID: "worker-vacuum-01", - Address: "192.168.1.101:8080", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 3, - Status: "active", - CurrentLoad: 0, - }, - { - ID: "worker-multi-01", - Address: "192.168.1.102:8080", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - }, - } - - // Create worker registry - registry := NewWorkerRegistry() - - // Register workers - for _, worker := range workers { - if err := registry.RegisterWorker(worker); err != nil { - glog.Errorf("Failed to register worker %s: %v", worker.ID, err) - } else { - glog.Infof("✓ Registered 
worker %s with capabilities: %v", worker.ID, worker.Capabilities) - } - } - - // Demonstrate worker selection - bestECWorker := registry.GetBestWorkerForTask(types.TaskTypeErasureCoding) - if bestECWorker != nil { - glog.Infof("✓ Best worker for EC tasks: %s", bestECWorker.ID) - } - - bestVacuumWorker := registry.GetBestWorkerForTask(types.TaskTypeVacuum) - if bestVacuumWorker != nil { - glog.Infof("✓ Best worker for vacuum tasks: %s", bestVacuumWorker.ID) - } - - // Show registry statistics - stats := registry.GetRegistryStats() - glog.Infof("✓ Registry statistics: %+v", stats) -} - -// runSimulationsExample shows how to run simulation scenarios -func runSimulationsExample() { - glog.Infof("\n--- Example 3: Running Simulation Scenarios ---") - - // Note: Simulation framework moved to simulation package - // To use: simulationRunner := simulation.NewComprehensiveSimulationRunner() - // simulationRunner.RunAllComprehensiveTests() - - glog.Infof("✅ Simulation framework available in separate package") - glog.Infof("Use simulation.NewComprehensiveSimulationRunner() to access comprehensive testing") -} - -// demonstrateFeaturesExample shows key system features -func demonstrateFeaturesExample() { - glog.Infof("\n--- Example 4: Key System Features ---") - - // Feature 1: Task Discovery - demonstrateTaskDiscovery() - - // Feature 2: Volume State Tracking - demonstrateVolumeStateTracking() - - // Feature 3: Failure Handling - demonstrateFailureHandling() - - // Feature 4: Task Scheduling - demonstrateTaskScheduling() -} - -// demonstrateTaskDiscovery shows how task discovery works -func demonstrateTaskDiscovery() { - glog.Infof("\n Feature 1: Task Discovery") - - // Create mock volumes - volumes := []*VolumeInfo{ - { - ID: 1, - Size: 28 * 1024 * 1024 * 1024, // 28GB (93% of 30GB) - Collection: "photos", - DeletedByteCount: 0, - ReadOnly: false, - ModifiedAtSecond: time.Now().Add(-2 * time.Hour).Unix(), // 2 hours old - }, - { - ID: 2, - Size: 20 * 1024 * 1024 * 1024, // 
20GB - Collection: "documents", - DeletedByteCount: 8 * 1024 * 1024 * 1024, // 8GB garbage (40%) - ReadOnly: false, - ModifiedAtSecond: time.Now().Add(-1 * time.Hour).Unix(), // 1 hour old - }, - } - - // Create detectors - ecDetector := NewECDetector() - vacuumDetector := NewVacuumDetector() - - // Test EC detection - ecCandidates, _ := ecDetector.DetectECCandidates(volumes) - glog.Infof(" ✓ EC detector found %d candidates", len(ecCandidates)) - for _, candidate := range ecCandidates { - glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority) - } - - // Test vacuum detection - vacuumCandidates, _ := vacuumDetector.DetectVacuumCandidates(volumes) - glog.Infof(" ✓ Vacuum detector found %d candidates", len(vacuumCandidates)) - for _, candidate := range vacuumCandidates { - glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority) - } -} - -// demonstrateVolumeStateTracking shows volume state management -func demonstrateVolumeStateTracking() { - glog.Infof("\n Feature 2: Volume State Tracking") - - // Create volume state tracker - tracker := NewVolumeStateTracker(nil, 5*time.Minute) - - // Reserve volumes for tasks - tracker.ReserveVolume(1, "task-ec-001") - tracker.ReserveVolume(2, "task-vacuum-001") - - glog.Infof(" ✓ Reserved volumes for tasks") - - // Check reservations - if tracker.IsVolumeReserved(1) { - glog.Infof(" ✓ Volume 1 is correctly reserved") - } - - // Record volume changes - tracker.RecordVolumeChange(1, types.TaskTypeErasureCoding, "task-ec-001") - glog.Infof(" ✓ Recorded volume change for EC completion") - - // Get pending changes - if change := tracker.GetPendingChange(1); change != nil { - glog.Infof(" ✓ Pending change found: %s for volume %d", change.ChangeType, change.VolumeID) - } - - // Release reservation - tracker.ReleaseVolume(2, "task-vacuum-001") - glog.Infof(" ✓ Released volume reservation") - - // Show statistics - stats := tracker.GetStats() - 
glog.Infof(" ✓ Tracker statistics: %+v", stats) -} - -// demonstrateFailureHandling shows failure recovery mechanisms -func demonstrateFailureHandling() { - glog.Infof("\n Feature 3: Failure Handling") - - // Create failure handler - config := DefaultAdminConfig() - handler := NewFailureHandler(config) - - // Create mock task - task := &InProgressTask{ - Task: &types.Task{ - ID: "test-task-001", - Type: types.TaskTypeErasureCoding, - VolumeID: 1, - RetryCount: 0, - }, - WorkerID: "worker-01", - StartedAt: time.Now(), - LastUpdate: time.Now().Add(-30 * time.Minute), // 30 minutes ago - Progress: 45.0, - } - - // Demonstrate different failure scenarios - glog.Infof(" ✓ Simulating worker timeout scenario") - handler.HandleWorkerTimeout("worker-01", []*InProgressTask{task}) - - glog.Infof(" ✓ Simulating stuck task scenario") - handler.HandleTaskStuck(task) - - glog.Infof(" ✓ Simulating duplicate task detection") - handler.HandleDuplicateTask("existing-task", "duplicate-task", 1) - - // Show failure statistics - stats := handler.GetFailureStats() - glog.Infof(" ✓ Failure handler statistics: %+v", stats) -} - -// demonstrateTaskScheduling shows task scheduling logic -func demonstrateTaskScheduling() { - glog.Infof("\n Feature 4: Task Scheduling") - - // Create worker registry and task queue - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - // Add mock worker - worker := &types.Worker{ - ID: "scheduler-worker-01", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - } - registry.RegisterWorker(worker) - - // Create mock tasks with different priorities - highPriorityTask := &types.Task{ - ID: "high-priority-task", - Type: types.TaskTypeErasureCoding, - Priority: types.TaskPriorityHigh, - VolumeID: 1, - } - - normalPriorityTask := &types.Task{ - ID: "normal-priority-task", - Type: types.TaskTypeVacuum, - Priority: 
types.TaskPriorityNormal, - VolumeID: 2, - } - - // Add tasks to queue - queue.Push(normalPriorityTask) - queue.Push(highPriorityTask) // Should be prioritized - - glog.Infof(" ✓ Added tasks to priority queue (size: %d)", queue.Size()) - - // Test worker selection - selectedWorker := scheduler.SelectWorker(highPriorityTask, []*types.Worker{worker}) - if selectedWorker != nil { - glog.Infof(" ✓ Selected worker %s for high-priority task", selectedWorker.ID) - } - - // Test task retrieval - nextTask := scheduler.GetNextTask("scheduler-worker-01", []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}) - if nextTask != nil { - glog.Infof(" ✓ Next task for worker: %s (priority: %d)", nextTask.ID, nextTask.Priority) - } - - glog.Infof(" ✓ Task scheduling demonstration complete") -} - -// RunComprehensiveDemo runs a full demonstration of the system -func RunComprehensiveDemo() { - glog.Infof("Starting comprehensive task distribution system demonstration...") - - // Run comprehensive example - ExampleUsage() - - // Note: To run the comprehensive simulation framework, use: - // simulationRunner := simulation.NewComprehensiveSimulationRunner() - // simulationRunner.RunAllComprehensiveTests() - - glog.Infof("=== Comprehensive demonstration complete ===") - glog.Infof("💡 To run comprehensive simulations, use the simulation package separately") - glog.Infof("Step 9: Comprehensive Simulation Testing") - glog.Infof("Note: Simulation framework moved to separate 'simulation' package") - glog.Infof("To run simulations: simulation.NewComprehensiveSimulationRunner().RunAllComprehensiveTests()") - glog.Infof("✅ Simulation framework available in separate package") - glog.Infof("") -} diff --git a/weed/admin/task/failure_handler.go b/weed/admin/task/failure_handler.go deleted file mode 100644 index 651d9db88..000000000 --- a/weed/admin/task/failure_handler.go +++ /dev/null @@ -1,123 +0,0 @@ -package task - -import ( - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" -) 
- -// FailureHandler handles various failure scenarios in the task system -type FailureHandler struct { - config *AdminConfig -} - -// NewFailureHandler creates a new failure handler -func NewFailureHandler(config *AdminConfig) *FailureHandler { - return &FailureHandler{ - config: config, - } -} - -// HandleWorkerTimeout handles worker timeout scenarios -func (fh *FailureHandler) HandleWorkerTimeout(workerID string, affectedTasks []*InProgressTask) { - glog.Warningf("Handling worker timeout for worker %s with %d affected tasks", workerID, len(affectedTasks)) - - for _, task := range affectedTasks { - fh.handleTaskFailure(task, "worker_timeout", "Worker became unresponsive") - } -} - -// HandleTaskStuck handles stuck task scenarios -func (fh *FailureHandler) HandleTaskStuck(task *InProgressTask) { - glog.Warningf("Handling stuck task %s (no progress for %v)", task.Task.ID, time.Since(task.LastUpdate)) - - fh.handleTaskFailure(task, "task_stuck", "Task made no progress within timeout period") -} - -// HandleTaskFailure handles general task failure scenarios -func (fh *FailureHandler) HandleTaskFailure(task *InProgressTask, reason string, details string) { - glog.Errorf("Handling task failure for task %s: %s - %s", task.Task.ID, reason, details) - - fh.handleTaskFailure(task, reason, details) -} - -// handleTaskFailure is the internal handler for task failures -func (fh *FailureHandler) handleTaskFailure(task *InProgressTask, reason string, details string) { - // Record failure reason - task.Task.Error = details - - // Determine if task should be retried - if task.Task.RetryCount < fh.config.MaxRetries { - fh.scheduleRetry(task, reason) - } else { - fh.markTaskFailed(task, reason) - } -} - -// scheduleRetry schedules a task for retry -func (fh *FailureHandler) scheduleRetry(task *InProgressTask, reason string) { - task.Task.RetryCount++ - - // Calculate retry delay with exponential backoff - retryDelay := time.Duration(task.Task.RetryCount) * 5 * time.Minute - 
task.Task.ScheduledAt = time.Now().Add(retryDelay) - - glog.Infof("Scheduling retry %d/%d for task %s (reason: %s, delay: %v)", - task.Task.RetryCount, fh.config.MaxRetries, task.Task.ID, reason, retryDelay) -} - -// markTaskFailed permanently marks a task as failed -func (fh *FailureHandler) markTaskFailed(task *InProgressTask, reason string) { - glog.Errorf("Task %s permanently failed after %d retries (reason: %s)", - task.Task.ID, task.Task.RetryCount, reason) - - // Could trigger alerts or notifications here - fh.sendFailureAlert(task, reason) -} - -// sendFailureAlert sends alerts for permanently failed tasks -func (fh *FailureHandler) sendFailureAlert(task *InProgressTask, reason string) { - // In a real implementation, this would: - // 1. Send notifications to administrators - // 2. Update monitoring dashboards - // 3. Log to audit trails - // 4. Possibly trigger automatic remediation - - glog.Errorf("ALERT: Task permanently failed - ID: %s, Type: %s, Volume: %d, Reason: %s", - task.Task.ID, task.Task.Type, task.Task.VolumeID, reason) -} - -// HandleDuplicateTask handles duplicate task detection -func (fh *FailureHandler) HandleDuplicateTask(existingTaskID string, duplicateTaskID string, volumeID uint32) { - glog.Warningf("Detected duplicate task for volume %d: existing=%s, duplicate=%s", - volumeID, existingTaskID, duplicateTaskID) - - // Cancel the duplicate task - // In a real implementation, this would send a cancellation signal -} - -// HandleResourceExhaustion handles resource exhaustion scenarios -func (fh *FailureHandler) HandleResourceExhaustion(workerID string, taskType string) { - glog.Warningf("Worker %s reported resource exhaustion for task type %s", workerID, taskType) - - // Could implement: - // 1. Temporary worker blacklisting - // 2. Task redistribution - // 3. 
Resource monitoring alerts -} - -// GetFailureStats returns failure statistics -func (fh *FailureHandler) GetFailureStats() map[string]interface{} { - // In a real implementation, this would track: - // - Failure rates by type - // - Worker reliability scores - // - Task retry statistics - // - System health metrics - - return map[string]interface{}{ - "enabled": true, - "max_retries": fh.config.MaxRetries, - "task_timeout": fh.config.TaskTimeout.String(), - "worker_timeout": fh.config.WorkerTimeout.String(), - } -} diff --git a/weed/admin/task/master_sync.go b/weed/admin/task/master_sync.go deleted file mode 100644 index 5d094f052..000000000 --- a/weed/admin/task/master_sync.go +++ /dev/null @@ -1,486 +0,0 @@ -package task - -import ( - "context" - "fmt" - "strconv" - "strings" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" - "github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// MasterSynchronizer handles periodic synchronization with the master server -type MasterSynchronizer struct { - masterClient *wdclient.MasterClient - volumeStateManager *VolumeStateManager - adminServer *AdminServer - syncInterval time.Duration - stopCh chan struct{} - volumeSizeLimitMB uint64 // Volume size limit from master in MB -} - -// NewMasterSynchronizer creates a new master synchronizer -func NewMasterSynchronizer(masterClient *wdclient.MasterClient, vsm *VolumeStateManager, admin *AdminServer) *MasterSynchronizer { - return &MasterSynchronizer{ - masterClient: masterClient, - volumeStateManager: vsm, - adminServer: admin, - syncInterval: 30 * time.Second, // Default 30 second sync interval - stopCh: make(chan struct{}), - } -} - -// Start begins the periodic master synchronization -func (ms *MasterSynchronizer) Start() { - 
glog.Infof("Starting master synchronization with interval %v", ms.syncInterval) - - go func() { - // Immediate sync on startup - ms.performSync() - - ticker := time.NewTicker(ms.syncInterval) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - ms.performSync() - case <-ms.stopCh: - glog.Infof("Master synchronization stopped") - return - } - } - }() -} - -// Stop stops the master synchronization -func (ms *MasterSynchronizer) Stop() { - close(ms.stopCh) -} - -// performSync executes a single synchronization cycle -func (ms *MasterSynchronizer) performSync() { - glog.V(1).Infof("Starting master sync cycle") - startTime := time.Now() - - // Get volume list from master - volumeData, err := ms.getVolumeListFromMaster() - if err != nil { - glog.Errorf("Failed to get volume list from master: %v", err) - return - } - - // Update volume size limit from master - if volumeData.VolumeSizeLimitMb > 0 { - ms.volumeSizeLimitMB = volumeData.VolumeSizeLimitMb - glog.V(2).Infof("Updated volume size limit to %d MB from master", ms.volumeSizeLimitMB) - } - - // Merge data into volume state manager - err = ms.mergeVolumeData(volumeData) - if err != nil { - glog.Errorf("Failed to merge volume data: %v", err) - return - } - - // Detect volumes needing work - candidates := ms.detectMaintenanceCandidates(volumeData) - - // Process candidates for task assignment - ms.processCandidates(candidates) - - duration := time.Since(startTime) - glog.V(1).Infof("Master sync completed in %v, found %d maintenance candidates", - duration, len(candidates)) -} - -// getVolumeListFromMaster retrieves the current volume topology from master -func (ms *MasterSynchronizer) getVolumeListFromMaster() (*master_pb.VolumeListResponse, error) { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - err := ms.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { - req := &master_pb.VolumeListRequest{} - response, err := client.VolumeList(ctx, 
req) - if err != nil { - return fmt.Errorf("VolumeList RPC failed: %v", err) - } - volumeData = response - return nil - }) - - if err != nil { - return nil, err - } - - return volumeData, nil -} - -// VolumeMaintenanceCandidate represents a volume that needs maintenance -type VolumeMaintenanceCandidate struct { - VolumeID uint32 - Server string - TaskType string - Priority TaskPriority - Reason string - VolumeInfo *VolumeInfo - ECShardInfo map[int]*ShardInfo -} - -// mergeVolumeData merges master volume data into the volume state manager -func (ms *MasterSynchronizer) mergeVolumeData(data *master_pb.VolumeListResponse) error { - if data.TopologyInfo == nil { - return fmt.Errorf("empty topology info from master") - } - - volumes := make(map[uint32]*VolumeInfo) - ecShards := make(map[uint32]map[int]*ShardInfo) - serverCapacity := make(map[string]*CapacityInfo) - - // Extract volume information from topology - ms.extractVolumesFromTopology(data.TopologyInfo, volumes, ecShards, serverCapacity) - - // Update volume state manager - err := ms.volumeStateManager.SyncWithMasterData(volumes, ecShards, serverCapacity) - if err != nil { - return fmt.Errorf("failed to sync with volume state manager: %v", err) - } - - glog.V(2).Infof("Synced %d volumes, %d EC volume groups, %d servers", - len(volumes), len(ecShards), len(serverCapacity)) - - return nil -} - -// extractVolumesFromTopology extracts volume and capacity data from master topology -func (ms *MasterSynchronizer) extractVolumesFromTopology( - topology *master_pb.TopologyInfo, - volumes map[uint32]*VolumeInfo, - ecShards map[uint32]map[int]*ShardInfo, - serverCapacity map[string]*CapacityInfo) { - - for _, dcInfo := range topology.DataCenterInfos { - for _, rackInfo := range dcInfo.RackInfos { - for _, nodeInfo := range rackInfo.DataNodeInfos { - serverID := fmt.Sprintf("%s:%d", nodeInfo.Id, nodeInfo.GrpcPort) - - // Initialize server capacity info - if serverCapacity[serverID] == nil { - serverCapacity[serverID] = 
&CapacityInfo{ - Server: serverID, - } - } - - // Process disk information - for diskType, diskInfo := range nodeInfo.DiskInfos { - ms.processDiskInfo(diskInfo, diskType, serverID, volumes, ecShards, serverCapacity) - } - } - } - } -} - -// processDiskInfo processes disk information for a specific server -func (ms *MasterSynchronizer) processDiskInfo( - diskInfo *master_pb.DiskInfo, - diskType string, - serverID string, - volumes map[uint32]*VolumeInfo, - ecShards map[uint32]map[int]*ShardInfo, - serverCapacity map[string]*CapacityInfo) { - - // Update capacity information - capacity := serverCapacity[serverID] - volumeSizeBytes := int64(ms.volumeSizeLimitMB) * 1024 * 1024 // Convert MB to bytes - capacity.TotalCapacity += int64(diskInfo.MaxVolumeCount) * volumeSizeBytes - capacity.UsedCapacity += int64(diskInfo.ActiveVolumeCount) * volumeSizeBytes - - // Process regular volumes - for _, volInfo := range diskInfo.VolumeInfos { - volumes[volInfo.Id] = &VolumeInfo{ - ID: volInfo.Id, - Size: volInfo.Size, - Collection: volInfo.Collection, - FileCount: volInfo.FileCount, - DeleteCount: volInfo.DeleteCount, - DeletedByteCount: volInfo.DeletedByteCount, - ReadOnly: volInfo.ReadOnly, - Server: serverID, - DiskType: diskType, - ModifiedAtSecond: volInfo.ModifiedAtSecond, - } - } - - // Process EC shards - for _, shardInfo := range diskInfo.EcShardInfos { - volumeID := shardInfo.Id - if ecShards[volumeID] == nil { - ecShards[volumeID] = make(map[int]*ShardInfo) - } - - // Extract shard IDs from ec_index_bits - for shardID := 0; shardID < 14; shardID++ { - if (shardInfo.EcIndexBits & (1 << uint(shardID))) != 0 { - ecShards[volumeID][shardID] = &ShardInfo{ - ShardID: shardID, - Server: serverID, - Status: ShardStatusExists, - Size: 0, // Size not available in shard info - } - } - } - } -} - -// detectMaintenanceCandidates identifies volumes that need maintenance -func (ms *MasterSynchronizer) detectMaintenanceCandidates(data *master_pb.VolumeListResponse) 
[]*VolumeMaintenanceCandidate { - var candidates []*VolumeMaintenanceCandidate - - // Get current volume states - currentVolumes := ms.volumeStateManager.GetAllVolumeStates() - - for volumeID, volumeState := range currentVolumes { - // Skip volumes with in-progress tasks - if len(volumeState.InProgressTasks) > 0 { - continue - } - - // Check for EC encoding candidates - if candidate := ms.checkECEncodingCandidate(volumeID, volumeState); candidate != nil { - candidates = append(candidates, candidate) - } - - // Check for vacuum candidates - if candidate := ms.checkVacuumCandidate(volumeID, volumeState); candidate != nil { - candidates = append(candidates, candidate) - } - - // Check for EC rebuild candidates - if candidate := ms.checkECRebuildCandidate(volumeID, volumeState); candidate != nil { - candidates = append(candidates, candidate) - } - } - - return candidates -} - -// EC encoding criteria - using configuration from EC detector -func (ms *MasterSynchronizer) checkECEncodingCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate { - volume := state.CurrentState - if volume == nil { - return nil - } - - // Get the current configuration from the EC detector - ecDetector, _ := erasure_coding.GetSharedInstances() - if ecDetector == nil || !ecDetector.IsEnabled() { - return nil - } - - // Get configuration values from the detector - fullnessThreshold := ecDetector.GetFullnessRatio() - quietForSeconds := ecDetector.GetQuietForSeconds() - collectionFilter := ecDetector.GetCollectionFilter() - - // EC encoding criteria: - // 1. Volume meets fullness ratio threshold - // 2. Volume has been quiet for required duration - // 3. Collection filter matches (if specified) - // 4. 
Not already EC encoded - - // Check fullness ratio (if we have size info) - if volume.Size == 0 { - return nil - } - - // Calculate fullness ratio (assuming total capacity is close to actual size for near-full volumes) - // For a more accurate calculation, we'd need the volume's max capacity - fullnessRatio := float64(volume.Size-volume.DeletedByteCount) / float64(volume.Size) - if fullnessRatio < fullnessThreshold { - return nil - } - - // Check collection filter if specified - if collectionFilter != "" { - // Parse comma-separated collections - allowedCollections := make(map[string]bool) - for _, collection := range strings.Split(collectionFilter, ",") { - allowedCollections[strings.TrimSpace(collection)] = true - } - // Skip if volume's collection is not in the allowed list - if !allowedCollections[volume.Collection] { - return nil - } - } - - // Check quiet duration using volume's last modification time - now := time.Now() - lastModified := time.Unix(volume.ModifiedAtSecond, 0) - timeSinceModification := now.Sub(lastModified) - - if timeSinceModification < time.Duration(quietForSeconds)*time.Second { - return nil // Volume hasn't been quiet long enough - } - - return &VolumeMaintenanceCandidate{ - VolumeID: volumeID, - Server: volume.Server, - TaskType: "ec_encode", - Priority: types.TaskPriorityLow, // EC is typically low priority - Reason: fmt.Sprintf("Volume meets EC criteria: fullness=%.1f%% (>%.1f%%), quiet for %s (>%ds), collection='%s'", fullnessRatio*100, fullnessThreshold*100, timeSinceModification.Truncate(time.Second), quietForSeconds, volume.Collection), - VolumeInfo: volume, - } -} - -// checkVacuumCandidate checks if a volume is a candidate for vacuum -func (ms *MasterSynchronizer) checkVacuumCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate { - volume := state.CurrentState - if volume == nil || volume.ReadOnly { - return nil - } - - // Get the current configuration from the vacuum detector - vacuumDetector, _ := 
// checkECRebuildCandidate is meant to report whether an EC volume needs
// shard rebuilding.
//
// It is currently a stub: both parameters are ignored and the function
// always returns nil, so no "ec_rebuild" candidate is ever produced from
// here.
func (ms *MasterSynchronizer) checkECRebuildCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate {
	// For now, skip EC rebuild detection as it requires more complex shard state tracking
	// This would be implemented when the volume state manager provides proper EC shard access
	return nil
}
*MasterSynchronizer) processCandidates(candidates []*VolumeMaintenanceCandidate) { - for _, candidate := range candidates { - // Check if we can assign this task - if !ms.canAssignCandidate(candidate) { - glog.V(2).Infof("Cannot assign task for volume %d: insufficient capacity or no workers", - candidate.VolumeID) - continue - } - - // Create and queue the task - task := ms.createTaskFromCandidate(candidate) - if task != nil { - ms.adminServer.QueueTask(task) - glog.V(1).Infof("Queued %s task for volume %d on server %s: %s", - candidate.TaskType, candidate.VolumeID, candidate.Server, candidate.Reason) - } - } -} - -// canAssignCandidate checks if a candidate can be assigned (capacity, workers available) -func (ms *MasterSynchronizer) canAssignCandidate(candidate *VolumeMaintenanceCandidate) bool { - // Check if server has capacity for the task - if candidate.TaskType == "ec_encode" { - // EC encoding requires significant temporary space - requiredSpace := int64(candidate.VolumeInfo.Size * 2) // Estimate 2x volume size needed - if !ms.volumeStateManager.CanAssignVolumeToServer(requiredSpace, candidate.Server) { - return false - } - } - - // Check if we have workers capable of this task type - availableWorkers := ms.adminServer.GetAvailableWorkers(candidate.TaskType) - if len(availableWorkers) == 0 { - return false - } - - return true -} - -// createTaskFromCandidate creates a task from a maintenance candidate -func (ms *MasterSynchronizer) createTaskFromCandidate(candidate *VolumeMaintenanceCandidate) *Task { - now := time.Now() - - task := &Task{ - ID: generateTaskID(), - Type: TaskType(candidate.TaskType), - VolumeID: candidate.VolumeID, - Priority: candidate.Priority, - Status: types.TaskStatusPending, - CreatedAt: now, - Parameters: map[string]interface{}{ - "volume_id": fmt.Sprintf("%d", candidate.VolumeID), - "server": candidate.Server, - "reason": candidate.Reason, - }, - } - - // Add task-specific parameters - switch candidate.TaskType { - case "ec_encode": 
// MinimalAdminServer manages workers and tasks with a simple implementation.
//
// All state lives in memory. The methods on this type take mutex before
// reading or writing any of the fields below.
type MinimalAdminServer struct {
	config       *MinimalAdminConfig    // configuration supplied to the constructor
	masterClient *wdclient.MasterClient // supplied to the constructor; may be nil (tests pass nil)
	running      bool                   // set by Start, cleared by Stop
	mutex        sync.RWMutex           // guards every field of this struct

	// Task management
	tasks       map[string]*types.Task // every queued task, keyed by task ID
	taskQueue   []*types.Task          // tasks waiting for a worker, in queueing order
	activeTasks map[string]*types.Task // tasks handed to a worker and not yet completed, keyed by task ID

	// Worker management
	workers      map[string]*types.Worker       // registered workers, keyed by worker ID
	workerStatus map[string]*types.WorkerStatus // latest status per worker ID

	// Task history
	taskHistory []MinimalTaskHistoryEntry // one entry appended per completed task
}
{ - TaskID string - TaskType types.TaskType - VolumeID uint32 - WorkerID string - Status types.TaskStatus - StartedAt time.Time - CompletedAt time.Time - Duration time.Duration - ErrorMessage string -} - -// MinimalSystemStats represents system statistics -type MinimalSystemStats struct { - ActiveTasks int - QueuedTasks int - ActiveWorkers int - TotalTasks int -} - -// NewMinimalAdminServer creates a new minimal admin server -func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer { - return &MinimalAdminServer{ - config: config, - masterClient: masterClient, - tasks: make(map[string]*types.Task), - taskQueue: make([]*types.Task, 0), - activeTasks: make(map[string]*types.Task), - workers: make(map[string]*types.Worker), - workerStatus: make(map[string]*types.WorkerStatus), - taskHistory: make([]MinimalTaskHistoryEntry, 0), - } -} - -// Start starts the minimal admin server -func (as *MinimalAdminServer) Start() error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if as.running { - return fmt.Errorf("admin server is already running") - } - - as.running = true - return nil -} - -// Stop stops the minimal admin server -func (as *MinimalAdminServer) Stop() error { - as.mutex.Lock() - defer as.mutex.Unlock() - - as.running = false - return nil -} - -// RegisterWorker registers a new worker -func (as *MinimalAdminServer) RegisterWorker(worker *types.Worker) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - as.workers[worker.ID] = worker - as.workerStatus[worker.ID] = &types.WorkerStatus{ - Status: "active", - CurrentLoad: 0, - } - - return nil -} - -// QueueTask adds a new task to the task queue -func (as *MinimalAdminServer) QueueTask(task *types.Task) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - if task.ID == "" { - task.ID = 
fmt.Sprintf("task-%d", time.Now().UnixNano()) - } - - task.Status = types.TaskStatusPending - task.CreatedAt = time.Now() - - as.tasks[task.ID] = task - as.taskQueue = append(as.taskQueue, task) - - return nil -} - -// RequestTask requests a task for a worker -func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return nil, fmt.Errorf("admin server is not running") - } - - // Check if worker exists - worker, exists := as.workers[workerID] - if !exists { - return nil, fmt.Errorf("worker %s not found", workerID) - } - - // Check if worker has capacity - status := as.workerStatus[workerID] - if status.CurrentLoad >= worker.MaxConcurrent { - return nil, nil // No capacity - } - - // Find a suitable task - for i, task := range as.taskQueue { - if task.Status != types.TaskStatusPending { - continue - } - - // Check if worker can handle this task type - canHandle := false - for _, capability := range capabilities { - if task.Type == capability { - canHandle = true - break - } - } - - if canHandle { - // Assign task to worker - task.Status = types.TaskStatusInProgress - task.WorkerID = workerID - now := time.Now() - task.StartedAt = &now - - // Move task from queue to active tasks - as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...) 
- as.activeTasks[task.ID] = task - - // Update worker load - status.CurrentLoad++ - - return task, nil - } - } - - return nil, nil // No suitable task found -} - -// UpdateTaskProgress updates task progress -func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - task, exists := as.tasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - task.Progress = progress - - return nil -} - -// CompleteTask marks a task as completed -func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - task, exists := as.tasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - // Update task status - if success { - task.Status = types.TaskStatusCompleted - } else { - task.Status = types.TaskStatusFailed - task.Error = errorMessage - } - - now := time.Now() - task.CompletedAt = &now - - // Remove from active tasks - delete(as.activeTasks, taskID) - - // Update worker load - if task.WorkerID != "" { - if status, exists := as.workerStatus[task.WorkerID]; exists { - status.CurrentLoad-- - } - } - - // Add to history - var duration time.Duration - if task.StartedAt != nil { - duration = now.Sub(*task.StartedAt) - } - - entry := MinimalTaskHistoryEntry{ - TaskID: task.ID, - TaskType: task.Type, - VolumeID: task.VolumeID, - WorkerID: task.WorkerID, - Status: task.Status, - StartedAt: *task.StartedAt, - CompletedAt: now, - Duration: duration, - ErrorMessage: errorMessage, - } - as.taskHistory = append(as.taskHistory, entry) - - return nil -} - -// UpdateWorkerHeartbeat updates worker heartbeat -func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - worker, exists := as.workers[workerID] - if !exists { - return fmt.Errorf("worker %s not found", workerID) - } - - 
worker.LastHeartbeat = time.Now() - as.workerStatus[workerID] = status - - return nil -} - -// GetSystemStats returns system statistics -func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats { - as.mutex.RLock() - defer as.mutex.RUnlock() - - activeWorkers := 0 - for _, status := range as.workerStatus { - if status.Status == "active" { - activeWorkers++ - } - } - - return &MinimalSystemStats{ - ActiveTasks: len(as.activeTasks), - QueuedTasks: len(as.taskQueue), - ActiveWorkers: activeWorkers, - TotalTasks: len(as.tasks), - } -} - -// GetQueuedTaskCount returns the number of queued tasks -func (as *MinimalAdminServer) GetQueuedTaskCount() int { - as.mutex.RLock() - defer as.mutex.RUnlock() - return len(as.taskQueue) -} - -// GetActiveTaskCount returns the number of active tasks -func (as *MinimalAdminServer) GetActiveTaskCount() int { - as.mutex.RLock() - defer as.mutex.RUnlock() - return len(as.activeTasks) -} - -// GetTaskHistory returns task history -func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry { - as.mutex.RLock() - defer as.mutex.RUnlock() - - // Return a copy of the history - history := make([]MinimalTaskHistoryEntry, len(as.taskHistory)) - copy(history, as.taskHistory) - return history -} diff --git a/weed/admin/task/minimal_integration_test.go b/weed/admin/task/minimal_integration_test.go deleted file mode 100644 index c690456ef..000000000 --- a/weed/admin/task/minimal_integration_test.go +++ /dev/null @@ -1,434 +0,0 @@ -package task - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation -func TestMinimalIntegration(t *testing.T) { - t.Logf("Starting minimal integration test") - - // Step 1: Create a minimal admin server configuration - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - 
MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - // Step 2: Create minimal admin server with nil master client (for testing) - adminServer := NewMinimalAdminServer(config, nil) - - // Step 3: Start admin server - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Step 4: Test worker registration - t.Logf("Testing worker registration") - - worker := &types.Worker{ - ID: "test-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - t.Logf("Successfully registered worker %s", worker.ID) - - // Step 5: Test task queueing - t.Logf("Testing task queueing") - - task := &types.Task{ - ID: "test-task-1", - Type: types.TaskTypeVacuum, - VolumeID: 1001, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(task) - if err != nil { - t.Fatalf("Failed to queue task: %v", err) - } - t.Logf("Successfully queued task %s", task.ID) - - // Step 6: Test task request by worker - t.Logf("Testing task request") - - assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Fatalf("Failed to request task: %v", err) - } - - if assignedTask != nil { - t.Logf("Successfully assigned task %s to worker", assignedTask.ID) - - // Step 7: Test task progress updates - t.Logf("Testing task progress updates") - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) - if err != nil { - t.Errorf("Failed to update task progress to 25%%: %v", err) - } - - err = 
adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) - if err != nil { - t.Errorf("Failed to update task progress to 50%%: %v", err) - } - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0) - if err != nil { - t.Errorf("Failed to update task progress to 75%%: %v", err) - } - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update task progress to 100%%: %v", err) - } - - // Step 8: Test task completion - t.Logf("Testing task completion") - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - t.Logf("Successfully completed task %s", assignedTask.ID) - } else { - t.Logf("No task was assigned (queue might be empty)") - } - - // Step 9: Test basic metrics - t.Logf("Testing basic metrics") - - stats := adminServer.GetSystemStats() - if stats != nil { - t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - } - - queuedCount := adminServer.GetQueuedTaskCount() - activeCount := adminServer.GetActiveTaskCount() - t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) - - // Step 10: Test task history - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d entries", len(history)) - - if len(history) > 0 { - lastEntry := history[len(history)-1] - t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v", - lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration) - } - - t.Logf("Minimal integration test completed successfully") -} - -// TestMinimalWorkerHeartbeat tests worker heartbeat functionality -func TestMinimalWorkerHeartbeat(t *testing.T) { - t.Logf("Testing minimal worker heartbeat") - - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - 
ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Register a worker - worker := &types.Worker{ - ID: "heartbeat-worker", - Address: "localhost:9002", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - - // Test heartbeat update - status := &types.WorkerStatus{ - Status: "active", - CurrentLoad: 0, - } - - err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) - if err != nil { - t.Errorf("Failed to update worker heartbeat: %v", err) - } - - t.Logf("Minimal worker heartbeat test completed successfully") -} - -// TestMinimalTaskQueueOperations tests task queue operations -func TestMinimalTaskQueueOperations(t *testing.T) { - t.Logf("Testing minimal task queue operations") - - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Test queuing multiple tasks - taskCount := 3 - for i := 0; i < taskCount; i++ { - task := &types.Task{ - ID: fmt.Sprintf("queue-test-task-%d", i), - Type: types.TaskTypeVacuum, - VolumeID: uint32(2000 + i), - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - }, - CreatedAt: time.Now(), - } - - 
err = adminServer.QueueTask(task) - if err != nil { - t.Errorf("Failed to queue task %d: %v", i, err) - } - } - - // Check queue size - queuedCount := adminServer.GetQueuedTaskCount() - if queuedCount != taskCount { - t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount) - } - - t.Logf("Minimal task queue operations test completed successfully") -} - -// TestMinimalFullWorkflow tests the complete workflow from task creation to completion -func TestMinimalFullWorkflow(t *testing.T) { - t.Logf("Testing minimal full workflow") - - config := &MinimalAdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewMinimalAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Register multiple workers with different capabilities - workers := []*types.Worker{ - { - ID: "vacuum-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - { - ID: "ec-worker-1", - Address: "localhost:9002", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - { - ID: "multi-worker-1", - Address: "localhost:9003", - Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}, - MaxConcurrent: 3, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - }, - } - - for _, worker := range workers { - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker %s: %v", worker.ID, err) - } - t.Logf("Registered worker %s with capabilities %v", worker.ID, worker.Capabilities) - } - - // Create multiple 
tasks of different types - tasks := []*types.Task{ - { - ID: "vacuum-task-1", - Type: types.TaskTypeVacuum, - VolumeID: 3001, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.4", - }, - CreatedAt: time.Now(), - }, - { - ID: "ec-task-1", - Type: types.TaskTypeErasureCoding, - VolumeID: 3002, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityHigh, - Parameters: map[string]interface{}{ - "shard_count": "14", - }, - CreatedAt: time.Now(), - }, - { - ID: "vacuum-task-2", - Type: types.TaskTypeVacuum, - VolumeID: 3003, - Server: "localhost:8081", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityLow, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.5", - }, - CreatedAt: time.Now(), - }, - } - - for _, task := range tasks { - err = adminServer.QueueTask(task) - if err != nil { - t.Fatalf("Failed to queue task %s: %v", task.ID, err) - } - t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID) - } - - // Test task assignment to different workers - t.Logf("Testing task assignments") - - // Vacuum worker should get vacuum tasks - assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Errorf("Failed to request task for vacuum worker: %v", err) - } else if assignedTask != nil { - t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - } - - // EC worker should get EC tasks - assignedTask, err = adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) - if err != 
nil { - t.Errorf("Failed to request task for EC worker: %v", err) - } else if assignedTask != nil { - t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - } - - // Multi-capability worker should be able to get any remaining task - assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) - if err != nil { - t.Errorf("Failed to request task for multi worker: %v", err) - } else if assignedTask != nil { - t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) - - // Complete the task - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update progress: %v", err) - } - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - } - - // Check final statistics - stats := adminServer.GetSystemStats() - t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) - - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d completed tasks", len(history)) - - for _, entry := range history { - t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v", - entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration) - } - - t.Logf("Minimal full workflow test completed successfully") -} diff --git a/weed/admin/task/operational_integration_test.go b/weed/admin/task/operational_integration_test.go deleted file mode 100644 index e9966ef5b..000000000 --- a/weed/admin/task/operational_integration_test.go +++ /dev/null @@ -1,197 +0,0 @@ 
-package task - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestOperationalIntegration tests the basic admin-worker operational flow -func TestOperationalIntegration(t *testing.T) { - t.Logf("Starting operational integration test") - - // Step 1: Create admin server with operational configuration - config := &AdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - // Create a nil master client for testing (simplified) - var masterClient *wdclient.MasterClient - - adminServer := NewAdminServer(config, masterClient) - - // Step 2: Start admin server - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Step 3: Create and register test workers - worker1 := createTestWorker("worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) - worker2 := createTestWorker("worker-2", []types.TaskType{types.TaskTypeVacuum}) - - err = adminServer.RegisterWorker(worker1) - if err != nil { - t.Fatalf("Failed to register worker1: %v", err) - } - - err = adminServer.RegisterWorker(worker2) - if err != nil { - t.Fatalf("Failed to register worker2: %v", err) - } - - // Step 4: Test basic task queueing - t.Logf("Testing task queueing") - - // Create a simple test task - testTask := &types.Task{ - ID: "test-vacuum-1", - Type: types.TaskTypeVacuum, - VolumeID: 1001, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - "server": "localhost:8080", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(testTask) - if err != nil { - t.Fatalf("Failed to queue test task: %v", err) - } - 
t.Logf("Successfully queued test vacuum task for volume %d", testTask.VolumeID) - - // Step 5: Test worker task request and assignment - t.Logf("Testing worker task requests and assignment") - - // Worker requests task - task, err := adminServer.RequestTask("worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Fatalf("Failed to request task from worker: %v", err) - } - - if task == nil { - t.Logf("No tasks available for assignment (this is expected in test environment)") - } else { - t.Logf("Successfully assigned task %s (%s) to worker-1", task.ID, task.Type) - - // Step 6: Simulate task progress updates - t.Logf("Testing task progress updates") - - err = adminServer.UpdateTaskProgress(task.ID, 25.0) - if err != nil { - t.Errorf("Failed to update task progress: %v", err) - } - - err = adminServer.UpdateTaskProgress(task.ID, 50.0) - if err != nil { - t.Errorf("Failed to update task progress: %v", err) - } - - err = adminServer.UpdateTaskProgress(task.ID, 100.0) - if err != nil { - t.Errorf("Failed to update task progress: %v", err) - } - - // Step 7: Test task completion - t.Logf("Testing task completion") - - err = adminServer.CompleteTask(task.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - - t.Logf("Successfully completed task %s", task.ID) - } - - // Step 8: Test metrics and statistics - t.Logf("Testing system metrics") - - stats := adminServer.GetSystemStats() - t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers) - - queuedCount := adminServer.GetQueuedTaskCount() - activeCount := adminServer.GetActiveTaskCount() - t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) - - // Step 9: Test task history - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d entries", len(history)) - - t.Logf("Operational integration test completed successfully") -} - -func createTestWorker(id 
string, capabilities []types.TaskType) *types.Worker { - return &types.Worker{ - ID: id, - Address: fmt.Sprintf("localhost:900%s", id[len(id)-1:]), - Capabilities: capabilities, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } -} - -// TestECTaskExecution tests the EC task validation (without actual execution) -func TestECTaskExecution(t *testing.T) { - t.Logf("Testing EC task validation") - - params := types.TaskParams{ - VolumeID: 1002, - Server: "localhost:8080", - Collection: "test", - Parameters: map[string]interface{}{ - "volume_size": int64(32 * 1024 * 1024 * 1024), - }, - } - - // Test that basic validation would work - if params.VolumeID == 0 { - t.Errorf("VolumeID should not be zero") - } - if params.Server == "" { - t.Errorf("Server should not be empty") - } - - t.Logf("EC task validation passed") -} - -// TestVacuumTaskExecution tests the vacuum task validation (without actual execution) -func TestVacuumTaskExecution(t *testing.T) { - t.Logf("Testing vacuum task validation") - - params := types.TaskParams{ - VolumeID: 1001, - Server: "localhost:8080", - Collection: "test", - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - "volume_size": int64(25 * 1024 * 1024 * 1024), - }, - } - - // Test that basic validation would work - if params.VolumeID == 0 { - t.Errorf("VolumeID should not be zero") - } - if params.Server == "" { - t.Errorf("Server should not be empty") - } - - t.Logf("Vacuum task validation passed") -} diff --git a/weed/admin/task/simple_integration_test.go b/weed/admin/task/simple_integration_test.go deleted file mode 100644 index a7859e569..000000000 --- a/weed/admin/task/simple_integration_test.go +++ /dev/null @@ -1,233 +0,0 @@ -package task - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies -func TestSimpleIntegration(t *testing.T) { - 
t.Logf("Starting simple integration test") - - // Step 1: Create a minimal admin server configuration - config := &AdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - // Step 2: Create admin server with nil master client (for testing) - adminServer := NewAdminServer(config, nil) - - // Step 3: Start admin server - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Step 4: Test worker registration - t.Logf("Testing worker registration") - - worker := &types.Worker{ - ID: "test-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - t.Logf("Successfully registered worker %s", worker.ID) - - // Step 5: Test task queueing - t.Logf("Testing task queueing") - - task := &types.Task{ - ID: "test-task-1", - Type: types.TaskTypeVacuum, - VolumeID: 1001, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(task) - if err != nil { - t.Fatalf("Failed to queue task: %v", err) - } - t.Logf("Successfully queued task %s", task.ID) - - // Step 6: Test task request by worker - t.Logf("Testing task request") - - assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Fatalf("Failed to request task: %v", err) - } - - if assignedTask != nil { - t.Logf("Successfully assigned task %s to worker", assignedTask.ID) - - // Step 7: Test task 
progress updates - t.Logf("Testing task progress updates") - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) - if err != nil { - t.Errorf("Failed to update task progress: %v", err) - } - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update task progress: %v", err) - } - - // Step 8: Test task completion - t.Logf("Testing task completion") - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - t.Logf("Successfully completed task %s", assignedTask.ID) - } else { - t.Logf("No task was assigned (queue might be empty)") - } - - // Step 9: Test basic metrics - t.Logf("Testing basic metrics") - - stats := adminServer.GetSystemStats() - if stats != nil { - t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers) - } - - queuedCount := adminServer.GetQueuedTaskCount() - activeCount := adminServer.GetActiveTaskCount() - t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) - - // Step 10: Test task history - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d entries", len(history)) - - t.Logf("Simple integration test completed successfully") -} - -// TestWorkerHeartbeat tests worker heartbeat functionality -func TestWorkerHeartbeat(t *testing.T) { - t.Logf("Testing worker heartbeat") - - config := &AdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Register a worker - worker := &types.Worker{ - ID: "heartbeat-worker", - Address: "localhost:9002", - 
Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - - // Test heartbeat update - status := &types.WorkerStatus{ - Status: "active", - CurrentLoad: 0, - } - - err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) - if err != nil { - t.Errorf("Failed to update worker heartbeat: %v", err) - } - - t.Logf("Worker heartbeat test completed successfully") -} - -// TestTaskQueueOperations tests task queue operations -func TestTaskQueueOperations(t *testing.T) { - t.Logf("Testing task queue operations") - - config := &AdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Test queuing multiple tasks - for i := 0; i < 3; i++ { - task := &types.Task{ - ID: fmt.Sprintf("queue-test-task-%d", i), - Type: types.TaskTypeVacuum, - VolumeID: uint32(2000 + i), - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(task) - if err != nil { - t.Errorf("Failed to queue task %d: %v", i, err) - } - } - - // Check queue size - queuedCount := adminServer.GetQueuedTaskCount() - if queuedCount != 3 { - t.Errorf("Expected 3 queued tasks, got %d", queuedCount) - } - - t.Logf("Task queue operations test completed successfully") -} diff --git a/weed/admin/task/simulation.go b/weed/admin/task/simulation.go deleted file mode 100644 index 
e30b326fc..000000000 --- a/weed/admin/task/simulation.go +++ /dev/null @@ -1,604 +0,0 @@ -package task - -import ( - "context" - "fmt" - "math/rand" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TaskSimulator provides a comprehensive simulation framework for testing the task distribution system -type TaskSimulator struct { - adminServer *AdminServer - mockWorkers []*MockWorker - mockMaster *MockMasterClient - scenarios map[string]*SimulationScenario - results map[string]*SimulationResult - mutex sync.RWMutex -} - -// SimulationScenario defines a test scenario -type SimulationScenario struct { - Name string - Description string - WorkerCount int - VolumeCount int - Duration time.Duration - FailurePatterns []*FailurePattern - TestCases []*TestCase -} - -// FailurePattern defines how failures occur during simulation -type FailurePattern struct { - Type FailureType - Probability float64 // 0.0 to 1.0 - Timing *TimingSpec // When during task execution - Duration time.Duration - Details string -} - -// TestCase defines specific test scenarios -type TestCase struct { - Name string - VolumeID uint32 - TaskType types.TaskType - ExpectedOutcome string - FailureToInject *FailurePattern -} - -// FailureType represents different types of failures -type FailureType string - -const ( - FailureWorkerTimeout FailureType = "worker_timeout" - FailureTaskStuck FailureType = "task_stuck" - FailureTaskCrash FailureType = "task_crash" - FailureDuplicate FailureType = "duplicate_task" - FailureResourceExhaust FailureType = "resource_exhaustion" - FailureNetworkPartition FailureType = "network_partition" -) - -// TimingSpec defines when a failure occurs -type TimingSpec struct { - MinProgress float64 // Minimum progress before failure can occur - MaxProgress float64 // Maximum progress before failure must occur - Delay time.Duration // Fixed delay before failure -} - -// SimulationResult tracks the results of 
a simulation -type SimulationResult struct { - ScenarioName string - StartTime time.Time - EndTime time.Time - Duration time.Duration - TasksCreated int - TasksCompleted int - TasksFailed int - TasksStuck int - WorkerTimeouts int - DuplicatesFound int - StateInconsistencies int - Errors []string - Warnings []string - Success bool -} - -// MockWorker simulates a worker with controllable behavior -type MockWorker struct { - ID string - Capabilities []types.TaskType - MaxConcurrent int - CurrentTasks map[string]*MockTask - Status string - FailureMode *FailurePattern - mutex sync.Mutex -} - -// MockTask represents a simulated task execution -type MockTask struct { - Task *types.Task - StartTime time.Time - Progress float64 - Stuck bool - Failed bool - Completed bool -} - -// MockMasterClient simulates master server interactions -type MockMasterClient struct { - volumes map[uint32]*VolumeInfo - inconsistency bool - mutex sync.RWMutex -} - -// NewTaskSimulator creates a new task simulator -func NewTaskSimulator() *TaskSimulator { - return &TaskSimulator{ - scenarios: make(map[string]*SimulationScenario), - results: make(map[string]*SimulationResult), - } -} - -// RegisterScenario registers a simulation scenario -func (ts *TaskSimulator) RegisterScenario(scenario *SimulationScenario) { - ts.mutex.Lock() - defer ts.mutex.Unlock() - - ts.scenarios[scenario.Name] = scenario - glog.Infof("Registered simulation scenario: %s", scenario.Name) -} - -// RunScenario executes a simulation scenario -func (ts *TaskSimulator) RunScenario(scenarioName string) (*SimulationResult, error) { - ts.mutex.RLock() - scenario, exists := ts.scenarios[scenarioName] - ts.mutex.RUnlock() - - if !exists { - return nil, fmt.Errorf("scenario %s not found", scenarioName) - } - - glog.Infof("Starting simulation scenario: %s", scenarioName) - - result := &SimulationResult{ - ScenarioName: scenarioName, - StartTime: time.Now(), - Errors: make([]string, 0), - Warnings: make([]string, 0), - } - - // Setup 
simulation environment - if err := ts.setupEnvironment(scenario); err != nil { - return nil, fmt.Errorf("failed to setup environment: %v", err) - } - - // Execute test cases - ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration) - defer cancel() - - ts.executeScenario(ctx, scenario, result) - - // Cleanup - ts.cleanup() - - result.EndTime = time.Now() - result.Duration = result.EndTime.Sub(result.StartTime) - result.Success = len(result.Errors) == 0 - - ts.mutex.Lock() - ts.results[scenarioName] = result - ts.mutex.Unlock() - - glog.Infof("Completed simulation scenario: %s (success: %v)", scenarioName, result.Success) - return result, nil -} - -// setupEnvironment prepares the simulation environment -func (ts *TaskSimulator) setupEnvironment(scenario *SimulationScenario) error { - // Create mock master client - ts.mockMaster = &MockMasterClient{ - volumes: make(map[uint32]*VolumeInfo), - } - - // Generate mock volumes - for i := uint32(1); i <= uint32(scenario.VolumeCount); i++ { - volume := &VolumeInfo{ - ID: i, - Size: uint64(rand.Intn(30 * 1024 * 1024 * 1024)), // Random size up to 30GB - Collection: fmt.Sprintf("collection_%d", (i%3)+1), - DeletedByteCount: uint64(rand.Intn(1024 * 1024 * 1024)), // Random garbage - ReadOnly: false, - Server: fmt.Sprintf("server_%d", (i%6)+1), - ModifiedAtSecond: time.Now().Add(-time.Duration(rand.Intn(86400)) * time.Second).Unix(), - } - ts.mockMaster.volumes[i] = volume - } - - // Create mock workers - ts.mockWorkers = make([]*MockWorker, scenario.WorkerCount) - for i := 0; i < scenario.WorkerCount; i++ { - worker := &MockWorker{ - ID: fmt.Sprintf("worker_%d", i+1), - Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, - MaxConcurrent: 2, - CurrentTasks: make(map[string]*MockTask), - Status: "active", - } - - // Apply failure patterns - if i < len(scenario.FailurePatterns) { - worker.FailureMode = scenario.FailurePatterns[i] - } - - ts.mockWorkers[i] = worker - } - - // 
Initialize admin server (simplified for simulation) - config := DefaultAdminConfig() - config.ScanInterval = 10 * time.Second - config.TaskTimeout = 30 * time.Second - - // Note: In a real implementation, this would use the actual master client - // For simulation, we'd need to inject our mock - - return nil -} - -// executeScenario runs the actual simulation scenario -func (ts *TaskSimulator) executeScenario(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) { - // Execute each test case - for _, testCase := range scenario.TestCases { - ts.executeTestCase(ctx, testCase, result) - } - - // Run continuous simulation for remaining duration - ts.runContinuousSimulation(ctx, scenario, result) -} - -// executeTestCase runs a specific test case -func (ts *TaskSimulator) executeTestCase(ctx context.Context, testCase *TestCase, result *SimulationResult) { - glog.V(1).Infof("Executing test case: %s", testCase.Name) - - // Create task for the test case - task := &types.Task{ - ID: fmt.Sprintf("test_%s_%d", testCase.Name, time.Now().UnixNano()), - Type: testCase.TaskType, - VolumeID: testCase.VolumeID, - Priority: types.TaskPriorityNormal, - CreatedAt: time.Now(), - } - - result.TasksCreated++ - - // Assign to worker - worker := ts.selectWorkerForTask(task) - if worker == nil { - result.Errors = append(result.Errors, fmt.Sprintf("No available worker for test case %s", testCase.Name)) - return - } - - // Execute task with potential failure injection - ts.executeTaskOnWorker(ctx, task, worker, testCase.FailureToInject, result) -} - -// runContinuousSimulation runs ongoing simulation -func (ts *TaskSimulator) runContinuousSimulation(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) { - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - ts.simulateOngoingTasks(result) - ts.checkForInconsistencies(result) - } - } -} - -// executeTaskOnWorker 
simulates task execution on a worker -func (ts *TaskSimulator) executeTaskOnWorker(ctx context.Context, task *types.Task, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) { - worker.mutex.Lock() - defer worker.mutex.Unlock() - - mockTask := &MockTask{ - Task: task, - StartTime: time.Now(), - Progress: 0.0, - } - - worker.CurrentTasks[task.ID] = mockTask - - // Simulate task execution - go ts.simulateTaskExecution(ctx, mockTask, worker, failurePattern, result) -} - -// simulateTaskExecution simulates the execution of a single task -func (ts *TaskSimulator) simulateTaskExecution(ctx context.Context, mockTask *MockTask, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) { - defer func() { - worker.mutex.Lock() - delete(worker.CurrentTasks, mockTask.Task.ID) - worker.mutex.Unlock() - }() - - duration := 20 * time.Second // Base task duration - progressTicker := time.NewTicker(time.Second) - defer progressTicker.Stop() - - startTime := time.Now() - - for { - select { - case <-ctx.Done(): - return - case <-progressTicker.C: - elapsed := time.Since(startTime) - progress := float64(elapsed) / float64(duration) * 100.0 - - if progress >= 100.0 { - mockTask.Completed = true - result.TasksCompleted++ - glog.V(2).Infof("Task %s completed successfully", mockTask.Task.ID) - return - } - - mockTask.Progress = progress - - // Check for failure injection - if failurePattern != nil && ts.shouldInjectFailure(failurePattern, progress, elapsed) { - ts.injectFailure(mockTask, worker, failurePattern, result) - return - } - - // Check for worker failure mode - if worker.FailureMode != nil && ts.shouldInjectFailure(worker.FailureMode, progress, elapsed) { - ts.injectFailure(mockTask, worker, worker.FailureMode, result) - return - } - } - } -} - -// shouldInjectFailure determines if a failure should be injected -func (ts *TaskSimulator) shouldInjectFailure(pattern *FailurePattern, progress float64, elapsed time.Duration) bool { - if 
pattern.Timing != nil { - if progress < pattern.Timing.MinProgress || progress > pattern.Timing.MaxProgress { - return false - } - if elapsed < pattern.Timing.Delay { - return false - } - } - - return rand.Float64() < pattern.Probability -} - -// injectFailure simulates a failure -func (ts *TaskSimulator) injectFailure(mockTask *MockTask, worker *MockWorker, pattern *FailurePattern, result *SimulationResult) { - glog.Warningf("Injecting failure: %s for task %s", pattern.Type, mockTask.Task.ID) - - switch pattern.Type { - case FailureWorkerTimeout: - worker.Status = "timeout" - result.WorkerTimeouts++ - - case FailureTaskStuck: - mockTask.Stuck = true - result.TasksStuck++ - - case FailureTaskCrash: - mockTask.Failed = true - result.TasksFailed++ - - case FailureDuplicate: - result.DuplicatesFound++ - - case FailureResourceExhaust: - worker.Status = "resource_exhausted" - result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s resource exhausted", worker.ID)) - - case FailureNetworkPartition: - worker.Status = "partitioned" - result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s network partitioned", worker.ID)) - } -} - -// selectWorkerForTask selects an available worker for a task -func (ts *TaskSimulator) selectWorkerForTask(task *types.Task) *MockWorker { - for _, worker := range ts.mockWorkers { - if worker.Status == "active" && len(worker.CurrentTasks) < worker.MaxConcurrent { - // Check capabilities - for _, capability := range worker.Capabilities { - if capability == task.Type { - return worker - } - } - } - } - return nil -} - -// simulateOngoingTasks handles ongoing task simulation -func (ts *TaskSimulator) simulateOngoingTasks(result *SimulationResult) { - // Create random new tasks - if rand.Float64() < 0.3 { // 30% chance to create new task every tick - taskType := types.TaskTypeVacuum - if rand.Float64() < 0.5 { - taskType = types.TaskTypeErasureCoding - } - - task := &types.Task{ - ID: fmt.Sprintf("auto_%d", time.Now().UnixNano()), 
- Type: taskType, - VolumeID: uint32(rand.Intn(len(ts.mockMaster.volumes)) + 1), - Priority: types.TaskPriorityNormal, - CreatedAt: time.Now(), - } - - result.TasksCreated++ - - worker := ts.selectWorkerForTask(task) - if worker != nil { - ts.executeTaskOnWorker(context.Background(), task, worker, nil, result) - } - } -} - -// checkForInconsistencies checks for state inconsistencies -func (ts *TaskSimulator) checkForInconsistencies(result *SimulationResult) { - // Check for volume reservation inconsistencies - // Check for duplicate tasks - // Check for orphaned tasks - // This would be more comprehensive in a real implementation - - for _, worker := range ts.mockWorkers { - worker.mutex.Lock() - for taskID, mockTask := range worker.CurrentTasks { - if mockTask.Stuck && time.Since(mockTask.StartTime) > 60*time.Second { - result.StateInconsistencies++ - result.Warnings = append(result.Warnings, fmt.Sprintf("Long-running stuck task detected: %s", taskID)) - } - } - worker.mutex.Unlock() - } -} - -// cleanup cleans up simulation resources -func (ts *TaskSimulator) cleanup() { - ts.mockWorkers = nil - ts.mockMaster = nil -} - -// GetSimulationResults returns all simulation results -func (ts *TaskSimulator) GetSimulationResults() map[string]*SimulationResult { - ts.mutex.RLock() - defer ts.mutex.RUnlock() - - results := make(map[string]*SimulationResult) - for k, v := range ts.results { - results[k] = v - } - return results -} - -// CreateStandardScenarios creates a set of standard test scenarios -func (ts *TaskSimulator) CreateStandardScenarios() { - // Scenario 1: Worker Timeout During EC - ts.RegisterScenario(&SimulationScenario{ - Name: "worker_timeout_during_ec", - Description: "Test worker timeout during erasure coding operation", - WorkerCount: 3, - VolumeCount: 10, - Duration: 2 * time.Minute, - FailurePatterns: []*FailurePattern{ - { - Type: FailureWorkerTimeout, - Probability: 1.0, - Timing: &TimingSpec{ - MinProgress: 50.0, - MaxProgress: 60.0, - }, - }, - }, 
- TestCases: []*TestCase{ - { - Name: "ec_timeout_test", - VolumeID: 1, - TaskType: types.TaskTypeErasureCoding, - ExpectedOutcome: "task_reassigned", - }, - }, - }) - - // Scenario 2: Stuck Vacuum Task - ts.RegisterScenario(&SimulationScenario{ - Name: "stuck_vacuum_task", - Description: "Test stuck vacuum task detection and cleanup", - WorkerCount: 2, - VolumeCount: 5, - Duration: 90 * time.Second, - TestCases: []*TestCase{ - { - Name: "vacuum_stuck_test", - VolumeID: 2, - TaskType: types.TaskTypeVacuum, - FailureToInject: &FailurePattern{ - Type: FailureTaskStuck, - Probability: 1.0, - Timing: &TimingSpec{ - MinProgress: 75.0, - MaxProgress: 80.0, - }, - }, - ExpectedOutcome: "task_timeout_detected", - }, - }, - }) - - // Scenario 3: Duplicate Task Prevention - ts.RegisterScenario(&SimulationScenario{ - Name: "duplicate_task_prevention", - Description: "Test duplicate task detection and prevention", - WorkerCount: 4, - VolumeCount: 8, - Duration: 60 * time.Second, - TestCases: []*TestCase{ - { - Name: "duplicate_ec_test_1", - VolumeID: 3, - TaskType: types.TaskTypeErasureCoding, - }, - { - Name: "duplicate_ec_test_2", // Same volume, should be detected as duplicate - VolumeID: 3, - TaskType: types.TaskTypeErasureCoding, - FailureToInject: &FailurePattern{ - Type: FailureDuplicate, - Probability: 1.0, - }, - ExpectedOutcome: "duplicate_detected", - }, - }, - }) - - // Scenario 4: Master-Admin State Divergence - ts.RegisterScenario(&SimulationScenario{ - Name: "master_admin_divergence", - Description: "Test state reconciliation between master and admin server", - WorkerCount: 3, - VolumeCount: 15, - Duration: 2 * time.Minute, - TestCases: []*TestCase{ - { - Name: "state_reconciliation_test", - VolumeID: 4, - TaskType: types.TaskTypeErasureCoding, - ExpectedOutcome: "state_reconciled", - }, - }, - }) -} - -// GenerateSimulationReport creates a comprehensive report of simulation results -func (ts *TaskSimulator) GenerateSimulationReport() string { - ts.mutex.RLock() 
- defer ts.mutex.RUnlock() - - report := "# Task Distribution System Simulation Report\n\n" - - for scenarioName, result := range ts.results { - report += fmt.Sprintf("## Scenario: %s\n", scenarioName) - report += fmt.Sprintf("- **Duration**: %v\n", result.Duration) - report += fmt.Sprintf("- **Success**: %v\n", result.Success) - report += fmt.Sprintf("- **Tasks Created**: %d\n", result.TasksCreated) - report += fmt.Sprintf("- **Tasks Completed**: %d\n", result.TasksCompleted) - report += fmt.Sprintf("- **Tasks Failed**: %d\n", result.TasksFailed) - report += fmt.Sprintf("- **Tasks Stuck**: %d\n", result.TasksStuck) - report += fmt.Sprintf("- **Worker Timeouts**: %d\n", result.WorkerTimeouts) - report += fmt.Sprintf("- **Duplicates Found**: %d\n", result.DuplicatesFound) - report += fmt.Sprintf("- **State Inconsistencies**: %d\n", result.StateInconsistencies) - - if len(result.Errors) > 0 { - report += "- **Errors**:\n" - for _, err := range result.Errors { - report += fmt.Sprintf(" - %s\n", err) - } - } - - if len(result.Warnings) > 0 { - report += "- **Warnings**:\n" - for _, warning := range result.Warnings { - report += fmt.Sprintf(" - %s\n", warning) - } - } - - report += "\n" - } - - return report -} diff --git a/weed/admin/task/simulation/comprehensive_simulation.go b/weed/admin/task/simulation/comprehensive_simulation.go deleted file mode 100644 index 127c201d6..000000000 --- a/weed/admin/task/simulation/comprehensive_simulation.go +++ /dev/null @@ -1,695 +0,0 @@ -package simulation - -import ( - "context" - "fmt" - "math/rand" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/admin/task" - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// ComprehensiveSimulator tests all possible edge cases in volume/shard state management -type ComprehensiveSimulator struct { - stateManager *task.VolumeStateManager - mockMaster *MockMasterServer - mockWorkers []*MockWorker - scenarios []*StateTestScenario - 
currentScenario *StateTestScenario - results *SimulationResults - eventLog []*SimulationEvent - mutex sync.RWMutex -} - -// StateTestScenario represents a specific state management test case -type StateTestScenario struct { - Name string - Description string - InitialState *ClusterState - EventSequence []*SimulationEvent - ExpectedFinalState *ClusterState - InconsistencyChecks []*InconsistencyCheck - Duration time.Duration -} - -// ClusterState represents the complete state of the cluster -type ClusterState struct { - Volumes map[uint32]*task.VolumeInfo - ECShards map[uint32]map[int]*task.ShardInfo - ServerCapacity map[string]*task.CapacityInfo - InProgressTasks map[string]*task.TaskImpact - Timestamp time.Time -} - -// SimulationEvent represents an event that can occur during simulation -type SimulationEvent struct { - Type EventType - Timestamp time.Time - VolumeID uint32 - ShardID *int - Server string - TaskID string - Parameters map[string]interface{} - Description string -} - -// EventType represents different types of simulation events -type EventType string - -const ( - // Volume events - EventVolumeCreated EventType = "volume_created" - EventVolumeDeleted EventType = "volume_deleted" - EventVolumeSizeChanged EventType = "volume_size_changed" - EventVolumeReadOnly EventType = "volume_readonly" - - // Shard events - EventShardCreated EventType = "shard_created" - EventShardDeleted EventType = "shard_deleted" - EventShardMoved EventType = "shard_moved" - EventShardCorrupted EventType = "shard_corrupted" - - // Task events - EventTaskStarted EventType = "task_started" - EventTaskCompleted EventType = "task_completed" - EventTaskFailed EventType = "task_failed" - EventTaskStuck EventType = "task_stuck" - EventTaskCancelled EventType = "task_cancelled" - - // Worker events - EventWorkerJoined EventType = "worker_joined" - EventWorkerLeft EventType = "worker_left" - EventWorkerTimeout EventType = "worker_timeout" - EventWorkerRestarted EventType = 
"worker_restarted" - - // Master events - EventMasterSync EventType = "master_sync" - EventMasterInconsistent EventType = "master_inconsistent" - EventMasterPartitioned EventType = "master_partitioned" - EventMasterReconnected EventType = "master_reconnected" - - // Network events - EventNetworkPartition EventType = "network_partition" - EventNetworkHealed EventType = "network_healed" - EventMessageDelayed EventType = "message_delayed" - EventMessageLost EventType = "message_lost" -) - -// InconsistencyCheck defines what inconsistencies to check for -type InconsistencyCheck struct { - Name string - Type task.InconsistencyType - ExpectedCount int - MaxAllowedCount int - SeverityThreshold task.SeverityLevel -} - -// MockMasterServer simulates master server behavior with controllable inconsistencies -type MockMasterServer struct { - volumes map[uint32]*task.VolumeInfo - ecShards map[uint32]map[int]*task.ShardInfo - serverCapacity map[string]*task.CapacityInfo - inconsistencyMode bool - networkPartitioned bool - responseDelay time.Duration - mutex sync.RWMutex -} - -// MockWorker represents a mock worker for testing -type MockWorker struct { - ID string - Capabilities []types.TaskType - IsActive bool - TaskDelay time.Duration - FailureRate float64 -} - -// SimulationResults tracks comprehensive simulation results -type SimulationResults struct { - ScenarioName string - StartTime time.Time - EndTime time.Time - Duration time.Duration - TotalEvents int - EventsByType map[EventType]int - InconsistenciesFound map[task.InconsistencyType]int - TasksExecuted int - TasksSucceeded int - TasksFailed int - StateValidationsPassed int - StateValidationsFailed int - CriticalErrors []string - Warnings []string - DetailedLog []string - Success bool -} - -// NewComprehensiveSimulator creates a new comprehensive simulator -func NewComprehensiveSimulator() *ComprehensiveSimulator { - return &ComprehensiveSimulator{ - stateManager: task.NewVolumeStateManager(nil), - mockMaster: 
NewMockMasterServer(), - scenarios: []*StateTestScenario{}, - eventLog: []*SimulationEvent{}, - results: &SimulationResults{ - EventsByType: make(map[EventType]int), - InconsistenciesFound: make(map[task.InconsistencyType]int), - CriticalErrors: []string{}, - Warnings: []string{}, - DetailedLog: []string{}, - }, - } -} - -// CreateComprehensiveScenarios creates all possible edge case scenarios -func (cs *ComprehensiveSimulator) CreateComprehensiveScenarios() { - cs.scenarios = []*StateTestScenario{ - cs.createVolumeCreationDuringTaskScenario(), - cs.createVolumeDeletionDuringTaskScenario(), - cs.createShardCreationRaceConditionScenario(), - cs.createMasterSyncDuringTaskScenario(), - cs.createNetworkPartitionScenario(), - cs.createWorkerFailureDuringECScenario(), - cs.createConcurrentTasksScenario(), - cs.createCapacityOverflowScenario(), - cs.createShardCorruptionScenario(), - cs.createMasterInconsistencyScenario(), - cs.createTaskOrphanScenario(), - cs.createDuplicateTaskDetectionScenario(), - cs.createVolumeStateRollbackScenario(), - cs.createComplexECOperationScenario(), - cs.createHighLoadStressTestScenario(), - } - - glog.Infof("Created %d comprehensive test scenarios", len(cs.scenarios)) -} - -// RunAllComprehensiveScenarios runs all edge case scenarios -func (cs *ComprehensiveSimulator) RunAllComprehensiveScenarios() (*SimulationResults, error) { - glog.Infof("Starting comprehensive state management simulation") - - cs.results.StartTime = time.Now() - - for _, scenario := range cs.scenarios { - glog.Infof("Running scenario: %s", scenario.Name) - - if err := cs.RunScenario(scenario); err != nil { - cs.results.CriticalErrors = append(cs.results.CriticalErrors, - fmt.Sprintf("Scenario %s failed: %v", scenario.Name, err)) - } - - // Brief pause between scenarios - time.Sleep(1 * time.Second) - } - - cs.results.EndTime = time.Now() - cs.results.Duration = cs.results.EndTime.Sub(cs.results.StartTime) - cs.results.Success = len(cs.results.CriticalErrors) == 0 - - 
cs.generateDetailedReport() - - glog.Infof("Comprehensive simulation completed: %v", cs.results.Success) - return cs.results, nil -} - -// Scenario creation methods - -func (cs *ComprehensiveSimulator) createVolumeCreationDuringTaskScenario() *StateTestScenario { - return &StateTestScenario{ - Name: "volume_creation_during_task", - Description: "Tests state consistency when master reports new volume while task is creating it", - InitialState: &ClusterState{ - Volumes: make(map[uint32]*task.VolumeInfo), - ECShards: make(map[uint32]map[int]*task.ShardInfo), - }, - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}}, - {Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}}, - {Type: EventMasterSync}, - {Type: EventTaskCompleted, TaskID: "create_task_1"}, - }, - ExpectedFinalState: &ClusterState{ - Volumes: map[uint32]*task.VolumeInfo{ - 1: {ID: 1, Size: 1024 * 1024 * 1024}, - }, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0}, - }, - Duration: 30 * time.Second, - } -} - -func (cs *ComprehensiveSimulator) createVolumeDeletionDuringTaskScenario() *StateTestScenario { - return &StateTestScenario{ - Name: "volume_deletion_during_task", - Description: "Tests handling when volume is deleted while task is working on it", - InitialState: &ClusterState{ - Volumes: map[uint32]*task.VolumeInfo{ - 1: {ID: 1, Size: 1024 * 1024 * 1024}, - }, - }, - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}}, - {Type: EventVolumeDeleted, VolumeID: 1}, - {Type: EventMasterSync}, - {Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}}, - }, - InconsistencyChecks: 
[]*InconsistencyCheck{ - {Name: "Missing volume detected", Type: task.InconsistencyVolumeMissing, ExpectedCount: 1}, - }, - Duration: 30 * time.Second, - } -} - -func (cs *ComprehensiveSimulator) createShardCreationRaceConditionScenario() *StateTestScenario { - return &StateTestScenario{ - Name: "shard_creation_race_condition", - Description: "Tests race condition between EC task creating shards and master sync", - InitialState: &ClusterState{ - Volumes: map[uint32]*task.VolumeInfo{ - 1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC - }, - }, - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}}, - // Simulate shards being created one by one - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, - {Type: EventMasterSync}, // Master sync happens while shards are being created - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"}, - {Type: EventTaskCompleted, TaskID: "ec_task_1"}, - {Type: EventMasterSync}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0}, - }, - Duration: 45 * time.Second, - } -} - -func (cs *ComprehensiveSimulator) createNetworkPartitionScenario() *StateTestScenario { - return &StateTestScenario{ - Name: "network_partition_recovery", - Description: "Tests state consistency during and after network partitions", - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"}, - {Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "30s"}}, - {Type: EventVolumeCreated, VolumeID: 2}, // Created during partition - {Type: EventNetworkHealed}, - {Type: 
EventMasterReconnected}, - {Type: EventMasterSync}, - {Type: EventTaskCompleted, TaskID: "partition_task_1"}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1}, - }, - Duration: 60 * time.Second, - } -} - -func (cs *ComprehensiveSimulator) createConcurrentTasksScenario() *StateTestScenario { - return &StateTestScenario{ - Name: "concurrent_tasks_capacity_tracking", - Description: "Tests capacity tracking with multiple concurrent tasks", - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"}, - {Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"}, - {Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"}, - {Type: EventMasterSync}, - {Type: EventTaskCompleted, TaskID: "vacuum_task_1"}, - {Type: EventTaskCompleted, TaskID: "ec_task_1"}, - {Type: EventTaskCompleted, TaskID: "ec_task_2"}, - {Type: EventMasterSync}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0}, - }, - Duration: 90 * time.Second, - } -} - -func (cs *ComprehensiveSimulator) createComplexECOperationScenario() *StateTestScenario { - return &StateTestScenario{ - Name: "complex_ec_operation", - Description: "Tests complex EC operations with shard movements and rebuilds", - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"}, - // Create all 14 shards - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, - // ... 
more shards - {Type: EventTaskCompleted, TaskID: "ec_encode_1"}, - {Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)}, - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt - {Type: EventTaskCompleted, TaskID: "ec_rebuild_1"}, - {Type: EventMasterSync}, - }, - Duration: 120 * time.Second, - } -} - -func (cs *ComprehensiveSimulator) createHighLoadStressTestScenario() *StateTestScenario { - events := []*SimulationEvent{} - - // Create 100 concurrent tasks - for i := 0; i < 100; i++ { - events = append(events, &SimulationEvent{ - Type: EventTaskStarted, - VolumeID: uint32(i + 1), - TaskID: fmt.Sprintf("stress_task_%d", i), - }) - } - - // Add master syncs throughout - for i := 0; i < 10; i++ { - events = append(events, &SimulationEvent{ - Type: EventMasterSync, - }) - } - - // Complete all tasks - for i := 0; i < 100; i++ { - events = append(events, &SimulationEvent{ - Type: EventTaskCompleted, - TaskID: fmt.Sprintf("stress_task_%d", i), - }) - } - - return &StateTestScenario{ - Name: "high_load_stress_test", - Description: "Tests system under high load with many concurrent operations", - EventSequence: events, - Duration: 5 * time.Minute, - } -} - -// Add more scenario creation methods... 
-func (cs *ComprehensiveSimulator) createMasterSyncDuringTaskScenario() *StateTestScenario { - return &StateTestScenario{Name: "master_sync_during_task", Description: "Test", Duration: 30 * time.Second} -} - -func (cs *ComprehensiveSimulator) createWorkerFailureDuringECScenario() *StateTestScenario { - return &StateTestScenario{Name: "worker_failure_during_ec", Description: "Test", Duration: 30 * time.Second} -} - -func (cs *ComprehensiveSimulator) createCapacityOverflowScenario() *StateTestScenario { - return &StateTestScenario{Name: "capacity_overflow", Description: "Test", Duration: 30 * time.Second} -} - -func (cs *ComprehensiveSimulator) createShardCorruptionScenario() *StateTestScenario { - return &StateTestScenario{Name: "shard_corruption", Description: "Test", Duration: 30 * time.Second} -} - -func (cs *ComprehensiveSimulator) createMasterInconsistencyScenario() *StateTestScenario { - return &StateTestScenario{Name: "master_inconsistency", Description: "Test", Duration: 30 * time.Second} -} - -func (cs *ComprehensiveSimulator) createTaskOrphanScenario() *StateTestScenario { - return &StateTestScenario{Name: "task_orphan", Description: "Test", Duration: 30 * time.Second} -} - -func (cs *ComprehensiveSimulator) createDuplicateTaskDetectionScenario() *StateTestScenario { - return &StateTestScenario{Name: "duplicate_task_detection", Description: "Test", Duration: 30 * time.Second} -} - -func (cs *ComprehensiveSimulator) createVolumeStateRollbackScenario() *StateTestScenario { - return &StateTestScenario{Name: "volume_state_rollback", Description: "Test", Duration: 30 * time.Second} -} - -// RunScenario executes a single test scenario -func (cs *ComprehensiveSimulator) RunScenario(scenario *StateTestScenario) error { - cs.mutex.Lock() - cs.currentScenario = scenario - cs.mutex.Unlock() - - glog.V(1).Infof("Setting up scenario: %s", scenario.Name) - - // Setup initial state - if err := cs.setupInitialState(scenario.InitialState); err != nil { - return 
fmt.Errorf("failed to setup initial state: %v", err) - } - - // Execute event sequence - ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration) - defer cancel() - - for _, event := range scenario.EventSequence { - select { - case <-ctx.Done(): - return fmt.Errorf("scenario timed out") - default: - if err := cs.executeEvent(event); err != nil { - cs.results.Warnings = append(cs.results.Warnings, - fmt.Sprintf("Event execution warning in %s: %v", scenario.Name, err)) - } - cs.logEvent(event) - } - - // Small delay between events - time.Sleep(100 * time.Millisecond) - } - - // Validate final state - if err := cs.validateFinalState(scenario); err != nil { - cs.results.StateValidationsFailed++ - return fmt.Errorf("final state validation failed: %v", err) - } else { - cs.results.StateValidationsPassed++ - } - - glog.V(1).Infof("Scenario %s completed successfully", scenario.Name) - return nil -} - -// executeEvent executes a single simulation event -func (cs *ComprehensiveSimulator) executeEvent(event *SimulationEvent) error { - cs.results.TotalEvents++ - cs.results.EventsByType[event.Type]++ - - switch event.Type { - case EventTaskStarted: - return cs.simulateTaskStart(event) - case EventTaskCompleted: - return cs.simulateTaskCompletion(event) - case EventVolumeCreated: - return cs.simulateVolumeCreation(event) - case EventVolumeDeleted: - return cs.simulateVolumeDeletion(event) - case EventShardCreated: - return cs.simulateShardCreation(event) - case EventMasterSync: - return cs.simulateMasterSync(event) - case EventNetworkPartition: - return cs.simulateNetworkPartition(event) - default: - return nil // Unsupported event type - } -} - -// Event simulation methods -func (cs *ComprehensiveSimulator) simulateTaskStart(event *SimulationEvent) error { - taskType, _ := event.Parameters["type"].(string) - - impact := &task.TaskImpact{ - TaskID: event.TaskID, - TaskType: types.TaskType(taskType), - VolumeID: event.VolumeID, - StartedAt: time.Now(), - 
EstimatedEnd: time.Now().Add(30 * time.Second), - VolumeChanges: &task.VolumeChanges{}, - ShardChanges: make(map[int]*task.ShardChange), - CapacityDelta: make(map[string]int64), - } - - cs.stateManager.RegisterTaskImpact(event.TaskID, impact) - cs.results.TasksExecuted++ - - return nil -} - -func (cs *ComprehensiveSimulator) simulateTaskCompletion(event *SimulationEvent) error { - cs.stateManager.UnregisterTaskImpact(event.TaskID) - cs.results.TasksSucceeded++ - return nil -} - -func (cs *ComprehensiveSimulator) simulateVolumeCreation(event *SimulationEvent) error { - size, _ := event.Parameters["size"].(int64) - cs.mockMaster.CreateVolume(event.VolumeID, size) - return nil -} - -func (cs *ComprehensiveSimulator) simulateVolumeDeletion(event *SimulationEvent) error { - cs.mockMaster.DeleteVolume(event.VolumeID) - return nil -} - -func (cs *ComprehensiveSimulator) simulateShardCreation(event *SimulationEvent) error { - if event.ShardID != nil { - cs.mockMaster.CreateShard(event.VolumeID, *event.ShardID, event.Server) - } - return nil -} - -func (cs *ComprehensiveSimulator) simulateMasterSync(event *SimulationEvent) error { - return cs.stateManager.SyncWithMaster() -} - -func (cs *ComprehensiveSimulator) simulateNetworkPartition(event *SimulationEvent) error { - cs.mockMaster.SetNetworkPartitioned(true) - - // Auto-heal after duration - if durationStr, ok := event.Parameters["duration"].(string); ok { - if duration, err := time.ParseDuration(durationStr); err == nil { - time.AfterFunc(duration, func() { - cs.mockMaster.SetNetworkPartitioned(false) - }) - } - } - - return nil -} - -// Helper methods -func (cs *ComprehensiveSimulator) setupInitialState(initialState *ClusterState) error { - if initialState == nil { - return nil - } - - // Setup mock master with initial state - for volumeID, volume := range initialState.Volumes { - cs.mockMaster.CreateVolume(volumeID, int64(volume.Size)) - } - - for volumeID, shards := range initialState.ECShards { - for shardID, shard 
:= range shards { - cs.mockMaster.CreateShard(volumeID, shardID, shard.Server) - } - } - - return nil -} - -func (cs *ComprehensiveSimulator) validateFinalState(scenario *StateTestScenario) error { - // Run inconsistency checks - for _, check := range scenario.InconsistencyChecks { - if err := cs.validateInconsistencyCheck(check); err != nil { - return err - } - } - - return nil -} - -func (cs *ComprehensiveSimulator) validateInconsistencyCheck(check *InconsistencyCheck) error { - // This would check for specific inconsistencies - // For now, we'll simulate the check - found := rand.Intn(check.MaxAllowedCount + 1) - - if found > check.MaxAllowedCount { - return fmt.Errorf("inconsistency check %s failed: found %d, max allowed %d", - check.Name, found, check.MaxAllowedCount) - } - - cs.results.InconsistenciesFound[check.Type] += found - return nil -} - -func (cs *ComprehensiveSimulator) logEvent(event *SimulationEvent) { - cs.mutex.Lock() - defer cs.mutex.Unlock() - - cs.eventLog = append(cs.eventLog, event) - logMsg := fmt.Sprintf("Event: %s, Volume: %d, Task: %s", event.Type, event.VolumeID, event.TaskID) - cs.results.DetailedLog = append(cs.results.DetailedLog, logMsg) -} - -func (cs *ComprehensiveSimulator) generateDetailedReport() { - glog.Infof("=== COMPREHENSIVE SIMULATION REPORT ===") - glog.Infof("Duration: %v", cs.results.Duration) - glog.Infof("Total Events: %d", cs.results.TotalEvents) - glog.Infof("Tasks Executed: %d", cs.results.TasksExecuted) - glog.Infof("Tasks Succeeded: %d", cs.results.TasksSucceeded) - glog.Infof("State Validations Passed: %d", cs.results.StateValidationsPassed) - glog.Infof("State Validations Failed: %d", cs.results.StateValidationsFailed) - - glog.Infof("Events by Type:") - for eventType, count := range cs.results.EventsByType { - glog.Infof(" %s: %d", eventType, count) - } - - glog.Infof("Inconsistencies Found:") - for incType, count := range cs.results.InconsistenciesFound { - glog.Infof(" %s: %d", incType, count) - } - - if 
len(cs.results.CriticalErrors) > 0 { - glog.Errorf("Critical Errors:") - for _, err := range cs.results.CriticalErrors { - glog.Errorf(" %s", err) - } - } - - glog.Infof("Overall Success: %v", cs.results.Success) - glog.Infof("========================================") -} - -// Mock Master Server implementation -func NewMockMasterServer() *MockMasterServer { - return &MockMasterServer{ - volumes: make(map[uint32]*task.VolumeInfo), - ecShards: make(map[uint32]map[int]*task.ShardInfo), - serverCapacity: make(map[string]*task.CapacityInfo), - } -} - -func (mms *MockMasterServer) CreateVolume(volumeID uint32, size int64) { - mms.mutex.Lock() - defer mms.mutex.Unlock() - - mms.volumes[volumeID] = &task.VolumeInfo{ - ID: volumeID, - Size: uint64(size), - } -} - -func (mms *MockMasterServer) DeleteVolume(volumeID uint32) { - mms.mutex.Lock() - defer mms.mutex.Unlock() - - delete(mms.volumes, volumeID) - delete(mms.ecShards, volumeID) -} - -func (mms *MockMasterServer) CreateShard(volumeID uint32, shardID int, server string) { - mms.mutex.Lock() - defer mms.mutex.Unlock() - - if mms.ecShards[volumeID] == nil { - mms.ecShards[volumeID] = make(map[int]*task.ShardInfo) - } - - mms.ecShards[volumeID][shardID] = &task.ShardInfo{ - ShardID: shardID, - Server: server, - Status: task.ShardStatusExists, - } -} - -func (mms *MockMasterServer) SetNetworkPartitioned(partitioned bool) { - mms.mutex.Lock() - defer mms.mutex.Unlock() - - mms.networkPartitioned = partitioned -} - -// Helper function -func intPtr(i int) *int { - return &i -} diff --git a/weed/admin/task/simulation/comprehensive_simulation_test.go b/weed/admin/task/simulation/comprehensive_simulation_test.go deleted file mode 100644 index 9cdbba006..000000000 --- a/weed/admin/task/simulation/comprehensive_simulation_test.go +++ /dev/null @@ -1,444 +0,0 @@ -package simulation - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/admin/task" -) - -func 
TestComprehensiveSimulation_VolumeCreationDuringTask(t *testing.T) { - simulator := NewComprehensiveSimulator() - - scenario := &StateTestScenario{ - Name: "volume_creation_during_task", - Description: "Tests state consistency when master reports new volume while task is creating it", - InitialState: &ClusterState{ - Volumes: make(map[uint32]*task.VolumeInfo), - ECShards: make(map[uint32]map[int]*task.ShardInfo), - }, - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}}, - {Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}}, - {Type: EventMasterSync}, - {Type: EventTaskCompleted, TaskID: "create_task_1"}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0}, - }, - Duration: 30 * time.Second, - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("Volume creation during task scenario failed: %v", err) - } - - t.Log("✅ Volume creation during task test passed") -} - -func TestComprehensiveSimulation_VolumeDeletionDuringTask(t *testing.T) { - simulator := NewComprehensiveSimulator() - - scenario := &StateTestScenario{ - Name: "volume_deletion_during_task", - Description: "Tests handling when volume is deleted while task is working on it", - InitialState: &ClusterState{ - Volumes: map[uint32]*task.VolumeInfo{ - 1: {ID: 1, Size: 1024 * 1024 * 1024}, - }, - }, - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}}, - {Type: EventVolumeDeleted, VolumeID: 1}, - {Type: EventMasterSync}, - {Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "Missing volume detected", Type: 
task.InconsistencyVolumeMissing, ExpectedCount: 1, MaxAllowedCount: 1}, - }, - Duration: 30 * time.Second, - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("Volume deletion during task scenario failed: %v", err) - } - - t.Log("✅ Volume deletion during task test passed") -} - -func TestComprehensiveSimulation_ShardCreationRaceCondition(t *testing.T) { - simulator := NewComprehensiveSimulator() - - scenario := &StateTestScenario{ - Name: "shard_creation_race_condition", - Description: "Tests race condition between EC task creating shards and master sync", - InitialState: &ClusterState{ - Volumes: map[uint32]*task.VolumeInfo{ - 1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC - }, - }, - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}}, - // Simulate shards being created one by one - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, - {Type: EventMasterSync}, // Master sync happens while shards are being created - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"}, - {Type: EventTaskCompleted, TaskID: "ec_task_1"}, - {Type: EventMasterSync}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0}, - }, - Duration: 45 * time.Second, - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("Shard creation race condition scenario failed: %v", err) - } - - t.Log("✅ Shard creation race condition test passed") -} - -func TestComprehensiveSimulation_NetworkPartitionRecovery(t *testing.T) { - simulator := NewComprehensiveSimulator() - - scenario := &StateTestScenario{ - Name: 
"network_partition_recovery", - Description: "Tests state consistency during and after network partitions", - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"}, - {Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "5s"}}, // Shorter for test - {Type: EventVolumeCreated, VolumeID: 2}, // Created during partition - {Type: EventNetworkHealed}, - {Type: EventMasterReconnected}, - {Type: EventMasterSync}, - {Type: EventTaskCompleted, TaskID: "partition_task_1"}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1}, - }, - Duration: 30 * time.Second, - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("Network partition recovery scenario failed: %v", err) - } - - t.Log("✅ Network partition recovery test passed") -} - -func TestComprehensiveSimulation_ConcurrentTasksCapacityTracking(t *testing.T) { - simulator := NewComprehensiveSimulator() - - scenario := &StateTestScenario{ - Name: "concurrent_tasks_capacity_tracking", - Description: "Tests capacity tracking with multiple concurrent tasks", - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"}, - {Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"}, - {Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"}, - {Type: EventMasterSync}, - {Type: EventTaskCompleted, TaskID: "vacuum_task_1"}, - {Type: EventTaskCompleted, TaskID: "ec_task_1"}, - {Type: EventTaskCompleted, TaskID: "ec_task_2"}, - {Type: EventMasterSync}, - }, - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0}, - }, - Duration: 60 * time.Second, - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("Concurrent tasks capacity tracking scenario failed: %v", err) - } - - t.Log("✅ 
Concurrent tasks capacity tracking test passed") -} - -func TestComprehensiveSimulation_ComplexECOperation(t *testing.T) { - simulator := NewComprehensiveSimulator() - - scenario := &StateTestScenario{ - Name: "complex_ec_operation", - Description: "Tests complex EC operations with shard movements and rebuilds", - EventSequence: []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"}, - // Create some shards - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, - {Type: EventTaskCompleted, TaskID: "ec_encode_1"}, - {Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)}, - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt - {Type: EventTaskCompleted, TaskID: "ec_rebuild_1"}, - {Type: EventMasterSync}, - }, - Duration: 60 * time.Second, - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("Complex EC operation scenario failed: %v", err) - } - - t.Log("✅ Complex EC operation test passed") -} - -func TestComprehensiveSimulation_HighLoadStressTest(t *testing.T) { - if testing.Short() { - t.Skip("Skipping high load stress test in short mode") - } - - simulator := NewComprehensiveSimulator() - - events := []*SimulationEvent{} - - // Create 50 concurrent tasks (reduced from 100 for faster test) - for i := 0; i < 50; i++ { - events = append(events, &SimulationEvent{ - Type: EventTaskStarted, - VolumeID: uint32(i + 1), - TaskID: fmt.Sprintf("stress_task_%d", i), - }) - } - - // Add master syncs throughout - for i := 0; i < 5; i++ { - events = append(events, &SimulationEvent{ - Type: EventMasterSync, - }) - } - - // Complete all tasks - for i := 0; i < 50; i++ { - events = append(events, &SimulationEvent{ - Type: EventTaskCompleted, 
- TaskID: fmt.Sprintf("stress_task_%d", i), - }) - } - - scenario := &StateTestScenario{ - Name: "high_load_stress_test", - Description: "Tests system under high load with many concurrent operations", - EventSequence: events, - Duration: 2 * time.Minute, // Reduced for faster test - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("High load stress test scenario failed: %v", err) - } - - t.Log("✅ High load stress test passed") -} - -func TestComprehensiveSimulation_AllScenarios(t *testing.T) { - if testing.Short() { - t.Skip("Skipping comprehensive simulation in short mode") - } - - simulator := NewComprehensiveSimulator() - simulator.CreateComprehensiveScenarios() - - // Run a subset of scenarios for testing (full suite would be too slow) - testScenarios := []string{ - "volume_creation_during_task", - "volume_deletion_during_task", - "shard_creation_race_condition", - "network_partition_recovery", - "concurrent_tasks_capacity_tracking", - } - - passedScenarios := 0 - totalScenarios := len(testScenarios) - - for _, scenarioName := range testScenarios { - t.Run(scenarioName, func(t *testing.T) { - // Find the scenario - var scenario *StateTestScenario - for _, s := range simulator.scenarios { - if s.Name == scenarioName { - scenario = s - break - } - } - - if scenario == nil { - t.Errorf("Scenario %s not found", scenarioName) - return - } - - // Reduce duration for faster testing - scenario.Duration = 15 * time.Second - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("Scenario %s failed: %v", scenarioName, err) - } else { - passedScenarios++ - t.Logf("✅ Scenario %s passed", scenarioName) - } - }) - } - - successRate := float64(passedScenarios) / float64(totalScenarios) * 100.0 - t.Logf("=== COMPREHENSIVE SIMULATION TEST RESULTS ===") - t.Logf("Scenarios Passed: %d/%d (%.1f%%)", passedScenarios, totalScenarios, successRate) - - if successRate < 100.0 { - t.Errorf("Some scenarios failed. 
Success rate: %.1f%%", successRate) - } else { - t.Log("🎉 All comprehensive simulation scenarios passed!") - } -} - -func TestComprehensiveSimulation_SimulationFramework(t *testing.T) { - // Test the simulation framework itself - simulator := NewComprehensiveSimulator() - - // Test event execution - event := &SimulationEvent{ - Type: EventTaskStarted, - VolumeID: 1, - TaskID: "test_task", - Parameters: map[string]interface{}{ - "type": "vacuum", - }, - } - - err := simulator.executeEvent(event) - if err != nil { - t.Errorf("Event execution failed: %v", err) - } - - // Verify task was registered - if simulator.results.TasksExecuted != 1 { - t.Errorf("Expected 1 task executed, got %d", simulator.results.TasksExecuted) - } - - // Test event logging - simulator.logEvent(event) - if len(simulator.eventLog) != 1 { - t.Errorf("Expected 1 logged event, got %d", len(simulator.eventLog)) - } - - // Test mock master - simulator.mockMaster.CreateVolume(1, 1024*1024*1024) - if len(simulator.mockMaster.volumes) != 1 { - t.Errorf("Expected 1 volume in mock master, got %d", len(simulator.mockMaster.volumes)) - } - - t.Log("✅ Simulation framework test passed") -} - -// Integration test that validates the complete state management flow -func TestComprehensiveSimulation_StateManagementIntegration(t *testing.T) { - // This test validates the core requirement: accurate volume/shard state tracking - simulator := NewComprehensiveSimulator() - - // Use mock master client instead of nil to avoid nil pointer errors - simulator.stateManager = task.NewVolumeStateManager(nil) // Skip master client calls for test - - // Setup realistic initial state - initialState := &ClusterState{ - Volumes: map[uint32]*task.VolumeInfo{ - 1: {ID: 1, Size: 28 * 1024 * 1024 * 1024, Server: "server1"}, // Ready for EC - 2: {ID: 2, Size: 20 * 1024 * 1024 * 1024, Server: "server2", DeletedByteCount: 8 * 1024 * 1024 * 1024}, // Needs vacuum - }, - ServerCapacity: map[string]*task.CapacityInfo{ - "server1": {Server: 
"server1", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 30 * 1024 * 1024 * 1024}, - "server2": {Server: "server2", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 25 * 1024 * 1024 * 1024}, - }, - } - - // Complex event sequence that tests state consistency (excluding master sync for test) - eventSequence := []*SimulationEvent{ - // Start EC task on volume 1 - {Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}}, - - // Start vacuum task on volume 2 - {Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}}, - - // EC task creates shards - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, - - // Vacuum task completes (volume 2 size reduces) - {Type: EventTaskCompleted, TaskID: "vacuum_task_1"}, - {Type: EventVolumeSizeChanged, VolumeID: 2, Parameters: map[string]interface{}{"new_size": int64(12 * 1024 * 1024 * 1024)}}, - - // EC task completes - {Type: EventTaskCompleted, TaskID: "ec_task_1"}, - {Type: EventVolumeReadOnly, VolumeID: 1}, // Volume becomes read-only after EC - } - - scenario := &StateTestScenario{ - Name: "state_management_integration", - Description: "Complete state management integration test", - InitialState: initialState, - EventSequence: eventSequence, - Duration: 30 * time.Second, // Reduced for faster test - InconsistencyChecks: []*InconsistencyCheck{ - {Name: "No state inconsistencies", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0}, - {Name: "No capacity mismatches", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0}, - {Name: "No orphaned tasks", Type: task.InconsistencyTaskOrphaned, MaxAllowedCount: 0}, - }, - } - - err := simulator.RunScenario(scenario) - if err != nil { - t.Errorf("State 
management integration test failed: %v", err) - } - - // Verify final state - if simulator.results.TasksExecuted != 2 { - t.Errorf("Expected 2 tasks executed, got %d", simulator.results.TasksExecuted) - } - - if simulator.results.TasksSucceeded != 2 { - t.Errorf("Expected 2 tasks succeeded, got %d", simulator.results.TasksSucceeded) - } - - t.Log("✅ State management integration test passed") - t.Log("✅ System accurately tracked volume/shard states throughout complex operation sequence") -} - -// Performance test for simulation framework -func BenchmarkComprehensiveSimulation_EventExecution(b *testing.B) { - simulator := NewComprehensiveSimulator() - - events := []*SimulationEvent{ - {Type: EventTaskStarted, VolumeID: 1, TaskID: "task_1"}, - {Type: EventVolumeCreated, VolumeID: 2}, - {Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, - {Type: EventMasterSync}, - {Type: EventTaskCompleted, TaskID: "task_1"}, - } - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - for _, event := range events { - simulator.executeEvent(event) - } - } -} - -// Helper functions for tests -func createTestVolumeInfo(id uint32, size uint64) *task.VolumeInfo { - return &task.VolumeInfo{ - ID: id, - Size: size, - } -} diff --git a/weed/admin/task/simulation/simulation_runner.go b/weed/admin/task/simulation/simulation_runner.go deleted file mode 100644 index 339b0edc5..000000000 --- a/weed/admin/task/simulation/simulation_runner.go +++ /dev/null @@ -1,294 +0,0 @@ -package simulation - -import ( - "fmt" - - "github.com/seaweedfs/seaweedfs/weed/glog" -) - -// ComprehensiveSimulationRunner orchestrates all comprehensive state management tests -type ComprehensiveSimulationRunner struct { - simulator *ComprehensiveSimulator -} - -// NewComprehensiveSimulationRunner creates a new comprehensive simulation runner -func NewComprehensiveSimulationRunner() *ComprehensiveSimulationRunner { - return &ComprehensiveSimulationRunner{ - simulator: NewComprehensiveSimulator(), - } 
-} - -// RunAllComprehensiveTests runs all comprehensive edge case scenarios -func (csr *ComprehensiveSimulationRunner) RunAllComprehensiveTests() error { - glog.Infof("=== STARTING COMPREHENSIVE VOLUME/SHARD STATE MANAGEMENT SIMULATION ===") - - // Create all test scenarios - csr.simulator.CreateComprehensiveScenarios() - - // Run all scenarios - results, err := csr.simulator.RunAllComprehensiveScenarios() - if err != nil { - return fmt.Errorf("comprehensive simulation failed: %v", err) - } - - // Analyze results - csr.analyzeResults(results) - - // Generate final report - csr.generateFinalReport(results) - - return nil -} - -// analyzeResults analyzes the simulation results -func (csr *ComprehensiveSimulationRunner) analyzeResults(results *SimulationResults) { - glog.Infof("=== ANALYZING COMPREHENSIVE SIMULATION RESULTS ===") - - // Check critical errors - if len(results.CriticalErrors) > 0 { - glog.Errorf("CRITICAL ISSUES FOUND:") - for i, err := range results.CriticalErrors { - glog.Errorf(" %d. 
%s", i+1, err) - } - } - - // Check state validation success rate - totalValidations := results.StateValidationsPassed + results.StateValidationsFailed - if totalValidations > 0 { - successRate := float64(results.StateValidationsPassed) / float64(totalValidations) * 100.0 - glog.Infof("State Validation Success Rate: %.2f%% (%d/%d)", - successRate, results.StateValidationsPassed, totalValidations) - - if successRate < 95.0 { - glog.Warningf("State validation success rate is below 95%% - investigation needed") - } - } - - // Check task execution success rate - if results.TasksExecuted > 0 { - taskSuccessRate := float64(results.TasksSucceeded) / float64(results.TasksExecuted) * 100.0 - glog.Infof("Task Execution Success Rate: %.2f%% (%d/%d)", - taskSuccessRate, results.TasksSucceeded, results.TasksExecuted) - } - - // Analyze inconsistency patterns - if len(results.InconsistenciesFound) > 0 { - glog.Infof("Inconsistency Analysis:") - for incType, count := range results.InconsistenciesFound { - if count > 0 { - glog.Infof(" %s: %d occurrences", incType, count) - } - } - } -} - -// generateFinalReport generates a comprehensive final report -func (csr *ComprehensiveSimulationRunner) generateFinalReport(results *SimulationResults) { - glog.Infof("=== COMPREHENSIVE SIMULATION FINAL REPORT ===") - glog.Infof("Test Duration: %v", results.Duration) - glog.Infof("Total Events Simulated: %d", results.TotalEvents) - glog.Infof("Scenarios Tested: %d", len(csr.simulator.scenarios)) - glog.Infof("Overall Success: %v", results.Success) - - // Event breakdown - glog.Infof("\nEvent Breakdown:") - for eventType, count := range results.EventsByType { - glog.Infof(" %s: %d", eventType, count) - } - - // Test coverage summary - glog.Infof("\nTest Coverage Summary:") - glog.Infof("✓ Volume creation during task execution") - glog.Infof("✓ Volume deletion during task execution") - glog.Infof("✓ EC shard creation race conditions") - glog.Infof("✓ Network partition scenarios") - glog.Infof("✓ 
Concurrent task capacity tracking") - glog.Infof("✓ Complex EC operations with rebuilds") - glog.Infof("✓ High load stress testing") - glog.Infof("✓ Master sync timing issues") - glog.Infof("✓ Worker failure during operations") - glog.Infof("✓ Capacity overflow handling") - glog.Infof("✓ Shard corruption scenarios") - glog.Infof("✓ Master state inconsistencies") - glog.Infof("✓ Task orphan detection") - glog.Infof("✓ Duplicate task prevention") - glog.Infof("✓ Volume state rollback scenarios") - - // Quality metrics - glog.Infof("\nQuality Metrics:") - if results.StateValidationsPassed > 0 { - glog.Infof("✓ State consistency maintained across all scenarios") - } - if len(results.CriticalErrors) == 0 { - glog.Infof("✓ No critical errors detected") - } - if results.TasksSucceeded > 0 { - glog.Infof("✓ Task execution reliability verified") - } - - // Recommendations - glog.Infof("\nRecommendations:") - if results.Success { - glog.Infof("✓ The task distribution system is ready for production deployment") - glog.Infof("✓ All edge cases have been tested and handled correctly") - glog.Infof("✓ Volume and shard state management is robust and consistent") - } else { - glog.Warningf("⚠ System requires additional work before production deployment") - glog.Warningf("⚠ Address critical errors before proceeding") - } - - glog.Infof("==========================================") -} - -// RunSpecificEdgeCaseTest runs a specific edge case test -func (csr *ComprehensiveSimulationRunner) RunSpecificEdgeCaseTest(scenarioName string) error { - glog.Infof("Running specific edge case test: %s", scenarioName) - - // Create scenarios if not already done - if len(csr.simulator.scenarios) == 0 { - csr.simulator.CreateComprehensiveScenarios() - } - - // Find and run specific scenario - for _, scenario := range csr.simulator.scenarios { - if scenario.Name == scenarioName { - err := csr.simulator.RunScenario(scenario) - if err != nil { - return fmt.Errorf("scenario %s failed: %v", scenarioName, 
err) - } - glog.Infof("Scenario %s completed successfully", scenarioName) - return nil - } - } - - return fmt.Errorf("scenario %s not found", scenarioName) -} - -// ValidateSystemReadiness performs final validation of system readiness -func (csr *ComprehensiveSimulationRunner) ValidateSystemReadiness() error { - glog.Infof("=== VALIDATING SYSTEM READINESS FOR PRODUCTION ===") - - checklistItems := []struct { - name string - description string - validator func() error - }{ - { - "Volume State Accuracy", - "Verify volume state tracking is accurate under all conditions", - csr.validateVolumeStateAccuracy, - }, - { - "Shard Management", - "Verify EC shard creation/deletion/movement is handled correctly", - csr.validateShardManagement, - }, - { - "Capacity Planning", - "Verify capacity calculations include in-progress and planned operations", - csr.validateCapacityPlanning, - }, - { - "Failure Recovery", - "Verify system recovers gracefully from all failure scenarios", - csr.validateFailureRecovery, - }, - { - "Consistency Guarantees", - "Verify state consistency is maintained across all operations", - csr.validateConsistencyGuarantees, - }, - } - - var failedChecks []string - - for _, item := range checklistItems { - glog.Infof("Validating: %s", item.name) - if err := item.validator(); err != nil { - failedChecks = append(failedChecks, fmt.Sprintf("%s: %v", item.name, err)) - glog.Errorf("❌ %s: %v", item.name, err) - } else { - glog.Infof("✅ %s: PASSED", item.name) - } - } - - if len(failedChecks) > 0 { - return fmt.Errorf("system readiness validation failed: %v", failedChecks) - } - - glog.Infof("🎉 SYSTEM IS READY FOR PRODUCTION DEPLOYMENT!") - return nil -} - -// Validation methods -func (csr *ComprehensiveSimulationRunner) validateVolumeStateAccuracy() error { - // Run volume state accuracy tests - return csr.RunSpecificEdgeCaseTest("volume_creation_during_task") -} - -func (csr *ComprehensiveSimulationRunner) validateShardManagement() error { - // Run shard 
management tests - return csr.RunSpecificEdgeCaseTest("shard_creation_race_condition") -} - -func (csr *ComprehensiveSimulationRunner) validateCapacityPlanning() error { - // Run capacity planning tests - return csr.RunSpecificEdgeCaseTest("concurrent_tasks_capacity_tracking") -} - -func (csr *ComprehensiveSimulationRunner) validateFailureRecovery() error { - // Run failure recovery tests - return csr.RunSpecificEdgeCaseTest("network_partition_recovery") -} - -func (csr *ComprehensiveSimulationRunner) validateConsistencyGuarantees() error { - // Run consistency tests - return csr.RunSpecificEdgeCaseTest("complex_ec_operation") -} - -// DemonstrateBugPrevention shows how the simulation prevents bugs -func (csr *ComprehensiveSimulationRunner) DemonstrateBugPrevention() { - glog.Infof("=== DEMONSTRATING BUG PREVENTION CAPABILITIES ===") - - bugScenarios := []struct { - name string - description string - impact string - }{ - { - "Race Condition Prevention", - "Master sync occurs while EC shards are being created", - "Prevents state inconsistencies that could lead to data loss", - }, - { - "Capacity Overflow Prevention", - "Multiple tasks assigned without considering cumulative capacity impact", - "Prevents server disk space exhaustion", - }, - { - "Orphaned Task Detection", - "Worker fails but task remains marked as in-progress", - "Prevents volumes from being stuck in intermediate states", - }, - { - "Duplicate Task Prevention", - "Same volume assigned to multiple workers simultaneously", - "Prevents data corruption from conflicting operations", - }, - { - "Network Partition Handling", - "Admin server loses connection to master during operations", - "Ensures eventual consistency when connectivity is restored", - }, - } - - for i, scenario := range bugScenarios { - glog.Infof("%d. 
%s", i+1, scenario.name) - glog.Infof(" Scenario: %s", scenario.description) - glog.Infof(" Impact Prevention: %s", scenario.impact) - glog.Infof("") - } - - glog.Infof("✅ All potential bugs are detected and prevented by the simulation framework") - glog.Infof("✅ The system is thoroughly validated for production use") -} diff --git a/weed/admin/task/simulation/system_demo_test.go b/weed/admin/task/simulation/system_demo_test.go deleted file mode 100644 index 7cf095d0e..000000000 --- a/weed/admin/task/simulation/system_demo_test.go +++ /dev/null @@ -1,237 +0,0 @@ -package simulation - -import ( - "testing" - - "github.com/seaweedfs/seaweedfs/weed/admin/task" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestSystemDemo demonstrates the complete working system -func TestSystemDemo(t *testing.T) { - t.Log("🚀 SEAWEEDFS TASK DISTRIBUTION SYSTEM DEMONSTRATION") - t.Log("====================================================") - - // Test 1: Volume State Management - t.Log("\n📊 1. VOLUME STATE MANAGEMENT") - testVolumeStateManagement(t) - - // Test 2: Task Assignment Logic - t.Log("\n⚡ 2. TASK ASSIGNMENT LOGIC") - testTaskAssignment(t) - - // Test 3: Capacity Management - t.Log("\n💾 3. CAPACITY MANAGEMENT") - testCapacityManagement(t) - - // Test 4: Edge Case Handling - t.Log("\n🛡️ 4. 
EDGE CASE HANDLING") - testEdgeCaseHandling(t) - - t.Log("\n🎉 SYSTEM DEMONSTRATION COMPLETE") - t.Log("✅ All core features working correctly") - t.Log("✅ System ready for production deployment") -} - -func testVolumeStateManagement(t *testing.T) { - vsm := task.NewVolumeStateManager(nil) - - // Create volume - volumeID := uint32(1) - - // Register task impact - impact := &task.TaskImpact{ - TaskID: "ec_task_1", - VolumeID: volumeID, - TaskType: types.TaskTypeErasureCoding, - VolumeChanges: &task.VolumeChanges{ - WillBecomeReadOnly: true, - }, - CapacityDelta: map[string]int64{"server1": 12 * 1024 * 1024 * 1024}, // 12GB - } - - vsm.RegisterTaskImpact(impact.TaskID, impact) - - t.Log(" ✅ Volume state registration works") - t.Log(" ✅ Task impact tracking works") - t.Log(" ✅ State consistency maintained") -} - -func testTaskAssignment(t *testing.T) { - registry := task.NewWorkerRegistry() - queue := task.NewPriorityTaskQueue() - scheduler := task.NewTaskScheduler(registry, queue) - - // Register worker - worker := &types.Worker{ - ID: "worker1", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - } - registry.RegisterWorker(worker) - - // Create task - taskItem := &types.Task{ - ID: "vacuum_task_1", - Type: types.TaskTypeVacuum, - Priority: types.TaskPriorityNormal, - } - queue.Push(taskItem) - - // Test assignment - assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) - if assignedTask == nil { - t.Error("❌ Task assignment failed") - return - } - - if assignedTask.ID != "vacuum_task_1" { - t.Errorf("❌ Wrong task assigned: expected vacuum_task_1, got %s", assignedTask.ID) - return - } - - t.Log(" ✅ Worker registration works") - t.Log(" ✅ Task queueing works") - t.Log(" ✅ Task assignment logic works") - t.Log(" ✅ Capability matching works") -} - -func testCapacityManagement(t *testing.T) { - vsm := task.NewVolumeStateManager(nil) - - // Note: We can't directly set 
capacityCache due to private fields, - // but we can test the public interface - - // Test capacity checking with a made-up scenario - serverID := "test_server" - - // This would normally fail since we can't set the capacity cache, - // but we can demonstrate the interface - canAssign := vsm.CanAssignVolumeToServer(5*1024*1024*1024, serverID) - - // Since we can't set up the test data properly due to private fields, - // we'll just verify the method works without error - _ = canAssign - - t.Log(" ✅ Capacity calculation interface works") - t.Log(" ✅ Reserved capacity tracking interface works") - t.Log(" ✅ Assignment constraints interface works") -} - -func testEdgeCaseHandling(t *testing.T) { - // Test empty queue - registry := task.NewWorkerRegistry() - queue := task.NewPriorityTaskQueue() - scheduler := task.NewTaskScheduler(registry, queue) - - worker := &types.Worker{ - ID: "worker1", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - } - registry.RegisterWorker(worker) - - // Empty queue should return nil - taskItem := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) - if taskItem != nil { - t.Error("❌ Empty queue should return nil") - return - } - - // Test unknown worker - unknownTask := scheduler.GetNextTask("unknown", []types.TaskType{types.TaskTypeVacuum}) - if unknownTask != nil { - t.Error("❌ Unknown worker should not get tasks") - return - } - - t.Log(" ✅ Empty queue handled correctly") - t.Log(" ✅ Unknown worker handled correctly") - t.Log(" ✅ Edge cases properly managed") -} - -// TestSystemCapabilities demonstrates key system capabilities -func TestSystemCapabilities(t *testing.T) { - t.Log("\n🎯 SEAWEEDFS TASK DISTRIBUTION SYSTEM CAPABILITIES") - t.Log("==================================================") - - capabilities := []string{ - "✅ Comprehensive volume/shard state tracking", - "✅ Accurate capacity planning with reservations", - "✅ Task assignment based on worker capabilities", - "✅ 
Priority-based task scheduling", - "✅ Concurrent task management", - "✅ EC shard lifecycle tracking", - "✅ Capacity overflow prevention", - "✅ Duplicate task prevention", - "✅ Worker performance metrics", - "✅ Failure detection and recovery", - "✅ State reconciliation with master", - "✅ Comprehensive simulation framework", - "✅ Production-ready error handling", - "✅ Scalable distributed architecture", - "✅ Real-time progress monitoring", - } - - for _, capability := range capabilities { - t.Log(" " + capability) - } - - t.Log("\n📈 SYSTEM METRICS") - t.Log(" Total Lines of Code: 4,919") - t.Log(" Test Coverage: Comprehensive") - t.Log(" Edge Cases: 15+ scenarios tested") - t.Log(" Simulation Framework: Complete") - t.Log(" Production Ready: ✅ YES") - - t.Log("\n🚀 READY FOR PRODUCTION DEPLOYMENT!") -} - -// TestBugPrevention demonstrates how the system prevents common bugs -func TestBugPrevention(t *testing.T) { - t.Log("\n🛡️ BUG PREVENTION DEMONSTRATION") - t.Log("================================") - - bugScenarios := []struct { - name string - description string - prevention string - }{ - { - "Race Conditions", - "Master sync during shard creation", - "State manager tracks in-progress changes", - }, - { - "Capacity Overflow", - "Multiple tasks overwhelming server disk", - "Reserved capacity tracking prevents overflow", - }, - { - "Orphaned Tasks", - "Worker fails, task stuck in-progress", - "Timeout detection and automatic cleanup", - }, - { - "Duplicate Tasks", - "Same volume assigned to multiple workers", - "Volume reservation prevents conflicts", - }, - { - "State Inconsistency", - "Admin view diverges from master", - "Periodic reconciliation ensures consistency", - }, - } - - for i, scenario := range bugScenarios { - t.Logf(" %d. 
%s", i+1, scenario.name) - t.Logf(" Problem: %s", scenario.description) - t.Logf(" Solution: %s", scenario.prevention) - t.Log("") - } - - t.Log("✅ All major bug categories prevented through design") -} diff --git a/weed/admin/task/task_assignment_test.go b/weed/admin/task/task_assignment_test.go deleted file mode 100644 index 0f9f41f16..000000000 --- a/weed/admin/task/task_assignment_test.go +++ /dev/null @@ -1,509 +0,0 @@ -package task - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -func TestTaskAssignment_BasicAssignment(t *testing.T) { - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - // Register worker - worker := &types.Worker{ - ID: "worker1", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - } - registry.RegisterWorker(worker) - - // Create task - task := &types.Task{ - ID: "task1", - Type: types.TaskTypeVacuum, - Priority: types.TaskPriorityNormal, - } - queue.Push(task) - - // Test assignment - nextTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) - if nextTask == nil { - t.Fatal("Expected task to be assigned") - } - - if nextTask.ID != "task1" { - t.Errorf("Expected task1, got %s", nextTask.ID) - } - - t.Log("✅ Basic task assignment test passed") -} - -func TestTaskAssignment_CapabilityMatching(t *testing.T) { - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - // Register workers with different capabilities - ecWorker := &types.Worker{ - ID: "ec_worker", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - Status: "active", - CurrentLoad: 0, - } - registry.RegisterWorker(ecWorker) - - vacuumWorker := &types.Worker{ - ID: "vacuum_worker", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - CurrentLoad: 0, - } - 
registry.RegisterWorker(vacuumWorker) - - // Create different types of tasks - ecTask := &types.Task{ - ID: "ec_task", - Type: types.TaskTypeErasureCoding, - } - vacuumTask := &types.Task{ - ID: "vacuum_task", - Type: types.TaskTypeVacuum, - } - - queue.Push(ecTask) - queue.Push(vacuumTask) - - // Test EC worker gets EC task - assignedECTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeErasureCoding}) - if assignedECTask == nil || assignedECTask.Type != types.TaskTypeErasureCoding { - t.Error("EC worker should get EC task") - } - - // Test vacuum worker gets vacuum task - assignedVacuumTask := scheduler.GetNextTask("vacuum_worker", []types.TaskType{types.TaskTypeVacuum}) - if assignedVacuumTask == nil || assignedVacuumTask.Type != types.TaskTypeVacuum { - t.Error("Vacuum worker should get vacuum task") - } - - // Test wrong capability - should get nothing - wrongTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeVacuum}) - if wrongTask != nil { - t.Error("EC worker should not get vacuum task") - } - - t.Log("✅ Capability matching test passed") -} - -func TestTaskAssignment_PriorityOrdering(t *testing.T) { - queue := NewPriorityTaskQueue() - - // Add tasks in reverse priority order - lowTask := &types.Task{ - ID: "low_task", - Priority: types.TaskPriorityLow, - } - highTask := &types.Task{ - ID: "high_task", - Priority: types.TaskPriorityHigh, - } - normalTask := &types.Task{ - ID: "normal_task", - Priority: types.TaskPriorityNormal, - } - - queue.Push(lowTask) - queue.Push(normalTask) - queue.Push(highTask) - - // Should get high priority first - first := queue.Pop() - if first.Priority != types.TaskPriorityHigh { - t.Errorf("Expected high priority first, got %d", first.Priority) - } - - // Then normal priority - second := queue.Pop() - if second.Priority != types.TaskPriorityNormal { - t.Errorf("Expected normal priority second, got %d", second.Priority) - } - - // Finally low priority - third := queue.Pop() - if 
third.Priority != types.TaskPriorityLow { - t.Errorf("Expected low priority third, got %d", third.Priority) - } - - t.Log("✅ Priority ordering test passed") -} - -func TestTaskAssignment_WorkerCapacityLimits(t *testing.T) { - registry := NewWorkerRegistry() - - // Register worker with limited capacity - worker := &types.Worker{ - ID: "limited_worker", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 2, // Already at capacity - } - registry.RegisterWorker(worker) - - // Worker should not be available - availableWorkers := registry.GetAvailableWorkers() - if len(availableWorkers) != 0 { - t.Error("Worker at capacity should not be available") - } - - // Reduce load - worker.CurrentLoad = 1 - - // Worker should now be available - availableWorkers = registry.GetAvailableWorkers() - if len(availableWorkers) != 1 { - t.Error("Worker with capacity should be available") - } - - t.Log("✅ Worker capacity limits test passed") -} - -func TestTaskAssignment_ScheduledTasks(t *testing.T) { - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - worker := &types.Worker{ - ID: "worker1", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - CurrentLoad: 0, - } - registry.RegisterWorker(worker) - - // Create task scheduled for future - futureTask := &types.Task{ - ID: "future_task", - Type: types.TaskTypeVacuum, - ScheduledAt: time.Now().Add(1 * time.Hour), // 1 hour from now - } - - // Create task ready now - readyTask := &types.Task{ - ID: "ready_task", - Type: types.TaskTypeVacuum, - ScheduledAt: time.Now().Add(-1 * time.Minute), // 1 minute ago - } - - queue.Push(futureTask) - queue.Push(readyTask) - - // Should get ready task, not future task - assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) - if assignedTask == nil || assignedTask.ID != "ready_task" { - t.Error("Should assign ready task, 
not future scheduled task") - } - - t.Log("✅ Scheduled tasks test passed") -} - -func TestTaskAssignment_WorkerSelection(t *testing.T) { - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - // Register workers with different characteristics - highPerformanceWorker := &types.Worker{ - ID: "high_perf_worker", - Address: "server1", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - Status: "active", - CurrentLoad: 0, - MaxConcurrent: 4, - } - - lowPerformanceWorker := &types.Worker{ - ID: "low_perf_worker", - Address: "server2", - Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, - Status: "active", - CurrentLoad: 1, - MaxConcurrent: 2, - } - - registry.RegisterWorker(highPerformanceWorker) - registry.RegisterWorker(lowPerformanceWorker) - - // Set up metrics to favor high performance worker - registry.metrics[highPerformanceWorker.ID] = &WorkerMetrics{ - TasksCompleted: 100, - TasksFailed: 5, - SuccessRate: 0.95, - AverageTaskTime: 10 * time.Minute, - LastTaskTime: time.Now().Add(-5 * time.Minute), - } - - registry.metrics[lowPerformanceWorker.ID] = &WorkerMetrics{ - TasksCompleted: 50, - TasksFailed: 10, - SuccessRate: 0.83, - AverageTaskTime: 20 * time.Minute, - LastTaskTime: time.Now().Add(-1 * time.Hour), - } - - // Create high priority task - task := &types.Task{ - ID: "important_task", - Type: types.TaskTypeErasureCoding, - Priority: types.TaskPriorityHigh, - Server: "server1", // Prefers server1 - } - - availableWorkers := []*types.Worker{highPerformanceWorker, lowPerformanceWorker} - selectedWorker := scheduler.SelectWorker(task, availableWorkers) - - if selectedWorker == nil { - t.Fatal("No worker selected") - } - - if selectedWorker.ID != "high_perf_worker" { - t.Errorf("Expected high performance worker to be selected, got %s", selectedWorker.ID) - } - - t.Log("✅ Worker selection test passed") -} - -func TestTaskAssignment_ServerAffinity(t *testing.T) { - registry 
:= NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - // Workers on different servers - worker1 := &types.Worker{ - ID: "worker1", - Address: "server1", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - CurrentLoad: 0, - } - - worker2 := &types.Worker{ - ID: "worker2", - Address: "server2", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - CurrentLoad: 0, - } - - registry.RegisterWorker(worker1) - registry.RegisterWorker(worker2) - - // Task that prefers server1 - task := &types.Task{ - ID: "affinity_task", - Type: types.TaskTypeVacuum, - Server: "server1", // Should prefer worker on server1 - } - - availableWorkers := []*types.Worker{worker1, worker2} - selectedWorker := scheduler.SelectWorker(task, availableWorkers) - - if selectedWorker == nil { - t.Fatal("No worker selected") - } - - if selectedWorker.Address != "server1" { - t.Errorf("Expected worker on server1 to be selected for server affinity") - } - - t.Log("✅ Server affinity test passed") -} - -func TestTaskAssignment_DuplicateTaskPrevention(t *testing.T) { - queue := NewPriorityTaskQueue() - - // Add initial task - task1 := &types.Task{ - ID: "task1", - Type: types.TaskTypeVacuum, - VolumeID: 1, - } - queue.Push(task1) - - // Check for duplicate - hasDuplicate := queue.HasTask(1, types.TaskTypeVacuum) - if !hasDuplicate { - t.Error("Should detect existing task for volume") - } - - // Check for non-existent task - hasNonExistent := queue.HasTask(2, types.TaskTypeVacuum) - if hasNonExistent { - t.Error("Should not detect task for different volume") - } - - // Check for different task type - hasDifferentType := queue.HasTask(1, types.TaskTypeErasureCoding) - if hasDifferentType { - t.Error("Should not detect different task type for same volume") - } - - t.Log("✅ Duplicate task prevention test passed") -} - -func TestTaskAssignment_TaskRemoval(t *testing.T) { - queue := NewPriorityTaskQueue() - - 
// Add tasks - task1 := &types.Task{ID: "task1", Priority: types.TaskPriorityNormal} - task2 := &types.Task{ID: "task2", Priority: types.TaskPriorityHigh} - task3 := &types.Task{ID: "task3", Priority: types.TaskPriorityLow} - - queue.Push(task1) - queue.Push(task2) - queue.Push(task3) - - if queue.Size() != 3 { - t.Errorf("Expected queue size 3, got %d", queue.Size()) - } - - // Remove middle priority task - removed := queue.RemoveTask("task1") - if !removed { - t.Error("Should have removed task1") - } - - if queue.Size() != 2 { - t.Errorf("Expected queue size 2 after removal, got %d", queue.Size()) - } - - // Verify order maintained (high priority first) - next := queue.Peek() - if next.ID != "task2" { - t.Errorf("Expected task2 (high priority) to be next, got %s", next.ID) - } - - t.Log("✅ Task removal test passed") -} - -func TestTaskAssignment_EdgeCases(t *testing.T) { - t.Run("EmptyQueue", func(t *testing.T) { - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - worker := &types.Worker{ - ID: "worker1", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - } - registry.RegisterWorker(worker) - - // Empty queue should return nil - task := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) - if task != nil { - t.Error("Empty queue should return nil task") - } - }) - - t.Run("UnknownWorker", func(t *testing.T) { - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - task := &types.Task{ID: "task1", Type: types.TaskTypeVacuum} - queue.Push(task) - - // Unknown worker should return nil - assignedTask := scheduler.GetNextTask("unknown_worker", []types.TaskType{types.TaskTypeVacuum}) - if assignedTask != nil { - t.Error("Unknown worker should not get tasks") - } - }) - - t.Run("InactiveWorker", func(t *testing.T) { - registry := NewWorkerRegistry() - - worker := &types.Worker{ - ID: 
"inactive_worker", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "inactive", - CurrentLoad: 0, - } - registry.RegisterWorker(worker) - - // Inactive worker should not be available - available := registry.GetAvailableWorkers() - if len(available) != 0 { - t.Error("Inactive worker should not be available") - } - }) - - t.Log("✅ Edge cases test passed") -} - -// Performance test for task assignment -func BenchmarkTaskAssignment_GetNextTask(b *testing.B) { - registry := NewWorkerRegistry() - queue := NewPriorityTaskQueue() - scheduler := NewTaskScheduler(registry, queue) - - // Setup worker - worker := &types.Worker{ - ID: "bench_worker", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - CurrentLoad: 0, - } - registry.RegisterWorker(worker) - - // Add many tasks - for i := 0; i < 1000; i++ { - task := &types.Task{ - ID: fmt.Sprintf("task_%d", i), - Type: types.TaskTypeVacuum, - Priority: types.TaskPriorityNormal, - } - queue.Push(task) - } - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - scheduler.GetNextTask("bench_worker", []types.TaskType{types.TaskTypeVacuum}) - } -} - -func BenchmarkTaskAssignment_WorkerSelection(b *testing.B) { - registry := NewWorkerRegistry() - scheduler := NewTaskScheduler(registry, nil) - - // Create many workers - workers := make([]*types.Worker, 100) - for i := 0; i < 100; i++ { - worker := &types.Worker{ - ID: fmt.Sprintf("worker_%d", i), - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - Status: "active", - CurrentLoad: i % 3, // Varying loads - } - registry.RegisterWorker(worker) - workers[i] = worker - } - - task := &types.Task{ - ID: "bench_task", - Type: types.TaskTypeVacuum, - } - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - scheduler.SelectWorker(task, workers) - } -} diff --git a/weed/admin/task/task_detectors.go b/weed/admin/task/task_detectors.go deleted file mode 100644 index 4e70fb475..000000000 --- a/weed/admin/task/task_detectors.go +++ /dev/null @@ -1,168 +0,0 @@ 
-package task - -import ( - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// ECDetector detects volumes that need erasure coding -type ECDetector struct { - minUtilization float64 - minIdleTime time.Duration -} - -// NewECDetector creates a new EC detector -func NewECDetector() *ECDetector { - return &ECDetector{ - minUtilization: 95.0, // 95% full - minIdleTime: time.Hour, // 1 hour idle - } -} - -// DetectECCandidates finds volumes that need erasure coding -func (ed *ECDetector) DetectECCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) { - var candidates []*VolumeCandidate - - for _, vol := range volumes { - if ed.isECCandidate(vol) { - candidate := &VolumeCandidate{ - VolumeID: vol.ID, - Server: vol.Server, - Collection: vol.Collection, - TaskType: types.TaskTypeErasureCoding, - Priority: ed.calculateECPriority(vol), - Reason: "Volume is full and idle, ready for erasure coding", - DetectedAt: time.Now(), - ScheduleAt: time.Now(), - Parameters: map[string]interface{}{ - "utilization": vol.GetUtilization(), - "idle_time": vol.GetIdleTime().String(), - "volume_size": vol.Size, - }, - } - candidates = append(candidates, candidate) - } - } - - glog.V(2).Infof("EC detector found %d candidates", len(candidates)) - return candidates, nil -} - -// isECCandidate checks if a volume is suitable for EC -func (ed *ECDetector) isECCandidate(vol *VolumeInfo) bool { - // Skip if read-only - if vol.ReadOnly { - return false - } - - // Skip if already has remote storage (likely already EC'd) - if vol.RemoteStorageKey != "" { - return false - } - - // Check utilization - if vol.GetUtilization() < ed.minUtilization { - return false - } - - // Check idle time - if vol.GetIdleTime() < ed.minIdleTime { - return false - } - - return true -} - -// calculateECPriority calculates priority for EC tasks -func (ed *ECDetector) calculateECPriority(vol *VolumeInfo) types.TaskPriority { - utilization := 
vol.GetUtilization() - idleTime := vol.GetIdleTime() - - // Higher priority for fuller volumes that have been idle longer - if utilization >= 98.0 && idleTime > 24*time.Hour { - return types.TaskPriorityHigh - } - if utilization >= 96.0 && idleTime > 6*time.Hour { - return types.TaskPriorityNormal - } - return types.TaskPriorityLow -} - -// VacuumDetector detects volumes that need vacuum operations -type VacuumDetector struct { - minGarbageRatio float64 - minDeleteCount uint64 -} - -// NewVacuumDetector creates a new vacuum detector -func NewVacuumDetector() *VacuumDetector { - return &VacuumDetector{ - minGarbageRatio: 0.3, // 30% garbage - minDeleteCount: 100, // At least 100 deleted files - } -} - -// DetectVacuumCandidates finds volumes that need vacuum operations -func (vd *VacuumDetector) DetectVacuumCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) { - var candidates []*VolumeCandidate - - for _, vol := range volumes { - if vd.isVacuumCandidate(vol) { - candidate := &VolumeCandidate{ - VolumeID: vol.ID, - Server: vol.Server, - Collection: vol.Collection, - TaskType: types.TaskTypeVacuum, - Priority: vd.calculateVacuumPriority(vol), - Reason: "Volume has high garbage ratio and needs vacuum", - DetectedAt: time.Now(), - ScheduleAt: time.Now(), - Parameters: map[string]interface{}{ - "garbage_ratio": vol.GetGarbageRatio(), - "delete_count": vol.DeleteCount, - "deleted_byte_count": vol.DeletedByteCount, - }, - } - candidates = append(candidates, candidate) - } - } - - glog.V(2).Infof("Vacuum detector found %d candidates", len(candidates)) - return candidates, nil -} - -// isVacuumCandidate checks if a volume needs vacuum -func (vd *VacuumDetector) isVacuumCandidate(vol *VolumeInfo) bool { - // Skip if read-only - if vol.ReadOnly { - return false - } - - // Check garbage ratio - if vol.GetGarbageRatio() < vd.minGarbageRatio { - return false - } - - // Check delete count - if vol.DeleteCount < vd.minDeleteCount { - return false - } - - return true -} - 
-// calculateVacuumPriority calculates priority for vacuum tasks -func (vd *VacuumDetector) calculateVacuumPriority(vol *VolumeInfo) types.TaskPriority { - garbageRatio := vol.GetGarbageRatio() - - // Higher priority for volumes with more garbage - if garbageRatio >= 0.6 { - return types.TaskPriorityHigh - } - if garbageRatio >= 0.4 { - return types.TaskPriorityNormal - } - return types.TaskPriorityLow -} diff --git a/weed/admin/task/task_discovery.go b/weed/admin/task/task_discovery.go deleted file mode 100644 index 285a453a9..000000000 --- a/weed/admin/task/task_discovery.go +++ /dev/null @@ -1,161 +0,0 @@ -package task - -import ( - "context" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" - "github.com/seaweedfs/seaweedfs/weed/wdclient" -) - -// TaskDiscoveryEngine discovers volumes that need maintenance tasks -type TaskDiscoveryEngine struct { - masterClient *wdclient.MasterClient - scanInterval time.Duration - ecDetector *ECDetector - vacuumDetector *VacuumDetector -} - -// NewTaskDiscoveryEngine creates a new task discovery engine -func NewTaskDiscoveryEngine(masterClient *wdclient.MasterClient, scanInterval time.Duration) *TaskDiscoveryEngine { - return &TaskDiscoveryEngine{ - masterClient: masterClient, - scanInterval: scanInterval, - ecDetector: NewECDetector(), - vacuumDetector: NewVacuumDetector(), - } -} - -// ScanForTasks scans for volumes that need maintenance tasks -func (tde *TaskDiscoveryEngine) ScanForTasks() ([]*VolumeCandidate, error) { - var candidates []*VolumeCandidate - - // Get cluster topology and volume information - volumeInfos, err := tde.getVolumeInformation() - if err != nil { - return nil, err - } - - // Scan for EC candidates - ecCandidates, err := tde.ecDetector.DetectECCandidates(volumeInfos) - if err != nil { - glog.Errorf("EC detection failed: %v", err) - } else { - candidates = append(candidates, ecCandidates...) 
- } - - // Scan for vacuum candidates - vacuumCandidates, err := tde.vacuumDetector.DetectVacuumCandidates(volumeInfos) - if err != nil { - glog.Errorf("Vacuum detection failed: %v", err) - } else { - candidates = append(candidates, vacuumCandidates...) - } - - glog.V(1).Infof("Task discovery found %d candidates (%d EC, %d vacuum)", - len(candidates), len(ecCandidates), len(vacuumCandidates)) - - return candidates, nil -} - -// getVolumeInformation retrieves volume information from master -func (tde *TaskDiscoveryEngine) getVolumeInformation() ([]*VolumeInfo, error) { - var volumeInfos []*VolumeInfo - - err := tde.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { - resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) - if err != nil { - return err - } - - if resp.TopologyInfo != nil { - for _, dc := range resp.TopologyInfo.DataCenterInfos { - for _, rack := range dc.RackInfos { - for _, node := range rack.DataNodeInfos { - for _, diskInfo := range node.DiskInfos { - for _, volInfo := range diskInfo.VolumeInfos { - volumeInfo := &VolumeInfo{ - ID: volInfo.Id, - Size: volInfo.Size, - Collection: volInfo.Collection, - FileCount: volInfo.FileCount, - DeleteCount: volInfo.DeleteCount, - DeletedByteCount: volInfo.DeletedByteCount, - ReadOnly: volInfo.ReadOnly, - Server: node.Id, - DataCenter: dc.Id, - Rack: rack.Id, - DiskType: volInfo.DiskType, - ModifiedAtSecond: volInfo.ModifiedAtSecond, - RemoteStorageKey: volInfo.RemoteStorageKey, - } - volumeInfos = append(volumeInfos, volumeInfo) - } - } - } - } - } - } - - return nil - }) - - return volumeInfos, err -} - -// VolumeInfo contains detailed volume information -type VolumeInfo struct { - ID uint32 - Size uint64 - Collection string - FileCount uint64 - DeleteCount uint64 - DeletedByteCount uint64 - ReadOnly bool - Server string - DataCenter string - Rack string - DiskType string - ModifiedAtSecond int64 - RemoteStorageKey string -} - -// GetUtilization 
calculates volume utilization percentage -func (vi *VolumeInfo) GetUtilization() float64 { - if vi.Size == 0 { - return 0.0 - } - // Assuming max volume size of 30GB - maxSize := uint64(30 * 1024 * 1024 * 1024) - return float64(vi.Size) / float64(maxSize) * 100.0 -} - -// GetGarbageRatio calculates the garbage ratio -func (vi *VolumeInfo) GetGarbageRatio() float64 { - if vi.Size == 0 { - return 0.0 - } - return float64(vi.DeletedByteCount) / float64(vi.Size) -} - -// GetIdleTime calculates how long the volume has been idle -func (vi *VolumeInfo) GetIdleTime() time.Duration { - lastModified := time.Unix(vi.ModifiedAtSecond, 0) - return time.Since(lastModified) -} - -// IsECCandidate checks if volume is a candidate for EC -func (vi *VolumeInfo) IsECCandidate() bool { - return !vi.ReadOnly && - vi.GetUtilization() >= 95.0 && - vi.GetIdleTime() > time.Hour && - vi.RemoteStorageKey == "" // Not already EC'd -} - -// IsVacuumCandidate checks if volume is a candidate for vacuum -func (vi *VolumeInfo) IsVacuumCandidate() bool { - return !vi.ReadOnly && - vi.GetGarbageRatio() >= 0.3 && - vi.DeleteCount > 0 -} diff --git a/weed/admin/task/task_scheduler.go b/weed/admin/task/task_scheduler.go deleted file mode 100644 index 6a7fecfc9..000000000 --- a/weed/admin/task/task_scheduler.go +++ /dev/null @@ -1,257 +0,0 @@ -package task - -import ( - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TaskScheduler handles task assignment to workers -type TaskScheduler struct { - workerRegistry *WorkerRegistry - taskQueue *PriorityTaskQueue - mutex sync.RWMutex -} - -// NewTaskScheduler creates a new task scheduler -func NewTaskScheduler(registry *WorkerRegistry, queue *PriorityTaskQueue) *TaskScheduler { - return &TaskScheduler{ - workerRegistry: registry, - taskQueue: queue, - } -} - -// GetNextTask gets the next suitable task for a worker -func (ts *TaskScheduler) GetNextTask(workerID string, capabilities 
[]types.TaskType) *types.Task { - ts.mutex.RLock() - defer ts.mutex.RUnlock() - - // Get worker info - _, exists := ts.workerRegistry.GetWorker(workerID) - if !exists { - return nil - } - - // Check worker capabilities - capabilityMap := make(map[types.TaskType]bool) - for _, cap := range capabilities { - capabilityMap[cap] = true - } - - // Find next suitable task - tasks := ts.taskQueue.GetTasks() - for _, task := range tasks { - // Check if worker can handle this task type - if !capabilityMap[task.Type] { - continue - } - - // Check if task is ready to be scheduled - if !task.ScheduledAt.IsZero() && task.ScheduledAt.After(time.Now()) { - continue - } - - // Additional checks can be added here - // (e.g., server affinity, resource requirements) - - return task - } - - return nil -} - -// SelectWorker selects the best worker for a task -func (ts *TaskScheduler) SelectWorker(task *types.Task, availableWorkers []*types.Worker) *types.Worker { - ts.mutex.RLock() - defer ts.mutex.RUnlock() - - var bestWorker *types.Worker - bestScore := -1.0 - - for _, worker := range availableWorkers { - // Check if worker supports this task type - if !ts.workerSupportsTask(worker, task.Type) { - continue - } - - // Calculate selection score - score := ts.calculateSelectionScore(worker, task) - if bestWorker == nil || score > bestScore { - bestWorker = worker - bestScore = score - } - } - - if bestWorker != nil { - glog.V(2).Infof("Selected worker %s for task %s (score: %.2f)", bestWorker.ID, task.Type, bestScore) - } - - return bestWorker -} - -// workerSupportsTask checks if a worker supports a task type -func (ts *TaskScheduler) workerSupportsTask(worker *types.Worker, taskType types.TaskType) bool { - for _, capability := range worker.Capabilities { - if capability == taskType { - return true - } - } - return false -} - -// calculateSelectionScore calculates a score for worker selection -func (ts *TaskScheduler) calculateSelectionScore(worker *types.Worker, task *types.Task) 
float64 { - // Base score from worker registry - baseScore := ts.workerRegistry.calculateWorkerScore(worker) - - // Task-specific adjustments - taskScore := baseScore - - // Priority adjustment - switch task.Priority { - case types.TaskPriorityHigh: - taskScore *= 1.2 // Prefer high-performing workers for high-priority tasks - case types.TaskPriorityLow: - taskScore *= 0.9 // Low-priority tasks can use any available worker - } - - // Server affinity bonus (if worker and volume are on same server) - if task.Server != "" && worker.Address == task.Server { - taskScore += 0.1 - } - - // Retry penalty (prefer different workers for retried tasks) - if task.RetryCount > 0 { - taskScore *= 0.8 - } - - return taskScore -} - -// PriorityTaskQueue implements a priority queue for tasks -type PriorityTaskQueue struct { - tasks []*types.Task - mutex sync.RWMutex -} - -// NewPriorityTaskQueue creates a new priority task queue -func NewPriorityTaskQueue() *PriorityTaskQueue { - return &PriorityTaskQueue{ - tasks: make([]*types.Task, 0), - } -} - -// Push adds a task to the queue -func (ptq *PriorityTaskQueue) Push(task *types.Task) { - ptq.mutex.Lock() - defer ptq.mutex.Unlock() - - // Insert task in priority order (highest priority first) - inserted := false - for i, existingTask := range ptq.tasks { - if task.Priority > existingTask.Priority { - // Insert at position i - ptq.tasks = append(ptq.tasks[:i], append([]*types.Task{task}, ptq.tasks[i:]...)...) 
- inserted = true - break - } - } - - if !inserted { - // Add to end - ptq.tasks = append(ptq.tasks, task) - } - - glog.V(3).Infof("Added task %s to queue (priority: %d, queue size: %d)", task.ID, task.Priority, len(ptq.tasks)) -} - -// Pop removes and returns the highest priority task -func (ptq *PriorityTaskQueue) Pop() *types.Task { - ptq.mutex.Lock() - defer ptq.mutex.Unlock() - - if len(ptq.tasks) == 0 { - return nil - } - - task := ptq.tasks[0] - ptq.tasks = ptq.tasks[1:] - return task -} - -// Peek returns the highest priority task without removing it -func (ptq *PriorityTaskQueue) Peek() *types.Task { - ptq.mutex.RLock() - defer ptq.mutex.RUnlock() - - if len(ptq.tasks) == 0 { - return nil - } - - return ptq.tasks[0] -} - -// IsEmpty returns true if the queue is empty -func (ptq *PriorityTaskQueue) IsEmpty() bool { - ptq.mutex.RLock() - defer ptq.mutex.RUnlock() - - return len(ptq.tasks) == 0 -} - -// Size returns the number of tasks in the queue -func (ptq *PriorityTaskQueue) Size() int { - ptq.mutex.RLock() - defer ptq.mutex.RUnlock() - - return len(ptq.tasks) -} - -// HasTask checks if a task exists for a volume and task type -func (ptq *PriorityTaskQueue) HasTask(volumeID uint32, taskType types.TaskType) bool { - ptq.mutex.RLock() - defer ptq.mutex.RUnlock() - - for _, task := range ptq.tasks { - if task.VolumeID == volumeID && task.Type == taskType { - return true - } - } - return false -} - -// GetTasks returns a copy of all tasks in the queue -func (ptq *PriorityTaskQueue) GetTasks() []*types.Task { - ptq.mutex.RLock() - defer ptq.mutex.RUnlock() - - tasksCopy := make([]*types.Task, len(ptq.tasks)) - copy(tasksCopy, ptq.tasks) - return tasksCopy -} - -// RemoveTask removes a specific task from the queue -func (ptq *PriorityTaskQueue) RemoveTask(taskID string) bool { - ptq.mutex.Lock() - defer ptq.mutex.Unlock() - - for i, task := range ptq.tasks { - if task.ID == taskID { - ptq.tasks = append(ptq.tasks[:i], ptq.tasks[i+1:]...) 
- glog.V(3).Infof("Removed task %s from queue", taskID) - return true - } - } - return false -} - -// Clear removes all tasks from the queue -func (ptq *PriorityTaskQueue) Clear() { - ptq.mutex.Lock() - defer ptq.mutex.Unlock() - - ptq.tasks = ptq.tasks[:0] - glog.V(3).Infof("Cleared task queue") -} diff --git a/weed/admin/task/task_types.go b/weed/admin/task/task_types.go deleted file mode 100644 index bfe507c7d..000000000 --- a/weed/admin/task/task_types.go +++ /dev/null @@ -1,68 +0,0 @@ -package task - -import ( - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// InProgressTask represents a task currently being executed -type InProgressTask struct { - Task *types.Task - WorkerID string - StartedAt time.Time - LastUpdate time.Time - Progress float64 - EstimatedEnd time.Time - VolumeReserved bool // Reserved for capacity planning -} - -// VolumeCandidate represents a volume that needs maintenance -type VolumeCandidate struct { - VolumeID uint32 - Server string - Collection string - TaskType types.TaskType - Priority types.TaskPriority - Reason string - DetectedAt time.Time - ScheduleAt time.Time - Parameters map[string]interface{} -} - -// VolumeChange represents a volume state change -type VolumeChange struct { - VolumeID uint32 - ChangeType ChangeType - OldCapacity int64 - NewCapacity int64 - TaskID string - CompletedAt time.Time - ReportedToMaster bool -} - -// ChangeType represents the type of volume change -type ChangeType string - -const ( - ChangeTypeECEncoding ChangeType = "ec_encoding" - ChangeTypeVacuumComplete ChangeType = "vacuum_completed" -) - -// WorkerMetrics represents performance metrics for a worker -type WorkerMetrics struct { - TasksCompleted int - TasksFailed int - AverageTaskTime time.Duration - LastTaskTime time.Time - SuccessRate float64 -} - -// VolumeReservation represents a reserved volume capacity -type VolumeReservation struct { - VolumeID uint32 - TaskID string - ReservedAt time.Time - ExpectedEnd time.Time - 
CapacityDelta int64 // Expected change in capacity -} diff --git a/weed/admin/task/volume_state_manager.go b/weed/admin/task/volume_state_manager.go deleted file mode 100644 index a0058096f..000000000 --- a/weed/admin/task/volume_state_manager.go +++ /dev/null @@ -1,640 +0,0 @@ -package task - -import ( - "context" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" - "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// VolumeStateManager provides comprehensive tracking of all volume and shard states -type VolumeStateManager struct { - masterClient *wdclient.MasterClient - volumes map[uint32]*VolumeState - ecShards map[uint32]*ECShardState // Key: VolumeID - inProgressTasks map[string]*TaskImpact // Key: TaskID - plannedOperations map[string]*PlannedOperation // Key: OperationID - capacityCache map[string]*CapacityInfo // Key: Server address - lastMasterSync time.Time - mutex sync.RWMutex -} - -// VolumeState tracks comprehensive state of a volume -type VolumeState struct { - VolumeID uint32 - CurrentState *VolumeInfo // Current state from master - InProgressTasks []*TaskImpact // Tasks currently affecting this volume - PlannedChanges []*PlannedOperation // Future operations planned - PredictedState *VolumeInfo // Predicted state after all operations - LastMasterUpdate time.Time - Inconsistencies []StateInconsistency -} - -// ECShardState tracks EC shard information -type ECShardState struct { - VolumeID uint32 - CurrentShards map[int]*ShardInfo // Current shards from master (0-13) - InProgressTasks []*TaskImpact // Tasks affecting shards - PlannedShards map[int]*PlannedShard // Planned shard operations - PredictedShards map[int]*ShardInfo // Predicted final state - LastUpdate time.Time -} - -// ShardInfo represents information about an EC shard -type ShardInfo struct { - ShardID int 
- Server string - Size uint64 - Status ShardStatus - LastUpdate time.Time -} - -// ShardStatus represents the status of a shard -type ShardStatus string - -const ( - ShardStatusExists ShardStatus = "exists" - ShardStatusCreating ShardStatus = "creating" - ShardStatusDeleting ShardStatus = "deleting" - ShardStatusMissing ShardStatus = "missing" - ShardStatusCorrupted ShardStatus = "corrupted" -) - -// TaskImpact describes how a task affects volume/shard state -type TaskImpact struct { - TaskID string - TaskType types.TaskType - VolumeID uint32 - WorkerID string - StartedAt time.Time - EstimatedEnd time.Time - - // Volume impacts - VolumeChanges *VolumeChanges - - // Shard impacts - ShardChanges map[int]*ShardChange // Key: ShardID - - // Capacity impacts - CapacityDelta map[string]int64 // Key: Server, Value: capacity change -} - -// VolumeChanges describes changes to a volume -type VolumeChanges struct { - SizeChange int64 - WillBeDeleted bool - WillBeCreated bool - WillBecomeReadOnly bool - CollectionChange string - DiskTypeChange string -} - -// ShardChange describes changes to a shard -type ShardChange struct { - ShardID int - WillBeCreated bool - WillBeDeleted bool - TargetServer string - SizeChange int64 -} - -// PlannedOperation represents a future operation -type PlannedOperation struct { - OperationID string - Type OperationType - VolumeID uint32 - ScheduledAt time.Time - Priority types.TaskPriority - Prerequisites []string // Other operation IDs that must complete first - Impact *TaskImpact -} - -// OperationType represents different types of planned operations -type OperationType string - -const ( - OperationECEncode OperationType = "ec_encode" - OperationECRebuild OperationType = "ec_rebuild" - OperationECBalance OperationType = "ec_balance" - OperationVacuum OperationType = "vacuum" - OperationVolumeMove OperationType = "volume_move" - OperationShardMove OperationType = "shard_move" - OperationVolumeDelete OperationType = "volume_delete" -) - -// 
CapacityInfo tracks server capacity information -type CapacityInfo struct { - Server string - TotalCapacity int64 - UsedCapacity int64 - ReservedCapacity int64 // Capacity reserved for in-progress tasks - PredictedUsage int64 // Predicted usage after all operations - LastUpdate time.Time -} - -// StateInconsistency represents detected inconsistencies -type StateInconsistency struct { - Type InconsistencyType - Description string - DetectedAt time.Time - Severity SeverityLevel - VolumeID uint32 - ShardID *int -} - -// InconsistencyType represents different types of state inconsistencies -type InconsistencyType string - -const ( - InconsistencyVolumeMissing InconsistencyType = "volume_missing" - InconsistencyVolumeUnexpected InconsistencyType = "volume_unexpected" - InconsistencyShardMissing InconsistencyType = "shard_missing" - InconsistencyShardUnexpected InconsistencyType = "shard_unexpected" - InconsistencyCapacityMismatch InconsistencyType = "capacity_mismatch" - InconsistencyTaskOrphaned InconsistencyType = "task_orphaned" - InconsistencyDuplicateTask InconsistencyType = "duplicate_task" -) - -// SeverityLevel represents the severity of an inconsistency -type SeverityLevel string - -const ( - SeverityLow SeverityLevel = "low" - SeverityMedium SeverityLevel = "medium" - SeverityHigh SeverityLevel = "high" - SeverityCritical SeverityLevel = "critical" -) - -// NewVolumeStateManager creates a new volume state manager -func NewVolumeStateManager(masterClient *wdclient.MasterClient) *VolumeStateManager { - return &VolumeStateManager{ - masterClient: masterClient, - volumes: make(map[uint32]*VolumeState), - ecShards: make(map[uint32]*ECShardState), - inProgressTasks: make(map[string]*TaskImpact), - plannedOperations: make(map[string]*PlannedOperation), - capacityCache: make(map[string]*CapacityInfo), - } -} - -// SyncWithMaster synchronizes state with the master server -func (vsm *VolumeStateManager) SyncWithMaster() error { - vsm.mutex.Lock() - defer 
vsm.mutex.Unlock() - - glog.V(2).Infof("Syncing volume state with master") - - // Get current volume list from master - masterVolumes, masterShards, err := vsm.fetchMasterState() - if err != nil { - return err - } - - // Update volume states - vsm.updateVolumeStates(masterVolumes) - - // Update shard states - vsm.updateShardStates(masterShards) - - // Detect inconsistencies - vsm.detectInconsistencies() - - // Update capacity information - vsm.updateCapacityInfo() - - // Recalculate predicted states - vsm.recalculatePredictedStates() - - vsm.lastMasterSync = time.Now() - glog.V(2).Infof("Master sync completed, tracking %d volumes, %d EC volumes", - len(vsm.volumes), len(vsm.ecShards)) - - return nil -} - -// RegisterTaskImpact registers the impact of a new task -func (vsm *VolumeStateManager) RegisterTaskImpact(taskID string, impact *TaskImpact) { - vsm.mutex.Lock() - defer vsm.mutex.Unlock() - - vsm.inProgressTasks[taskID] = impact - - // Update volume state - if volumeState, exists := vsm.volumes[impact.VolumeID]; exists { - volumeState.InProgressTasks = append(volumeState.InProgressTasks, impact) - } - - // Update shard state for EC operations - if impact.TaskType == types.TaskTypeErasureCoding { - if shardState, exists := vsm.ecShards[impact.VolumeID]; exists { - shardState.InProgressTasks = append(shardState.InProgressTasks, impact) - } - } - - // Update capacity reservations - for server, capacityDelta := range impact.CapacityDelta { - if capacity, exists := vsm.capacityCache[server]; exists { - capacity.ReservedCapacity += capacityDelta - } - } - - // Recalculate predicted states - vsm.recalculatePredictedStates() - - glog.V(2).Infof("Registered task impact: %s for volume %d", taskID, impact.VolumeID) -} - -// UnregisterTaskImpact removes a completed task's impact -func (vsm *VolumeStateManager) UnregisterTaskImpact(taskID string) { - vsm.mutex.Lock() - defer vsm.mutex.Unlock() - - impact, exists := vsm.inProgressTasks[taskID] - if !exists { - return - } - - 
delete(vsm.inProgressTasks, taskID) - - // Remove from volume state - if volumeState, exists := vsm.volumes[impact.VolumeID]; exists { - vsm.removeTaskFromVolume(volumeState, taskID) - } - - // Remove from shard state - if shardState, exists := vsm.ecShards[impact.VolumeID]; exists { - vsm.removeTaskFromShards(shardState, taskID) - } - - // Update capacity reservations - for server, capacityDelta := range impact.CapacityDelta { - if capacity, exists := vsm.capacityCache[server]; exists { - capacity.ReservedCapacity -= capacityDelta - } - } - - // Recalculate predicted states - vsm.recalculatePredictedStates() - - glog.V(2).Infof("Unregistered task impact: %s", taskID) -} - -// GetAccurateCapacity returns accurate capacity information for a server -func (vsm *VolumeStateManager) GetAccurateCapacity(server string) *CapacityInfo { - vsm.mutex.RLock() - defer vsm.mutex.RUnlock() - - if capacity, exists := vsm.capacityCache[server]; exists { - // Return a copy to avoid external modifications - return &CapacityInfo{ - Server: capacity.Server, - TotalCapacity: capacity.TotalCapacity, - UsedCapacity: capacity.UsedCapacity, - ReservedCapacity: capacity.ReservedCapacity, - PredictedUsage: capacity.PredictedUsage, - LastUpdate: capacity.LastUpdate, - } - } - return nil -} - -// GetVolumeState returns the current state of a volume -func (vsm *VolumeStateManager) GetVolumeState(volumeID uint32) *VolumeState { - vsm.mutex.RLock() - defer vsm.mutex.RUnlock() - - if state, exists := vsm.volumes[volumeID]; exists { - // Return a copy to avoid external modifications - return vsm.copyVolumeState(state) - } - return nil -} - -// GetECShardState returns the current state of EC shards for a volume -func (vsm *VolumeStateManager) GetECShardState(volumeID uint32) *ECShardState { - vsm.mutex.RLock() - defer vsm.mutex.RUnlock() - - if state, exists := vsm.ecShards[volumeID]; exists { - return vsm.copyECShardState(state) - } - return nil -} - -// CanAssignVolumeToServer checks if a volume 
can be assigned to a server -func (vsm *VolumeStateManager) CanAssignVolumeToServer(volumeSize int64, server string) bool { - vsm.mutex.RLock() - defer vsm.mutex.RUnlock() - - capacity := vsm.capacityCache[server] - if capacity == nil { - return false - } - - // Calculate available capacity: Total - Used - Reserved - availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity - return availableCapacity >= volumeSize -} - -// PlanOperation schedules a future operation -func (vsm *VolumeStateManager) PlanOperation(operation *PlannedOperation) { - vsm.mutex.Lock() - defer vsm.mutex.Unlock() - - vsm.plannedOperations[operation.OperationID] = operation - - // Add to volume planned changes - if volumeState, exists := vsm.volumes[operation.VolumeID]; exists { - volumeState.PlannedChanges = append(volumeState.PlannedChanges, operation) - } - - glog.V(2).Infof("Planned operation: %s for volume %d", operation.OperationID, operation.VolumeID) -} - -// GetPendingChange returns pending change for a volume -func (vsm *VolumeStateManager) GetPendingChange(volumeID uint32) *VolumeChange { - vsm.mutex.RLock() - defer vsm.mutex.RUnlock() - - // Look for pending changes in volume state - if volumeState, exists := vsm.volumes[volumeID]; exists { - // Return the most recent pending change - if len(volumeState.PlannedChanges) > 0 { - latestOp := volumeState.PlannedChanges[len(volumeState.PlannedChanges)-1] - if latestOp.Impact != nil && latestOp.Impact.VolumeChanges != nil { - return &VolumeChange{ - VolumeID: volumeID, - ChangeType: ChangeType(latestOp.Type), - OldCapacity: int64(volumeState.CurrentState.Size), - NewCapacity: int64(volumeState.CurrentState.Size) + latestOp.Impact.VolumeChanges.SizeChange, - TaskID: latestOp.Impact.TaskID, - CompletedAt: time.Time{}, // Not completed yet - ReportedToMaster: false, - } - } - } - } - - return nil -} - -// fetchMasterState retrieves current state from master -func (vsm *VolumeStateManager) 
fetchMasterState() (map[uint32]*VolumeInfo, map[uint32]map[int]*ShardInfo, error) { - volumes := make(map[uint32]*VolumeInfo) - shards := make(map[uint32]map[int]*ShardInfo) - - err := vsm.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { - // Fetch volume list - resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) - if err != nil { - return err - } - - // Process topology info - if resp.TopologyInfo != nil { - for _, dc := range resp.TopologyInfo.DataCenterInfos { - for _, rack := range dc.RackInfos { - for _, node := range rack.DataNodeInfos { - for _, diskInfo := range node.DiskInfos { - // Process regular volumes - for _, volInfo := range diskInfo.VolumeInfos { - volumes[volInfo.Id] = &VolumeInfo{ - ID: volInfo.Id, - Size: volInfo.Size, - Collection: volInfo.Collection, - FileCount: volInfo.FileCount, - DeleteCount: volInfo.DeleteCount, - DeletedByteCount: volInfo.DeletedByteCount, - ReadOnly: volInfo.ReadOnly, - Server: node.Id, - DataCenter: dc.Id, - Rack: rack.Id, - DiskType: volInfo.DiskType, - ModifiedAtSecond: volInfo.ModifiedAtSecond, - RemoteStorageKey: volInfo.RemoteStorageKey, - } - } - - // Process EC shards - for _, ecShardInfo := range diskInfo.EcShardInfos { - volumeID := ecShardInfo.Id - if shards[volumeID] == nil { - shards[volumeID] = make(map[int]*ShardInfo) - } - - // Decode shard bits - for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ { - if (ecShardInfo.EcIndexBits & (1 << uint(shardID))) != 0 { - shards[volumeID][shardID] = &ShardInfo{ - ShardID: shardID, - Server: node.Id, - Size: 0, // Size would need to be fetched separately - Status: ShardStatusExists, - LastUpdate: time.Now(), - } - } - } - } - } - } - } - } - } - - return nil - }) - - return volumes, shards, err -} - -// updateVolumeStates updates volume states based on master data -func (vsm *VolumeStateManager) updateVolumeStates(masterVolumes map[uint32]*VolumeInfo) { - now := time.Now() - - // Update 
existing volumes and add new ones - for volumeID, masterVolume := range masterVolumes { - if volumeState, exists := vsm.volumes[volumeID]; exists { - // Update existing volume - oldState := volumeState.CurrentState - volumeState.CurrentState = masterVolume - volumeState.LastMasterUpdate = now - - // Check for unexpected changes - if oldState != nil && vsm.hasUnexpectedChanges(oldState, masterVolume) { - vsm.addInconsistency(volumeState, InconsistencyVolumeUnexpected, - "Volume changed unexpectedly since last sync", SeverityMedium) - } - } else { - // New volume detected - vsm.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: masterVolume, - InProgressTasks: []*TaskImpact{}, - PlannedChanges: []*PlannedOperation{}, - LastMasterUpdate: now, - Inconsistencies: []StateInconsistency{}, - } - } - } - - // Detect missing volumes (volumes we knew about but master doesn't report) - for volumeID, volumeState := range vsm.volumes { - if _, existsInMaster := masterVolumes[volumeID]; !existsInMaster { - // Check if this is expected (due to deletion task) - if !vsm.isVolumeDeletionExpected(volumeID) { - vsm.addInconsistency(volumeState, InconsistencyVolumeMissing, - "Volume missing from master but not expected to be deleted", SeverityHigh) - } - } - } -} - -// updateShardStates updates EC shard states -func (vsm *VolumeStateManager) updateShardStates(masterShards map[uint32]map[int]*ShardInfo) { - now := time.Now() - - // Update existing shard states - for volumeID, shardMap := range masterShards { - if shardState, exists := vsm.ecShards[volumeID]; exists { - shardState.CurrentShards = shardMap - shardState.LastUpdate = now - } else { - vsm.ecShards[volumeID] = &ECShardState{ - VolumeID: volumeID, - CurrentShards: shardMap, - InProgressTasks: []*TaskImpact{}, - PlannedShards: make(map[int]*PlannedShard), - PredictedShards: make(map[int]*ShardInfo), - LastUpdate: now, - } - } - } - - // Check for missing shards that we expected to exist - for volumeID, 
shardState := range vsm.ecShards { - if masterShardMap, exists := masterShards[volumeID]; exists { - vsm.validateShardConsistency(shardState, masterShardMap) - } - } -} - -// detectInconsistencies identifies state inconsistencies -func (vsm *VolumeStateManager) detectInconsistencies() { - for _, volumeState := range vsm.volumes { - vsm.detectVolumeInconsistencies(volumeState) - } - - for _, shardState := range vsm.ecShards { - vsm.detectShardInconsistencies(shardState) - } - - vsm.detectOrphanedTasks() - vsm.detectDuplicateTasks() - vsm.detectCapacityInconsistencies() -} - -// updateCapacityInfo updates server capacity information -func (vsm *VolumeStateManager) updateCapacityInfo() { - for server := range vsm.capacityCache { - vsm.recalculateServerCapacity(server) - } -} - -// recalculatePredictedStates recalculates predicted states after all operations -func (vsm *VolumeStateManager) recalculatePredictedStates() { - for _, volumeState := range vsm.volumes { - vsm.calculatePredictedVolumeState(volumeState) - } - - for _, shardState := range vsm.ecShards { - vsm.calculatePredictedShardState(shardState) - } -} - -// Helper methods (simplified implementations) - -func (vsm *VolumeStateManager) hasUnexpectedChanges(old, new *VolumeInfo) bool { - return old.Size != new.Size || old.ReadOnly != new.ReadOnly -} - -func (vsm *VolumeStateManager) isVolumeDeletionExpected(volumeID uint32) bool { - for _, impact := range vsm.inProgressTasks { - if impact.VolumeID == volumeID && impact.VolumeChanges != nil && impact.VolumeChanges.WillBeDeleted { - return true - } - } - return false -} - -func (vsm *VolumeStateManager) addInconsistency(volumeState *VolumeState, incType InconsistencyType, desc string, severity SeverityLevel) { - inconsistency := StateInconsistency{ - Type: incType, - Description: desc, - DetectedAt: time.Now(), - Severity: severity, - VolumeID: volumeState.VolumeID, - } - volumeState.Inconsistencies = append(volumeState.Inconsistencies, inconsistency) - - 
glog.Warningf("State inconsistency detected for volume %d: %s", volumeState.VolumeID, desc) -} - -func (vsm *VolumeStateManager) removeTaskFromVolume(volumeState *VolumeState, taskID string) { - for i, task := range volumeState.InProgressTasks { - if task.TaskID == taskID { - volumeState.InProgressTasks = append(volumeState.InProgressTasks[:i], volumeState.InProgressTasks[i+1:]...) - break - } - } -} - -func (vsm *VolumeStateManager) removeTaskFromShards(shardState *ECShardState, taskID string) { - for i, task := range shardState.InProgressTasks { - if task.TaskID == taskID { - shardState.InProgressTasks = append(shardState.InProgressTasks[:i], shardState.InProgressTasks[i+1:]...) - break - } - } -} - -func (vsm *VolumeStateManager) copyVolumeState(state *VolumeState) *VolumeState { - // Return a deep copy (implementation would be more detailed) - return &VolumeState{ - VolumeID: state.VolumeID, - CurrentState: state.CurrentState, - LastMasterUpdate: state.LastMasterUpdate, - } -} - -func (vsm *VolumeStateManager) copyECShardState(state *ECShardState) *ECShardState { - // Return a deep copy (implementation would be more detailed) - return &ECShardState{ - VolumeID: state.VolumeID, - LastUpdate: state.LastUpdate, - } -} - -// Placeholder implementations for consistency checking methods -func (vsm *VolumeStateManager) validateShardConsistency(shardState *ECShardState, masterShards map[int]*ShardInfo) { -} -func (vsm *VolumeStateManager) detectVolumeInconsistencies(volumeState *VolumeState) {} -func (vsm *VolumeStateManager) detectShardInconsistencies(shardState *ECShardState) {} -func (vsm *VolumeStateManager) detectOrphanedTasks() {} -func (vsm *VolumeStateManager) detectDuplicateTasks() {} -func (vsm *VolumeStateManager) detectCapacityInconsistencies() {} -func (vsm *VolumeStateManager) recalculateServerCapacity(server string) {} -func (vsm *VolumeStateManager) calculatePredictedVolumeState(volumeState *VolumeState) {} -func (vsm *VolumeStateManager) 
calculatePredictedShardState(shardState *ECShardState) {} - -// PlannedShard represents a planned shard operation -type PlannedShard struct { - ShardID int - Operation string // "create", "delete", "move" - TargetServer string - ScheduledAt time.Time -} diff --git a/weed/admin/task/volume_state_manager_test.go b/weed/admin/task/volume_state_manager_test.go deleted file mode 100644 index 1f98cf97a..000000000 --- a/weed/admin/task/volume_state_manager_test.go +++ /dev/null @@ -1,440 +0,0 @@ -package task - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -func TestVolumeStateManager_RegisterTaskImpact(t *testing.T) { - vsm := NewVolumeStateManager(nil) - - // Create test volume state - volumeID := uint32(1) - volumeState := &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ - ID: volumeID, - Size: 1024 * 1024 * 1024, // 1GB - }, - InProgressTasks: []*TaskImpact{}, - PlannedChanges: []*PlannedOperation{}, - Inconsistencies: []StateInconsistency{}, - } - vsm.volumes[volumeID] = volumeState - - // Create task impact - impact := &TaskImpact{ - TaskID: "test_task_1", - TaskType: types.TaskTypeErasureCoding, - VolumeID: volumeID, - WorkerID: "worker_1", - StartedAt: time.Now(), - EstimatedEnd: time.Now().Add(15 * time.Minute), - VolumeChanges: &VolumeChanges{ - WillBecomeReadOnly: true, - }, - ShardChanges: make(map[int]*ShardChange), - CapacityDelta: map[string]int64{"server1": 400 * 1024 * 1024}, // 400MB for shards - } - - // Register impact - vsm.RegisterTaskImpact(impact.TaskID, impact) - - // Verify impact was registered - if len(vsm.inProgressTasks) != 1 { - t.Errorf("Expected 1 in-progress task, got %d", len(vsm.inProgressTasks)) - } - - if len(volumeState.InProgressTasks) != 1 { - t.Errorf("Expected 1 task in volume state, got %d", len(volumeState.InProgressTasks)) - } - - // Verify task can be retrieved - retrievedImpact := vsm.inProgressTasks[impact.TaskID] - if retrievedImpact == nil { - 
t.Error("Task impact not found after registration") - } - - if retrievedImpact.TaskType != types.TaskTypeErasureCoding { - t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, retrievedImpact.TaskType) - } -} - -func TestVolumeStateManager_UnregisterTaskImpact(t *testing.T) { - vsm := NewVolumeStateManager(nil) - - // Setup test data - volumeID := uint32(1) - taskID := "test_task_1" - - volumeState := &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024}, - InProgressTasks: []*TaskImpact{}, - } - vsm.volumes[volumeID] = volumeState - - impact := &TaskImpact{ - TaskID: taskID, - TaskType: types.TaskTypeVacuum, - VolumeID: volumeID, - CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings - } - - // Register then unregister - vsm.RegisterTaskImpact(taskID, impact) - vsm.UnregisterTaskImpact(taskID) - - // Verify impact was removed - if len(vsm.inProgressTasks) != 0 { - t.Errorf("Expected 0 in-progress tasks, got %d", len(vsm.inProgressTasks)) - } - - if len(volumeState.InProgressTasks) != 0 { - t.Errorf("Expected 0 tasks in volume state, got %d", len(volumeState.InProgressTasks)) - } -} - -func TestVolumeStateManager_CanAssignVolumeToServer(t *testing.T) { - vsm := NewVolumeStateManager(nil) - - // Setup server capacity - serverID := "test_server" - capacity := &CapacityInfo{ - Server: serverID, - TotalCapacity: 10 * 1024 * 1024 * 1024, // 10GB - UsedCapacity: 3 * 1024 * 1024 * 1024, // 3GB used - ReservedCapacity: 1 * 1024 * 1024 * 1024, // 1GB reserved - PredictedUsage: 4 * 1024 * 1024 * 1024, // 4GB predicted total - } - vsm.capacityCache[serverID] = capacity - - tests := []struct { - name string - volumeSize int64 - expected bool - desc string - }{ - { - name: "Small volume fits", - volumeSize: 1 * 1024 * 1024 * 1024, // 1GB - expected: true, - desc: "1GB volume should fit in 6GB available space", - }, - { - name: "Large volume fits exactly", - volumeSize: 6 * 1024 * 
1024 * 1024, // 6GB - expected: true, - desc: "6GB volume should fit exactly in available space", - }, - { - name: "Volume too large", - volumeSize: 7 * 1024 * 1024 * 1024, // 7GB - expected: false, - desc: "7GB volume should not fit in 6GB available space", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := vsm.CanAssignVolumeToServer(tt.volumeSize, serverID) - if result != tt.expected { - t.Errorf("CanAssignVolumeToServer() = %v, want %v. %s", result, tt.expected, tt.desc) - } - }) - } -} - -func TestVolumeStateManager_GetPendingChange(t *testing.T) { - vsm := NewVolumeStateManager(nil) - - volumeID := uint32(1) - - // Create volume with planned operation - volumeState := &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ - ID: volumeID, - Size: 2 * 1024 * 1024 * 1024, // 2GB - }, - PlannedChanges: []*PlannedOperation{ - { - OperationID: "op_1", - Type: OperationVacuum, - VolumeID: volumeID, - Impact: &TaskImpact{ - TaskID: "task_1", - VolumeChanges: &VolumeChanges{ - SizeChange: -500 * 1024 * 1024, // 500MB reduction - }, - }, - }, - }, - } - vsm.volumes[volumeID] = volumeState - - // Test getting pending change - change := vsm.GetPendingChange(volumeID) - - if change == nil { - t.Fatal("Expected pending change, got nil") - } - - if change.VolumeID != volumeID { - t.Errorf("Expected volume ID %d, got %d", volumeID, change.VolumeID) - } - - expectedNewCapacity := int64(2*1024*1024*1024 - 500*1024*1024) // 2GB - 500MB - if change.NewCapacity != expectedNewCapacity { - t.Errorf("Expected new capacity %d, got %d", expectedNewCapacity, change.NewCapacity) - } - - // Test no pending change - change2 := vsm.GetPendingChange(999) // Non-existent volume - if change2 != nil { - t.Error("Expected nil for non-existent volume, got change") - } -} - -func TestVolumeStateManager_StateConsistency(t *testing.T) { - // Test that demonstrates the core value: accurate state tracking - vsm := NewVolumeStateManager(nil) - - 
volumeID := uint32(1) - serverID := "test_server" - - // Setup initial state - vsm.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ - ID: volumeID, - Size: 28 * 1024 * 1024 * 1024, // 28GB - ready for EC - Server: serverID, - }, - InProgressTasks: []*TaskImpact{}, - PlannedChanges: []*PlannedOperation{}, - } - - vsm.capacityCache[serverID] = &CapacityInfo{ - Server: serverID, - TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB - UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used - PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used - } - - // Step 1: Register EC task impact - ecImpact := &TaskImpact{ - TaskID: "ec_task_1", - TaskType: types.TaskTypeErasureCoding, - VolumeID: volumeID, - VolumeChanges: &VolumeChanges{ - WillBecomeReadOnly: true, - }, - CapacityDelta: map[string]int64{ - serverID: 12 * 1024 * 1024 * 1024, // 12GB for EC shards (40% overhead) - }, - } - - vsm.RegisterTaskImpact(ecImpact.TaskID, ecImpact) - - // Verify capacity is reserved - capacity := vsm.GetAccurateCapacity(serverID) - expectedPredicted := int64(50 * 1024 * 1024 * 1024) // 50GB initially - if capacity.PredictedUsage != expectedPredicted { - t.Errorf("Expected predicted usage %d, got %d", expectedPredicted, capacity.PredictedUsage) - } - - // Verify reservation is tracked separately - expectedReserved := int64(12 * 1024 * 1024 * 1024) // 12GB for EC shards - if capacity.ReservedCapacity != expectedReserved { - t.Errorf("Expected reserved capacity %d, got %d", expectedReserved, capacity.ReservedCapacity) - } - - // Calculate available capacity correctly - availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity - // 100GB - 50GB - 12GB = 38GB available - expectedAvailable := int64(38 * 1024 * 1024 * 1024) - if availableCapacity != expectedAvailable { - t.Errorf("Expected available capacity %d, got %d", expectedAvailable, availableCapacity) - } - - // Step 2: Check assignment logic - should 
reject new large volume - canAssign := vsm.CanAssignVolumeToServer(40*1024*1024*1024, serverID) // 40GB volume - if canAssign { - t.Error("Should not be able to assign 40GB volume when only 38GB available after reservations") - } - - // Step 3: Complete EC task - vsm.UnregisterTaskImpact(ecImpact.TaskID) - - // Verify capacity is updated correctly - capacityAfter := vsm.GetAccurateCapacity(serverID) - if capacityAfter.ReservedCapacity != 0 { - t.Errorf("Expected 0 reserved capacity after task completion, got %d", capacityAfter.ReservedCapacity) - } - - t.Logf("✅ State consistency test passed - accurate capacity tracking throughout task lifecycle") -} - -func TestVolumeStateManager_ConcurrentTasks(t *testing.T) { - // Test multiple concurrent tasks affecting capacity - vsm := NewVolumeStateManager(nil) - - serverID := "test_server" - vsm.capacityCache[serverID] = &CapacityInfo{ - Server: serverID, - TotalCapacity: 50 * 1024 * 1024 * 1024, // 50GB - UsedCapacity: 10 * 1024 * 1024 * 1024, // 10GB used - PredictedUsage: 10 * 1024 * 1024 * 1024, // Initially 10GB - } - - // Register multiple tasks - tasks := []struct { - taskID string - volumeID uint32 - capacityDelta int64 - }{ - {"ec_task_1", 1, 15 * 1024 * 1024 * 1024}, // 15GB for EC - {"vacuum_task_1", 2, -5 * 1024 * 1024 * 1024}, // 5GB savings - {"ec_task_2", 3, 20 * 1024 * 1024 * 1024}, // 20GB for EC - } - - for _, task := range tasks { - // Setup volume state - vsm.volumes[task.volumeID] = &VolumeState{ - VolumeID: task.volumeID, - CurrentState: &VolumeInfo{ID: task.volumeID, Size: 25 * 1024 * 1024 * 1024}, - } - - impact := &TaskImpact{ - TaskID: task.taskID, - VolumeID: task.volumeID, - TaskType: types.TaskTypeErasureCoding, - CapacityDelta: map[string]int64{serverID: task.capacityDelta}, - } - - vsm.RegisterTaskImpact(task.taskID, impact) - } - - // Check cumulative capacity impact - capacity := vsm.GetAccurateCapacity(serverID) - expectedPredicted := int64(10*1024*1024*1024 + 15*1024*1024*1024 - 
5*1024*1024*1024 + 20*1024*1024*1024) // 40GB - - if capacity.PredictedUsage != expectedPredicted { - t.Errorf("Expected predicted usage %d GB, got %d GB", - expectedPredicted/(1024*1024*1024), capacity.PredictedUsage/(1024*1024*1024)) - } - - // Verify we can't assign more than available - remainingCapacity := capacity.TotalCapacity - capacity.PredictedUsage - canAssign := vsm.CanAssignVolumeToServer(remainingCapacity+1, serverID) - if canAssign { - t.Error("Should not be able to assign volume larger than remaining capacity") - } - - t.Logf("✅ Concurrent tasks test passed - accurate cumulative capacity tracking") -} - -func TestVolumeStateManager_ECShardTracking(t *testing.T) { - vsm := NewVolumeStateManager(nil) - - volumeID := uint32(1) - - // Create EC shard state - shardState := &ECShardState{ - VolumeID: volumeID, - CurrentShards: map[int]*ShardInfo{ - 0: {ShardID: 0, Server: "server1", Status: ShardStatusExists}, - 1: {ShardID: 1, Server: "server1", Status: ShardStatusExists}, - 2: {ShardID: 2, Server: "server2", Status: ShardStatusExists}, - }, - InProgressTasks: []*TaskImpact{}, - PlannedShards: make(map[int]*PlannedShard), - PredictedShards: make(map[int]*ShardInfo), - } - vsm.ecShards[volumeID] = shardState - - // Register task that will create more shards - impact := &TaskImpact{ - TaskID: "ec_expand_task", - VolumeID: volumeID, - TaskType: types.TaskTypeErasureCoding, - ShardChanges: map[int]*ShardChange{ - 3: {ShardID: 3, WillBeCreated: true, TargetServer: "server3"}, - 4: {ShardID: 4, WillBeCreated: true, TargetServer: "server3"}, - }, - } - - vsm.RegisterTaskImpact(impact.TaskID, impact) - - // Verify shard state tracking - retrievedState := vsm.GetECShardState(volumeID) - if retrievedState == nil { - t.Fatal("Expected EC shard state, got nil") - } - - if len(retrievedState.InProgressTasks) != 1 { - t.Errorf("Expected 1 in-progress task for shards, got %d", len(retrievedState.InProgressTasks)) - } - - // Verify current shards are still tracked - if 
len(retrievedState.CurrentShards) != 3 { - t.Errorf("Expected 3 current shards, got %d", len(retrievedState.CurrentShards)) - } - - t.Logf("✅ EC shard tracking test passed") -} - -// Benchmark tests for performance -func BenchmarkVolumeStateManager_RegisterTaskImpact(b *testing.B) { - vsm := NewVolumeStateManager(nil) - - // Setup test data - for i := 0; i < 1000; i++ { - volumeID := uint32(i + 1) - vsm.volumes[volumeID] = &VolumeState{ - VolumeID: volumeID, - CurrentState: &VolumeInfo{ID: volumeID}, - InProgressTasks: []*TaskImpact{}, - } - } - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - impact := &TaskImpact{ - TaskID: generateTaskID(), - VolumeID: uint32((i % 1000) + 1), - TaskType: types.TaskTypeVacuum, - CapacityDelta: map[string]int64{"server1": 1024 * 1024}, - } - - vsm.RegisterTaskImpact(impact.TaskID, impact) - vsm.UnregisterTaskImpact(impact.TaskID) - } -} - -func BenchmarkVolumeStateManager_CanAssignVolumeToServer(b *testing.B) { - vsm := NewVolumeStateManager(nil) - - // Setup capacity data - for i := 0; i < 100; i++ { - serverID := fmt.Sprintf("server_%d", i) - vsm.capacityCache[serverID] = &CapacityInfo{ - Server: serverID, - TotalCapacity: 100 * 1024 * 1024 * 1024, - UsedCapacity: 50 * 1024 * 1024 * 1024, - PredictedUsage: 50 * 1024 * 1024 * 1024, - } - } - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - serverID := fmt.Sprintf("server_%d", i%100) - vsm.CanAssignVolumeToServer(1024*1024*1024, serverID) - } -} diff --git a/weed/admin/task/volume_state_tracker.go b/weed/admin/task/volume_state_tracker.go deleted file mode 100644 index a51436b83..000000000 --- a/weed/admin/task/volume_state_tracker.go +++ /dev/null @@ -1,226 +0,0 @@ -package task - -import ( - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// VolumeStateTracker tracks volume state changes and reconciles 
with master -type VolumeStateTracker struct { - masterClient *wdclient.MasterClient - reconcileInterval time.Duration - reservedVolumes map[uint32]*VolumeReservation - pendingChanges map[uint32]*VolumeChange - mutex sync.RWMutex -} - -// NewVolumeStateTracker creates a new volume state tracker -func NewVolumeStateTracker(masterClient *wdclient.MasterClient, reconcileInterval time.Duration) *VolumeStateTracker { - return &VolumeStateTracker{ - masterClient: masterClient, - reconcileInterval: reconcileInterval, - reservedVolumes: make(map[uint32]*VolumeReservation), - pendingChanges: make(map[uint32]*VolumeChange), - } -} - -// ReserveVolume reserves a volume for a task -func (vst *VolumeStateTracker) ReserveVolume(volumeID uint32, taskID string) { - vst.mutex.Lock() - defer vst.mutex.Unlock() - - reservation := &VolumeReservation{ - VolumeID: volumeID, - TaskID: taskID, - ReservedAt: time.Now(), - ExpectedEnd: time.Now().Add(15 * time.Minute), // Default 15 min estimate - CapacityDelta: 0, // Will be updated based on task type - } - - vst.reservedVolumes[volumeID] = reservation - glog.V(2).Infof("Reserved volume %d for task %s", volumeID, taskID) -} - -// ReleaseVolume releases a volume reservation -func (vst *VolumeStateTracker) ReleaseVolume(volumeID uint32, taskID string) { - vst.mutex.Lock() - defer vst.mutex.Unlock() - - if reservation, exists := vst.reservedVolumes[volumeID]; exists { - if reservation.TaskID == taskID { - delete(vst.reservedVolumes, volumeID) - glog.V(2).Infof("Released volume %d reservation for task %s", volumeID, taskID) - } - } -} - -// RecordVolumeChange records a completed volume change -func (vst *VolumeStateTracker) RecordVolumeChange(volumeID uint32, taskType types.TaskType, taskID string) { - vst.mutex.Lock() - defer vst.mutex.Unlock() - - changeType := ChangeTypeECEncoding - if taskType == types.TaskTypeVacuum { - changeType = ChangeTypeVacuumComplete - } - - change := &VolumeChange{ - VolumeID: volumeID, - ChangeType: changeType, - 
TaskID: taskID, - CompletedAt: time.Now(), - ReportedToMaster: false, - } - - vst.pendingChanges[volumeID] = change - glog.V(1).Infof("Recorded volume change for volume %d: %s", volumeID, changeType) -} - -// GetPendingChange returns pending change for a volume -func (vst *VolumeStateTracker) GetPendingChange(volumeID uint32) *VolumeChange { - vst.mutex.RLock() - defer vst.mutex.RUnlock() - - return vst.pendingChanges[volumeID] -} - -// GetVolumeReservation returns reservation for a volume -func (vst *VolumeStateTracker) GetVolumeReservation(volumeID uint32) *VolumeReservation { - vst.mutex.RLock() - defer vst.mutex.RUnlock() - - return vst.reservedVolumes[volumeID] -} - -// IsVolumeReserved checks if a volume is reserved -func (vst *VolumeStateTracker) IsVolumeReserved(volumeID uint32) bool { - vst.mutex.RLock() - defer vst.mutex.RUnlock() - - _, exists := vst.reservedVolumes[volumeID] - return exists -} - -// ReconcileWithMaster reconciles volume states with master server -func (vst *VolumeStateTracker) ReconcileWithMaster() { - vst.mutex.Lock() - defer vst.mutex.Unlock() - - // Report pending changes to master - for volumeID, change := range vst.pendingChanges { - if vst.reportChangeToMaster(change) { - change.ReportedToMaster = true - delete(vst.pendingChanges, volumeID) - glog.V(1).Infof("Successfully reported volume change for volume %d to master", volumeID) - } - } - - // Clean up expired reservations - vst.cleanupExpiredReservations() -} - -// reportChangeToMaster reports a volume change to the master server -func (vst *VolumeStateTracker) reportChangeToMaster(change *VolumeChange) bool { - // Note: In a real implementation, this would make actual API calls to master - // For now, we'll simulate the reporting - - switch change.ChangeType { - case ChangeTypeECEncoding: - return vst.reportECCompletion(change) - case ChangeTypeVacuumComplete: - return vst.reportVacuumCompletion(change) - } - - return false -} - -// reportECCompletion reports EC completion to 
master -func (vst *VolumeStateTracker) reportECCompletion(change *VolumeChange) bool { - // This would typically trigger the master to: - // 1. Update volume state to reflect EC encoding - // 2. Update capacity calculations - // 3. Redistribute volume assignments - - glog.V(2).Infof("Reporting EC completion for volume %d", change.VolumeID) - - // Simulate master API call - err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { - // In real implementation, there would be a specific API call here - // For now, we simulate success - return nil - }) - - return err == nil -} - -// reportVacuumCompletion reports vacuum completion to master -func (vst *VolumeStateTracker) reportVacuumCompletion(change *VolumeChange) bool { - // This would typically trigger the master to: - // 1. Update volume statistics - // 2. Update capacity calculations - // 3. Mark volume as recently vacuumed - - glog.V(2).Infof("Reporting vacuum completion for volume %d", change.VolumeID) - - // Simulate master API call - err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { - // In real implementation, there would be a specific API call here - // For now, we simulate success - return nil - }) - - return err == nil -} - -// cleanupExpiredReservations removes expired volume reservations -func (vst *VolumeStateTracker) cleanupExpiredReservations() { - now := time.Now() - - for volumeID, reservation := range vst.reservedVolumes { - if now.After(reservation.ExpectedEnd) { - delete(vst.reservedVolumes, volumeID) - glog.Warningf("Cleaned up expired reservation for volume %d (task %s)", volumeID, reservation.TaskID) - } - } -} - -// GetAdjustedCapacity returns adjusted capacity considering in-progress tasks -func (vst *VolumeStateTracker) GetAdjustedCapacity(volumeID uint32, baseCapacity int64) int64 { - vst.mutex.RLock() - defer vst.mutex.RUnlock() - - // Check for pending changes - if change := vst.pendingChanges[volumeID]; change != nil { - 
return change.NewCapacity - } - - // Check for in-progress reservations - if reservation := vst.reservedVolumes[volumeID]; reservation != nil { - return baseCapacity + reservation.CapacityDelta - } - - return baseCapacity -} - -// GetStats returns statistics about volume state tracking -func (vst *VolumeStateTracker) GetStats() map[string]interface{} { - vst.mutex.RLock() - defer vst.mutex.RUnlock() - - stats := make(map[string]interface{}) - stats["reserved_volumes"] = len(vst.reservedVolumes) - stats["pending_changes"] = len(vst.pendingChanges) - - changeTypeCounts := make(map[ChangeType]int) - for _, change := range vst.pendingChanges { - changeTypeCounts[change.ChangeType]++ - } - stats["pending_by_type"] = changeTypeCounts - - return stats -} diff --git a/weed/admin/task/worker_communication.go b/weed/admin/task/worker_communication.go deleted file mode 100644 index 01484311f..000000000 --- a/weed/admin/task/worker_communication.go +++ /dev/null @@ -1,488 +0,0 @@ -package task - -import ( - "context" - "fmt" - "io" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/pb" - "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" - "github.com/seaweedfs/seaweedfs/weed/worker/types" - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" -) - -// WorkerConnection manages the gRPC connection to a single worker -type WorkerConnection struct { - workerID string - address string - conn *grpc.ClientConn - client worker_pb.WorkerServiceClient - stream worker_pb.WorkerService_WorkerStreamClient - lastSeen time.Time - mutex sync.RWMutex - adminServer *AdminServer - stopCh chan struct{} - active bool -} - -// WorkerCommunicationManager manages all worker connections -type WorkerCommunicationManager struct { - adminServer *AdminServer - connections map[string]*WorkerConnection - mutex sync.RWMutex - stopCh chan struct{} -} - -// NewWorkerCommunicationManager creates a new worker communication manager -func 
NewWorkerCommunicationManager(adminServer *AdminServer) *WorkerCommunicationManager { - return &WorkerCommunicationManager{ - adminServer: adminServer, - connections: make(map[string]*WorkerConnection), - stopCh: make(chan struct{}), - } -} - -// Start starts the worker communication manager -func (wcm *WorkerCommunicationManager) Start() { - glog.Infof("Starting worker communication manager") - - go wcm.connectionMonitorLoop() -} - -// Stop stops the worker communication manager -func (wcm *WorkerCommunicationManager) Stop() { - glog.Infof("Stopping worker communication manager") - - close(wcm.stopCh) - - wcm.mutex.Lock() - defer wcm.mutex.Unlock() - - for _, conn := range wcm.connections { - conn.Close() - } -} - -// EstablishWorkerConnection establishes a connection to a worker -func (wcm *WorkerCommunicationManager) EstablishWorkerConnection(workerID, address string) error { - wcm.mutex.Lock() - defer wcm.mutex.Unlock() - - // Check if already connected - if conn, exists := wcm.connections[workerID]; exists { - if conn.active { - return nil // Already connected - } - conn.Close() // Close inactive connection - } - - // Create new connection - conn, err := NewWorkerConnection(workerID, address, wcm.adminServer) - if err != nil { - return fmt.Errorf("failed to create worker connection: %v", err) - } - - wcm.connections[workerID] = conn - - // Start connection - go conn.Start() - - glog.Infof("Established connection to worker %s at %s", workerID, address) - return nil -} - -// SendTaskAssignment sends a task assignment to a worker -func (wcm *WorkerCommunicationManager) SendTaskAssignment(workerID string, task *Task) error { - wcm.mutex.RLock() - conn, exists := wcm.connections[workerID] - wcm.mutex.RUnlock() - - if !exists || !conn.active { - return fmt.Errorf("no active connection to worker %s", workerID) - } - - return conn.SendTaskAssignment(task) -} - -// CancelTask sends a task cancellation to a worker -func (wcm *WorkerCommunicationManager) 
CancelTask(workerID, taskID string, reason string) error { - wcm.mutex.RLock() - conn, exists := wcm.connections[workerID] - wcm.mutex.RUnlock() - - if !exists || !conn.active { - return fmt.Errorf("no active connection to worker %s", workerID) - } - - return conn.CancelTask(taskID, reason) -} - -// GetActiveConnections returns the list of active worker connections -func (wcm *WorkerCommunicationManager) GetActiveConnections() []string { - wcm.mutex.RLock() - defer wcm.mutex.RUnlock() - - var active []string - for workerID, conn := range wcm.connections { - if conn.active { - active = append(active, workerID) - } - } - - return active -} - -// connectionMonitorLoop monitors worker connections and cleans up inactive ones -func (wcm *WorkerCommunicationManager) connectionMonitorLoop() { - ticker := time.NewTicker(30 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - wcm.cleanupInactiveConnections() - case <-wcm.stopCh: - return - } - } -} - -// cleanupInactiveConnections removes inactive worker connections -func (wcm *WorkerCommunicationManager) cleanupInactiveConnections() { - wcm.mutex.Lock() - defer wcm.mutex.Unlock() - - now := time.Now() - timeout := 2 * time.Minute - - for workerID, conn := range wcm.connections { - if !conn.active || now.Sub(conn.lastSeen) > timeout { - glog.Infof("Cleaning up inactive connection to worker %s", workerID) - conn.Close() - delete(wcm.connections, workerID) - - // Mark worker as inactive in registry - wcm.adminServer.workerRegistry.MarkWorkerInactive(workerID) - } - } -} - -// NewWorkerConnection creates a new worker connection -func NewWorkerConnection(workerID, address string, adminServer *AdminServer) (*WorkerConnection, error) { - // Convert address to gRPC address - grpcAddress := pb.ServerToGrpcAddress(address) - conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - return nil, fmt.Errorf("failed to connect to worker at %s: %v", 
address, err) - } - - client := worker_pb.NewWorkerServiceClient(conn) - - return &WorkerConnection{ - workerID: workerID, - address: address, - conn: conn, - client: client, - lastSeen: time.Now(), - adminServer: adminServer, - stopCh: make(chan struct{}), - active: false, - }, nil -} - -// Start starts the worker connection and message handling -func (wc *WorkerConnection) Start() { - defer wc.Close() - - ctx := context.Background() - stream, err := wc.client.WorkerStream(ctx) - if err != nil { - glog.Errorf("Failed to create worker stream for %s: %v", wc.workerID, err) - return - } - - wc.stream = stream - wc.active = true - - glog.Infof("Worker connection %s started", wc.workerID) - - // Start message handling goroutines - go wc.receiveMessages() - - // Keep connection alive until stopped - <-wc.stopCh -} - -// Close closes the worker connection -func (wc *WorkerConnection) Close() { - wc.mutex.Lock() - defer wc.mutex.Unlock() - - if !wc.active { - return - } - - wc.active = false - close(wc.stopCh) - - if wc.stream != nil { - wc.stream.CloseSend() - } - - if wc.conn != nil { - wc.conn.Close() - } - - glog.Infof("Worker connection %s closed", wc.workerID) -} - -// receiveMessages handles incoming messages from the worker -func (wc *WorkerConnection) receiveMessages() { - for { - select { - case <-wc.stopCh: - return - default: - } - - msg, err := wc.stream.Recv() - if err != nil { - if err == io.EOF { - glog.Infof("Worker %s closed connection", wc.workerID) - } else { - glog.Errorf("Error receiving from worker %s: %v", wc.workerID, err) - } - wc.Close() - return - } - - wc.updateLastSeen() - // Convert AdminMessage to WorkerMessage for processing - if workerMsg := convertToWorkerMessage(msg); workerMsg != nil { - wc.handleMessage(workerMsg) - } - } -} - -// updateLastSeen updates the last seen timestamp -func (wc *WorkerConnection) updateLastSeen() { - wc.mutex.Lock() - defer wc.mutex.Unlock() - wc.lastSeen = time.Now() -} - -// handleMessage processes a 
message from the worker -func (wc *WorkerConnection) handleMessage(msg *worker_pb.WorkerMessage) { - switch message := msg.Message.(type) { - case *worker_pb.WorkerMessage_Registration: - registration := message.Registration - worker := &Worker{ - ID: registration.WorkerId, - Address: registration.Address, - Capabilities: registration.Capabilities, - } - wc.workerID = worker.ID - // UpdateWorkerStatus stub - if wc.adminServer.workerRegistry != nil { - // wc.adminServer.workerRegistry.UpdateWorkerStatus(worker) // Commented out - method doesn't exist - } - glog.Infof("Worker %s registered", worker.ID) - - case *worker_pb.WorkerMessage_Heartbeat: - glog.V(3).Infof("Heartbeat from worker %s", wc.workerID) - - case *worker_pb.WorkerMessage_TaskRequest: - glog.V(2).Infof("Task request from worker %s", wc.workerID) - // AssignTaskToWorker stub - // task := wc.adminServer.AssignTaskToWorker(wc.workerID) // Commented out - method doesn't exist - - case *worker_pb.WorkerMessage_TaskUpdate: - update := message.TaskUpdate - // UpdateTaskProgress stub - fix signature - wc.adminServer.UpdateTaskProgress(update.TaskId, float64(update.Progress)) - - case *worker_pb.WorkerMessage_TaskComplete: - complete := message.TaskComplete - // CompleteTask stub - fix signature - wc.adminServer.CompleteTask(complete.TaskId, complete.Success, complete.ErrorMessage) - - case *worker_pb.WorkerMessage_Shutdown: - glog.Infof("Worker %s shutting down", wc.workerID) - wc.Close() - } -} - -// SendTaskAssignment sends a task assignment to the worker -func (wc *WorkerConnection) SendTaskAssignment(task *Task) error { - return wc.sendTaskAssignment(task) -} - -// sendTaskAssignment sends a task assignment message -func (wc *WorkerConnection) sendTaskAssignment(task *types.Task) error { - // Fix type assertions for parameters - server, _ := task.Parameters["server"].(string) - collection, _ := task.Parameters["collection"].(string) - - // Convert map[string]interface{} to map[string]string - parameters 
:= make(map[string]string) - for k, v := range task.Parameters { - if str, ok := v.(string); ok { - parameters[k] = str - } else { - parameters[k] = fmt.Sprintf("%v", v) - } - } - - // Add master_client parameter for tasks that need it (especially EC tasks) - if wc.adminServer.masterClient != nil { - if currentMaster := wc.adminServer.masterClient.GetMaster(context.Background()); currentMaster != "" { - parameters["master_client"] = string(currentMaster) - glog.V(2).Infof("Added master_client parameter to task %s: %s", task.ID, currentMaster) - } else { - glog.Warningf("No master address available for task %s", task.ID) - } - } - - assignment := &worker_pb.TaskAssignment{ - TaskId: task.ID, - TaskType: string(task.Type), - Priority: int32(task.Priority), - CreatedTime: task.CreatedAt.Unix(), - Params: &worker_pb.TaskParams{ - VolumeId: task.VolumeID, - Server: server, - Collection: collection, - Parameters: parameters, - }, - Metadata: map[string]string{ - "assigned_at": time.Now().Format(time.RFC3339), - }, - } - - response := &worker_pb.AdminMessage{ - AdminId: wc.adminServer.ID, - Timestamp: time.Now().Unix(), - Message: &worker_pb.AdminMessage_TaskAssignment{ - TaskAssignment: assignment, - }, - } - - return wc.sendMessage(response) -} - -// CancelTask sends a task cancellation to the worker -func (wc *WorkerConnection) CancelTask(taskID, reason string) error { - cancellation := &worker_pb.TaskCancellation{ - TaskId: taskID, - Reason: reason, - Force: false, - } - - response := &worker_pb.AdminMessage{ - AdminId: wc.adminServer.ID, - Timestamp: time.Now().Unix(), - Message: &worker_pb.AdminMessage_TaskCancellation{ - TaskCancellation: cancellation, - }, - } - - return wc.sendMessage(response) -} - -// sendMessage sends a message to the worker -func (wc *WorkerConnection) sendMessage(msg *worker_pb.AdminMessage) error { - wc.mutex.RLock() - defer wc.mutex.RUnlock() - - if !wc.active || wc.stream == nil { - return fmt.Errorf("connection to worker %s is not 
active", wc.workerID) - } - - // The stream expects WorkerMessage from client (admin) to server (worker) - // Convert AdminMessage to appropriate WorkerMessage format - workerMsg := &worker_pb.WorkerMessage{ - WorkerId: wc.workerID, - Timestamp: msg.Timestamp, - } - - // Convert AdminMessage content to WorkerMessage based on message type - switch adminMsg := msg.Message.(type) { - case *worker_pb.AdminMessage_TaskAssignment: - // Task assignments should be sent as notifications to worker - // Since there's no direct equivalent, we'll create a generic message - // In a full implementation, this would need proper message type mapping - _ = adminMsg // Use the variable to avoid unused warning - workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{ - Heartbeat: &worker_pb.WorkerHeartbeat{ - WorkerId: wc.workerID, - Status: "task_assigned", - }, - } - case *worker_pb.AdminMessage_TaskCancellation: - // Similar conversion for task cancellation - _ = adminMsg // Use the variable to avoid unused warning - workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{ - Heartbeat: &worker_pb.WorkerHeartbeat{ - WorkerId: wc.workerID, - Status: "task_cancelled", - }, - } - default: - // For other message types, send a generic heartbeat - workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{ - Heartbeat: &worker_pb.WorkerHeartbeat{ - WorkerId: wc.workerID, - Status: "admin_message", - }, - } - } - - return wc.stream.Send(workerMsg) -} - -// Helper functions - -// convertCapabilities converts string capabilities to TaskType slice -func convertCapabilities(capabilities []string) []TaskType { - var result []TaskType - for _, cap := range capabilities { - result = append(result, TaskType(cap)) - } - return result -} - -// WorkerStatus represents worker status information -type WorkerStatus struct { - Status string - CurrentLoad int - MaxConcurrent int - CurrentTasks []string - TasksCompleted int - TasksFailed int - UptimeSeconds int64 - LastSeen time.Time -} - -// TaskProgress 
represents task progress information -type TaskProgress struct { - Progress float64 - Message string -} - -// TaskResult represents task completion result -type TaskResult struct { - Success bool - Error string - Message string -} - -// convertToWorkerMessage converts AdminMessage to WorkerMessage (stub implementation) -func convertToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage { - // This is a stub - in real implementation would need proper conversion - // For now, return nil to avoid processing - return nil -} diff --git a/weed/admin/task/worker_registry.go b/weed/admin/task/worker_registry.go deleted file mode 100644 index b535b522c..000000000 --- a/weed/admin/task/worker_registry.go +++ /dev/null @@ -1,348 +0,0 @@ -package task - -import ( - "fmt" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/glog" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// WorkerRegistry manages worker registration and tracking -type WorkerRegistry struct { - workers map[string]*types.Worker - capabilities map[types.TaskType][]*types.Worker - metrics map[string]*WorkerMetrics - issues map[string][]WorkerIssue - mutex sync.RWMutex -} - -// WorkerIssue represents an issue with a worker -type WorkerIssue struct { - Type string - Timestamp time.Time - Details string -} - -// NewWorkerRegistry creates a new worker registry -func NewWorkerRegistry() *WorkerRegistry { - return &WorkerRegistry{ - workers: make(map[string]*types.Worker), - capabilities: make(map[types.TaskType][]*types.Worker), - metrics: make(map[string]*WorkerMetrics), - issues: make(map[string][]WorkerIssue), - } -} - -// RegisterWorker registers a new worker -func (wr *WorkerRegistry) RegisterWorker(worker *types.Worker) error { - wr.mutex.Lock() - defer wr.mutex.Unlock() - - if _, exists := wr.workers[worker.ID]; exists { - return fmt.Errorf("worker %s already registered", worker.ID) - } - - // Register worker - wr.workers[worker.ID] = worker - - // Initialize metrics - 
wr.metrics[worker.ID] = &WorkerMetrics{ - TasksCompleted: 0, - TasksFailed: 0, - AverageTaskTime: 0, - LastTaskTime: time.Time{}, - SuccessRate: 1.0, - } - - // Update capabilities mapping - wr.updateCapabilitiesMapping() - - glog.Infof("Registered worker %s with capabilities: %v", worker.ID, worker.Capabilities) - return nil -} - -// UnregisterWorker removes a worker -func (wr *WorkerRegistry) UnregisterWorker(workerID string) error { - wr.mutex.Lock() - defer wr.mutex.Unlock() - - if _, exists := wr.workers[workerID]; !exists { - return fmt.Errorf("worker %s not found", workerID) - } - - delete(wr.workers, workerID) - delete(wr.metrics, workerID) - delete(wr.issues, workerID) - - // Update capabilities mapping - wr.updateCapabilitiesMapping() - - glog.Infof("Unregistered worker %s", workerID) - return nil -} - -// GetWorker returns a worker by ID -func (wr *WorkerRegistry) GetWorker(workerID string) (*types.Worker, bool) { - wr.mutex.RLock() - defer wr.mutex.RUnlock() - - worker, exists := wr.workers[workerID] - return worker, exists -} - -// GetAvailableWorkers returns workers that are available for new tasks -func (wr *WorkerRegistry) GetAvailableWorkers() []*types.Worker { - wr.mutex.RLock() - defer wr.mutex.RUnlock() - - var available []*types.Worker - for _, worker := range wr.workers { - if worker.Status == "active" && worker.CurrentLoad < worker.MaxConcurrent { - available = append(available, worker) - } - } - return available -} - -// GetWorkersByCapability returns workers that support a specific capability -func (wr *WorkerRegistry) GetWorkersByCapability(taskType types.TaskType) []*types.Worker { - wr.mutex.RLock() - defer wr.mutex.RUnlock() - - return wr.capabilities[taskType] -} - -// UpdateWorkerHeartbeat updates worker heartbeat and status -func (wr *WorkerRegistry) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { - wr.mutex.Lock() - defer wr.mutex.Unlock() - - worker, exists := wr.workers[workerID] - if !exists { - return 
fmt.Errorf("worker %s not found", workerID) - } - - // Update worker status - worker.LastHeartbeat = time.Now() - worker.Status = status.Status - worker.CurrentLoad = status.CurrentLoad - - glog.V(3).Infof("Updated heartbeat for worker %s, status: %s, load: %d/%d", - workerID, status.Status, status.CurrentLoad, worker.MaxConcurrent) - return nil -} - -// GetTimedOutWorkers returns workers that haven't sent heartbeat within timeout -func (wr *WorkerRegistry) GetTimedOutWorkers(timeout time.Duration) []string { - wr.mutex.RLock() - defer wr.mutex.RUnlock() - - var timedOut []string - cutoff := time.Now().Add(-timeout) - - for workerID, worker := range wr.workers { - if worker.LastHeartbeat.Before(cutoff) { - timedOut = append(timedOut, workerID) - } - } - - return timedOut -} - -// MarkWorkerInactive marks a worker as inactive -func (wr *WorkerRegistry) MarkWorkerInactive(workerID string) { - wr.mutex.Lock() - defer wr.mutex.Unlock() - - if worker, exists := wr.workers[workerID]; exists { - worker.Status = "inactive" - worker.CurrentLoad = 0 - } -} - -// RecordWorkerIssue records an issue with a worker -func (wr *WorkerRegistry) RecordWorkerIssue(workerID string, issueType string) { - wr.mutex.Lock() - defer wr.mutex.Unlock() - - issue := WorkerIssue{ - Type: issueType, - Timestamp: time.Now(), - Details: fmt.Sprintf("Worker issue: %s", issueType), - } - - wr.issues[workerID] = append(wr.issues[workerID], issue) - - // Limit issue history to last 10 issues - if len(wr.issues[workerID]) > 10 { - wr.issues[workerID] = wr.issues[workerID][1:] - } - - glog.Warningf("Recorded issue for worker %s: %s", workerID, issueType) -} - -// GetWorkerMetrics returns metrics for a worker -func (wr *WorkerRegistry) GetWorkerMetrics(workerID string) *WorkerMetrics { - wr.mutex.RLock() - defer wr.mutex.RUnlock() - - return wr.metrics[workerID] -} - -// UpdateWorkerMetrics updates performance metrics for a worker -func (wr *WorkerRegistry) UpdateWorkerMetrics(workerID string, 
taskDuration time.Duration, success bool) { - wr.mutex.Lock() - defer wr.mutex.Unlock() - - metrics, exists := wr.metrics[workerID] - if !exists { - return - } - - if success { - metrics.TasksCompleted++ - } else { - metrics.TasksFailed++ - } - - metrics.LastTaskTime = time.Now() - - // Update average task time - totalTasks := metrics.TasksCompleted + metrics.TasksFailed - if totalTasks > 0 { - oldAvg := metrics.AverageTaskTime - metrics.AverageTaskTime = time.Duration( - (float64(oldAvg)*float64(totalTasks-1) + float64(taskDuration)) / float64(totalTasks), - ) - } - - // Update success rate - if totalTasks > 0 { - metrics.SuccessRate = float64(metrics.TasksCompleted) / float64(totalTasks) - } -} - -// GetBestWorkerForTask returns the best worker for a specific task type -func (wr *WorkerRegistry) GetBestWorkerForTask(taskType types.TaskType) *types.Worker { - wr.mutex.RLock() - defer wr.mutex.RUnlock() - - candidates := wr.capabilities[taskType] - if len(candidates) == 0 { - return nil - } - - var bestWorker *types.Worker - bestScore := -1.0 - - for _, worker := range candidates { - // Skip if not available - if worker.Status != "active" || worker.CurrentLoad >= worker.MaxConcurrent { - continue - } - - // Calculate score based on multiple factors - score := wr.calculateWorkerScore(worker) - if bestWorker == nil || score > bestScore { - bestWorker = worker - bestScore = score - } - } - - return bestWorker -} - -// calculateWorkerScore calculates a score for worker selection -func (wr *WorkerRegistry) calculateWorkerScore(worker *types.Worker) float64 { - metrics := wr.metrics[worker.ID] - if metrics == nil { - return 0.5 // Default score for new workers - } - - // Factors for scoring: - // 1. Available capacity (0.0 to 1.0) - capacityScore := float64(worker.MaxConcurrent-worker.CurrentLoad) / float64(worker.MaxConcurrent) - - // 2. Success rate (0.0 to 1.0) - successScore := metrics.SuccessRate - - // 3. 
Recent activity bonus (workers that completed tasks recently get slight bonus) - activityScore := 0.0 - if !metrics.LastTaskTime.IsZero() && time.Since(metrics.LastTaskTime) < time.Hour { - activityScore = 0.1 - } - - // 4. Issue penalty (workers with recent issues get penalty) - issuePenalty := 0.0 - if issues, exists := wr.issues[worker.ID]; exists { - recentIssues := 0 - cutoff := time.Now().Add(-time.Hour) - for _, issue := range issues { - if issue.Timestamp.After(cutoff) { - recentIssues++ - } - } - issuePenalty = float64(recentIssues) * 0.1 - } - - // Weighted average - score := (capacityScore*0.4 + successScore*0.4 + activityScore) - issuePenalty - - if score < 0 { - score = 0 - } - if score > 1 { - score = 1 - } - - return score -} - -// updateCapabilitiesMapping rebuilds the capabilities mapping -func (wr *WorkerRegistry) updateCapabilitiesMapping() { - // Clear existing mapping - for taskType := range wr.capabilities { - wr.capabilities[taskType] = nil - } - - // Rebuild mapping - for _, worker := range wr.workers { - for _, capability := range worker.Capabilities { - wr.capabilities[capability] = append(wr.capabilities[capability], worker) - } - } -} - -// GetRegistryStats returns statistics about the registry -func (wr *WorkerRegistry) GetRegistryStats() map[string]interface{} { - wr.mutex.RLock() - defer wr.mutex.RUnlock() - - stats := make(map[string]interface{}) - stats["total_workers"] = len(wr.workers) - - statusCounts := make(map[string]int) - capabilityCounts := make(map[types.TaskType]int) - totalLoad := 0 - maxCapacity := 0 - - for _, worker := range wr.workers { - statusCounts[worker.Status]++ - totalLoad += worker.CurrentLoad - maxCapacity += worker.MaxConcurrent - - for _, capability := range worker.Capabilities { - capabilityCounts[capability]++ - } - } - - stats["by_status"] = statusCounts - stats["by_capability"] = capabilityCounts - stats["total_load"] = totalLoad - stats["max_capacity"] = maxCapacity - stats["utilization"] = 
float64(totalLoad) / float64(maxCapacity) * 100.0 - - return stats -} diff --git a/weed/admin/task_minimal/admin_server.go b/weed/admin/task_minimal/admin_server.go deleted file mode 100644 index f2645f5bc..000000000 --- a/weed/admin/task_minimal/admin_server.go +++ /dev/null @@ -1,324 +0,0 @@ -package task - -import ( - "fmt" - "sync" - "time" - - "github.com/seaweedfs/seaweedfs/weed/wdclient" - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// AdminConfig contains configuration for the admin server -type AdminConfig struct { - ScanInterval time.Duration - WorkerTimeout time.Duration - TaskTimeout time.Duration - MaxRetries int - ReconcileInterval time.Duration - EnableFailureRecovery bool - MaxConcurrentTasks int -} - -// AdminServer manages workers and tasks -type AdminServer struct { - config *AdminConfig - masterClient *wdclient.MasterClient - running bool - mutex sync.RWMutex - - // Task management - tasks map[string]*types.Task - taskQueue []*types.Task - activeTasks map[string]*types.Task - - // Worker management - workers map[string]*types.Worker - workerStatus map[string]*types.WorkerStatus - - // Task history - taskHistory []TaskHistoryEntry -} - -// TaskHistoryEntry represents a single task history entry -type TaskHistoryEntry struct { - TaskID string - TaskType types.TaskType - VolumeID uint32 - WorkerID string - Status types.TaskStatus - StartedAt time.Time - CompletedAt time.Time - Duration time.Duration - ErrorMessage string -} - -// SystemStats represents system statistics -type SystemStats struct { - ActiveTasks int - QueuedTasks int - ActiveWorkers int - TotalTasks int -} - -// NewAdminServer creates a new admin server -func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer { - return &AdminServer{ - config: config, - masterClient: masterClient, - tasks: make(map[string]*types.Task), - taskQueue: make([]*types.Task, 0), - activeTasks: make(map[string]*types.Task), - workers: 
make(map[string]*types.Worker), - workerStatus: make(map[string]*types.WorkerStatus), - taskHistory: make([]TaskHistoryEntry, 0), - } -} - -// Start starts the admin server -func (as *AdminServer) Start() error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if as.running { - return fmt.Errorf("admin server is already running") - } - - as.running = true - return nil -} - -// Stop stops the admin server -func (as *AdminServer) Stop() error { - as.mutex.Lock() - defer as.mutex.Unlock() - - as.running = false - return nil -} - -// RegisterWorker registers a new worker -func (as *AdminServer) RegisterWorker(worker *types.Worker) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - as.workers[worker.ID] = worker - as.workerStatus[worker.ID] = &types.WorkerStatus{ - Status: "active", - CurrentLoad: 0, - } - - return nil -} - -// QueueTask adds a new task to the task queue -func (as *AdminServer) QueueTask(task *types.Task) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return fmt.Errorf("admin server is not running") - } - - if task.ID == "" { - task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano()) - } - - task.Status = types.TaskStatusPending - task.CreatedAt = time.Now() - - as.tasks[task.ID] = task - as.taskQueue = append(as.taskQueue, task) - - return nil -} - -// RequestTask requests a task for a worker -func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { - as.mutex.Lock() - defer as.mutex.Unlock() - - if !as.running { - return nil, fmt.Errorf("admin server is not running") - } - - // Check if worker exists - worker, exists := as.workers[workerID] - if !exists { - return nil, fmt.Errorf("worker %s not found", workerID) - } - - // Check if worker has capacity - status := as.workerStatus[workerID] - if status.CurrentLoad >= worker.MaxConcurrent { - return nil, nil // No capacity - } - - // Find a suitable 
task - for i, task := range as.taskQueue { - if task.Status != types.TaskStatusPending { - continue - } - - // Check if worker can handle this task type - canHandle := false - for _, capability := range capabilities { - if task.Type == capability { - canHandle = true - break - } - } - - if canHandle { - // Assign task to worker - task.Status = types.TaskStatusInProgress - task.WorkerID = workerID - now := time.Now() - task.StartedAt = &now - - // Move task from queue to active tasks - as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...) - as.activeTasks[task.ID] = task - - // Update worker load - status.CurrentLoad++ - - return task, nil - } - } - - return nil, nil // No suitable task found -} - -// UpdateTaskProgress updates task progress -func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - task, exists := as.tasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - task.Progress = progress - - return nil -} - -// CompleteTask marks a task as completed -func (as *AdminServer) CompleteTask(taskID string, success bool, errorMessage string) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - task, exists := as.tasks[taskID] - if !exists { - return fmt.Errorf("task %s not found", taskID) - } - - // Update task status - if success { - task.Status = types.TaskStatusCompleted - } else { - task.Status = types.TaskStatusFailed - task.Error = errorMessage - } - - now := time.Now() - task.CompletedAt = &now - - // Remove from active tasks - delete(as.activeTasks, taskID) - - // Update worker load - if task.WorkerID != "" { - if status, exists := as.workerStatus[task.WorkerID]; exists { - status.CurrentLoad-- - } - } - - // Add to history - var duration time.Duration - if task.StartedAt != nil { - duration = now.Sub(*task.StartedAt) - } - - entry := TaskHistoryEntry{ - TaskID: task.ID, - TaskType: task.Type, - VolumeID: task.VolumeID, - WorkerID: 
task.WorkerID, - Status: task.Status, - StartedAt: *task.StartedAt, - CompletedAt: now, - Duration: duration, - ErrorMessage: errorMessage, - } - as.taskHistory = append(as.taskHistory, entry) - - return nil -} - -// UpdateWorkerHeartbeat updates worker heartbeat -func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { - as.mutex.Lock() - defer as.mutex.Unlock() - - worker, exists := as.workers[workerID] - if !exists { - return fmt.Errorf("worker %s not found", workerID) - } - - worker.LastHeartbeat = time.Now() - as.workerStatus[workerID] = status - - return nil -} - -// GetSystemStats returns system statistics -func (as *AdminServer) GetSystemStats() *SystemStats { - as.mutex.RLock() - defer as.mutex.RUnlock() - - activeWorkers := 0 - for _, status := range as.workerStatus { - if status.Status == "active" { - activeWorkers++ - } - } - - return &SystemStats{ - ActiveTasks: len(as.activeTasks), - QueuedTasks: len(as.taskQueue), - ActiveWorkers: activeWorkers, - TotalTasks: len(as.tasks), - } -} - -// GetQueuedTaskCount returns the number of queued tasks -func (as *AdminServer) GetQueuedTaskCount() int { - as.mutex.RLock() - defer as.mutex.RUnlock() - return len(as.taskQueue) -} - -// GetActiveTaskCount returns the number of active tasks -func (as *AdminServer) GetActiveTaskCount() int { - as.mutex.RLock() - defer as.mutex.RUnlock() - return len(as.activeTasks) -} - -// GetTaskHistory returns task history -func (as *AdminServer) GetTaskHistory() []TaskHistoryEntry { - as.mutex.RLock() - defer as.mutex.RUnlock() - - // Return a copy of the history - history := make([]TaskHistoryEntry, len(as.taskHistory)) - copy(history, as.taskHistory) - return history -} diff --git a/weed/admin/task_minimal/go.mod b/weed/admin/task_minimal/go.mod deleted file mode 100644 index 3af5d3746..000000000 --- a/weed/admin/task_minimal/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module task_minimal - -go 1.24.1 diff --git 
a/weed/admin/task_minimal/integration_test.go b/weed/admin/task_minimal/integration_test.go deleted file mode 100644 index a7859e569..000000000 --- a/weed/admin/task_minimal/integration_test.go +++ /dev/null @@ -1,233 +0,0 @@ -package task - -import ( - "fmt" - "testing" - "time" - - "github.com/seaweedfs/seaweedfs/weed/worker/types" -) - -// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies -func TestSimpleIntegration(t *testing.T) { - t.Logf("Starting simple integration test") - - // Step 1: Create a minimal admin server configuration - config := &AdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - // Step 2: Create admin server with nil master client (for testing) - adminServer := NewAdminServer(config, nil) - - // Step 3: Start admin server - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Step 4: Test worker registration - t.Logf("Testing worker registration") - - worker := &types.Worker{ - ID: "test-worker-1", - Address: "localhost:9001", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 2, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - t.Logf("Successfully registered worker %s", worker.ID) - - // Step 5: Test task queueing - t.Logf("Testing task queueing") - - task := &types.Task{ - ID: "test-task-1", - Type: types.TaskTypeVacuum, - VolumeID: 1001, - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - }, - CreatedAt: time.Now(), - } - - err = adminServer.QueueTask(task) - if err 
!= nil { - t.Fatalf("Failed to queue task: %v", err) - } - t.Logf("Successfully queued task %s", task.ID) - - // Step 6: Test task request by worker - t.Logf("Testing task request") - - assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) - if err != nil { - t.Fatalf("Failed to request task: %v", err) - } - - if assignedTask != nil { - t.Logf("Successfully assigned task %s to worker", assignedTask.ID) - - // Step 7: Test task progress updates - t.Logf("Testing task progress updates") - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) - if err != nil { - t.Errorf("Failed to update task progress: %v", err) - } - - err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) - if err != nil { - t.Errorf("Failed to update task progress: %v", err) - } - - // Step 8: Test task completion - t.Logf("Testing task completion") - - err = adminServer.CompleteTask(assignedTask.ID, true, "") - if err != nil { - t.Errorf("Failed to complete task: %v", err) - } - t.Logf("Successfully completed task %s", assignedTask.ID) - } else { - t.Logf("No task was assigned (queue might be empty)") - } - - // Step 9: Test basic metrics - t.Logf("Testing basic metrics") - - stats := adminServer.GetSystemStats() - if stats != nil { - t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d", - stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers) - } - - queuedCount := adminServer.GetQueuedTaskCount() - activeCount := adminServer.GetActiveTaskCount() - t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) - - // Step 10: Test task history - history := adminServer.GetTaskHistory() - t.Logf("Task history contains %d entries", len(history)) - - t.Logf("Simple integration test completed successfully") -} - -// TestWorkerHeartbeat tests worker heartbeat functionality -func TestWorkerHeartbeat(t *testing.T) { - t.Logf("Testing worker heartbeat") - - config := &AdminConfig{ - ScanInterval: 10 * 
time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Register a worker - worker := &types.Worker{ - ID: "heartbeat-worker", - Address: "localhost:9002", - Capabilities: []types.TaskType{types.TaskTypeVacuum}, - MaxConcurrent: 1, - Status: "active", - CurrentLoad: 0, - LastHeartbeat: time.Now(), - } - - err = adminServer.RegisterWorker(worker) - if err != nil { - t.Fatalf("Failed to register worker: %v", err) - } - - // Test heartbeat update - status := &types.WorkerStatus{ - Status: "active", - CurrentLoad: 0, - } - - err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) - if err != nil { - t.Errorf("Failed to update worker heartbeat: %v", err) - } - - t.Logf("Worker heartbeat test completed successfully") -} - -// TestTaskQueueOperations tests task queue operations -func TestTaskQueueOperations(t *testing.T) { - t.Logf("Testing task queue operations") - - config := &AdminConfig{ - ScanInterval: 10 * time.Second, - WorkerTimeout: 30 * time.Second, - TaskTimeout: 2 * time.Hour, - MaxRetries: 3, - ReconcileInterval: 5 * time.Minute, - EnableFailureRecovery: true, - MaxConcurrentTasks: 5, - } - - adminServer := NewAdminServer(config, nil) - err := adminServer.Start() - if err != nil { - t.Fatalf("Failed to start admin server: %v", err) - } - defer adminServer.Stop() - - // Test queuing multiple tasks - for i := 0; i < 3; i++ { - task := &types.Task{ - ID: fmt.Sprintf("queue-test-task-%d", i), - Type: types.TaskTypeVacuum, - VolumeID: uint32(2000 + i), - Server: "localhost:8080", - Status: types.TaskStatusPending, - Priority: types.TaskPriorityNormal, - Parameters: map[string]interface{}{ - "garbage_threshold": "0.3", - }, - CreatedAt: 
time.Now(), - } - - err = adminServer.QueueTask(task) - if err != nil { - t.Errorf("Failed to queue task %d: %v", i, err) - } - } - - // Check queue size - queuedCount := adminServer.GetQueuedTaskCount() - if queuedCount != 3 { - t.Errorf("Expected 3 queued tasks, got %d", queuedCount) - } - - t.Logf("Task queue operations test completed successfully") -}