Browse Source

remove

remove
worker-execute-ec-tasks
chrislu 4 months ago
parent
commit
9cd3e613a5
  1. 699
      weed/admin/task/admin_server.go
  2. 524
      weed/admin/task/admin_server_test.go
  3. 90
      weed/admin/task/compilation_stubs.go
  4. 309
      weed/admin/task/ec_integration_test.go
  5. 324
      weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go
  6. 3
      weed/admin/task/ec_test_standalone/go.mod
  7. 324
      weed/admin/task/ec_test_standalone/minimal_admin_server.go
  8. 434
      weed/admin/task/ec_test_standalone/minimal_integration_test.go
  9. 488
      weed/admin/task/ec_worker_test.go
  10. 346
      weed/admin/task/example_usage.go
  11. 123
      weed/admin/task/failure_handler.go
  12. 486
      weed/admin/task/master_sync.go
  13. 324
      weed/admin/task/minimal_admin_server.go
  14. 434
      weed/admin/task/minimal_integration_test.go
  15. 197
      weed/admin/task/operational_integration_test.go
  16. 233
      weed/admin/task/simple_integration_test.go
  17. 604
      weed/admin/task/simulation.go
  18. 695
      weed/admin/task/simulation/comprehensive_simulation.go
  19. 444
      weed/admin/task/simulation/comprehensive_simulation_test.go
  20. 294
      weed/admin/task/simulation/simulation_runner.go
  21. 237
      weed/admin/task/simulation/system_demo_test.go
  22. 509
      weed/admin/task/task_assignment_test.go
  23. 168
      weed/admin/task/task_detectors.go
  24. 161
      weed/admin/task/task_discovery.go
  25. 257
      weed/admin/task/task_scheduler.go
  26. 68
      weed/admin/task/task_types.go
  27. 640
      weed/admin/task/volume_state_manager.go
  28. 440
      weed/admin/task/volume_state_manager_test.go
  29. 226
      weed/admin/task/volume_state_tracker.go
  30. 488
      weed/admin/task/worker_communication.go
  31. 348
      weed/admin/task/worker_registry.go
  32. 324
      weed/admin/task_minimal/admin_server.go
  33. 3
      weed/admin/task_minimal/go.mod
  34. 233
      weed/admin/task_minimal/integration_test.go

699
weed/admin/task/admin_server.go

@ -1,699 +0,0 @@
package task
import (
"fmt"
"math/rand"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TaskHistory represents task execution history
type TaskHistory struct {
entries []TaskHistoryEntry
mutex sync.RWMutex
}
// TaskHistoryEntry represents a single task history entry
type TaskHistoryEntry struct {
TaskID string
TaskType types.TaskType
VolumeID uint32
WorkerID string
Status types.TaskStatus
StartedAt time.Time
CompletedAt time.Time
Duration time.Duration
ErrorMessage string
}
// NewTaskHistory creates a new task history
func NewTaskHistory() *TaskHistory {
return &TaskHistory{
entries: make([]TaskHistoryEntry, 0),
}
}
// AddEntry adds a new task history entry
func (th *TaskHistory) AddEntry(entry TaskHistoryEntry) {
th.mutex.Lock()
defer th.mutex.Unlock()
th.entries = append(th.entries, entry)
// Keep only the last 1000 entries
if len(th.entries) > 1000 {
th.entries = th.entries[len(th.entries)-1000:]
}
}
// GetRecentEntries returns the most recent entries
func (th *TaskHistory) GetRecentEntries(limit int) []*TaskHistoryEntry {
th.mutex.RLock()
defer th.mutex.RUnlock()
start := len(th.entries) - limit
if start < 0 {
start = 0
}
result := make([]*TaskHistoryEntry, len(th.entries)-start)
for i, entry := range th.entries[start:] {
entryCopy := entry
result[i] = &entryCopy
}
return result
}
// AdminServer manages task distribution and worker coordination
type AdminServer struct {
ID string
Config *AdminConfig
masterClient *wdclient.MasterClient
volumeStateManager *VolumeStateManager
workerRegistry *WorkerRegistry
taskQueue *PriorityTaskQueue
taskScheduler *TaskScheduler
taskHistory *TaskHistory
failureHandler *FailureHandler
masterSync *MasterSynchronizer
workerComm *WorkerCommunicationManager
running bool
stopCh chan struct{}
mutex sync.RWMutex
// Task tracking
activeTasks map[string]*InProgressTask
tasksMutex sync.RWMutex
}
// AdminConfig holds configuration for the admin server
type AdminConfig struct {
ScanInterval time.Duration
WorkerTimeout time.Duration
TaskTimeout time.Duration
MaxRetries int
ReconcileInterval time.Duration
EnableFailureRecovery bool
MaxConcurrentTasks int
}
// NewAdminServer creates a new admin server instance
func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer {
adminServer := &AdminServer{
ID: generateAdminServerID(),
Config: config,
masterClient: masterClient,
volumeStateManager: NewVolumeStateManager(masterClient),
workerRegistry: NewWorkerRegistry(),
taskQueue: NewPriorityTaskQueue(),
taskHistory: NewTaskHistory(),
failureHandler: NewFailureHandler(config),
activeTasks: make(map[string]*InProgressTask),
stopCh: make(chan struct{}),
}
// Initialize components that depend on admin server
adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue)
adminServer.masterSync = NewMasterSynchronizer(masterClient, adminServer.volumeStateManager, adminServer)
adminServer.workerComm = NewWorkerCommunicationManager(adminServer)
glog.Infof("Created admin server %s", adminServer.ID)
return adminServer
}
// Start starts the admin server
func (as *AdminServer) Start() error {
as.mutex.Lock()
defer as.mutex.Unlock()
if as.running {
return nil
}
glog.Infof("Starting admin server %s", as.ID)
// Start components
as.masterSync.Start()
as.workerComm.Start()
// Start background loops
go as.taskAssignmentLoop()
go as.taskMonitoringLoop()
go as.reconciliationLoop()
go as.metricsLoop()
as.running = true
glog.Infof("Admin server %s started successfully", as.ID)
return nil
}
// Stop stops the admin server
func (as *AdminServer) Stop() {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return
}
glog.Infof("Stopping admin server %s", as.ID)
close(as.stopCh)
// Stop components
as.masterSync.Stop()
as.workerComm.Stop()
as.running = false
glog.Infof("Admin server %s stopped", as.ID)
}
// RegisterWorker registers a new worker
func (as *AdminServer) RegisterWorker(worker *types.Worker) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
return as.workerRegistry.RegisterWorker(worker)
}
// UnregisterWorker removes a worker
func (as *AdminServer) UnregisterWorker(workerID string) error {
as.mutex.Lock()
defer as.mutex.Unlock()
// Reschedule any tasks assigned to this worker
for taskID, task := range as.activeTasks {
if task.WorkerID == workerID {
glog.Warningf("Rescheduling task %s due to worker %s unregistration", taskID, workerID)
as.ReassignTask(taskID, "worker unregistration")
delete(as.activeTasks, taskID)
}
}
return as.workerRegistry.UnregisterWorker(workerID)
}
// UpdateWorkerHeartbeat updates worker heartbeat
func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
as.mutex.Lock()
defer as.mutex.Unlock()
return as.workerRegistry.UpdateWorkerHeartbeat(workerID, status)
}
// RequestTask handles task requests from workers
func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
as.mutex.RLock()
defer as.mutex.RUnlock()
if !as.running {
return nil, fmt.Errorf("admin server is not running")
}
worker, exists := as.workerRegistry.GetWorker(workerID)
if !exists {
return nil, fmt.Errorf("worker %s not registered", workerID)
}
// Check if worker has capacity
if worker.CurrentLoad >= worker.MaxConcurrent {
return nil, nil // No capacity
}
// Get next task for this worker
task := as.taskScheduler.GetNextTask(workerID, capabilities)
if task == nil {
return nil, nil // No suitable tasks
}
// Check if volume can be assigned (using comprehensive state management)
if !as.canAssignTask(task, workerID) {
return nil, nil // Cannot assign due to capacity or state constraints
}
// Assign task to worker
inProgressTask := &InProgressTask{
Task: task,
WorkerID: workerID,
StartedAt: time.Now(),
LastUpdate: time.Now(),
Progress: 0.0,
EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)),
}
as.activeTasks[task.ID] = inProgressTask
worker.CurrentLoad++
// Register task impact with state manager
impact := as.createTaskImpact(task)
as.volumeStateManager.RegisterTaskImpact(task.ID, impact)
inProgressTask.VolumeReserved = true
glog.V(1).Infof("Assigned task %s to worker %s", task.ID, workerID)
return task, nil
}
// UpdateTaskProgress updates task progress
func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error {
as.tasksMutex.Lock()
defer as.tasksMutex.Unlock()
inProgressTask, exists := as.activeTasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
inProgressTask.Progress = progress
inProgressTask.LastUpdate = time.Now()
glog.V(2).Infof("Task %s progress: %.1f%%", taskID, progress)
return nil
}
// CompleteTask marks a task as completed
func (as *AdminServer) CompleteTask(taskID string, success bool, errorMsg string) error {
as.tasksMutex.Lock()
defer as.tasksMutex.Unlock()
inProgressTask, exists := as.activeTasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
// Remove from active tasks
delete(as.activeTasks, taskID)
// Update worker load
if worker, exists := as.workerRegistry.GetWorker(inProgressTask.WorkerID); exists {
worker.CurrentLoad--
}
// Unregister task impact
as.volumeStateManager.UnregisterTaskImpact(taskID)
// Record in task history
status := types.TaskStatusCompleted
if !success {
status = types.TaskStatusFailed
}
as.taskHistory.AddEntry(TaskHistoryEntry{
TaskID: taskID,
TaskType: inProgressTask.Task.Type,
VolumeID: inProgressTask.Task.VolumeID,
WorkerID: inProgressTask.WorkerID,
Status: status,
StartedAt: inProgressTask.StartedAt,
CompletedAt: time.Now(),
Duration: time.Since(inProgressTask.StartedAt),
ErrorMessage: errorMsg,
})
glog.Infof("Task %s completed: success=%v", taskID, success)
return nil
}
// QueueTask adds a new task to the task queue
func (as *AdminServer) QueueTask(task *types.Task) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
// Validate the task
if task == nil {
return fmt.Errorf("task cannot be nil")
}
if task.ID == "" {
task.ID = generateTaskID()
}
// Set creation timestamp if not set
if task.CreatedAt.IsZero() {
task.CreatedAt = time.Now()
}
// Check if task for this volume is already queued or in progress
if as.isVolumeAlreadyQueued(task.VolumeID, task.Type) {
glog.V(2).Infof("Task for volume %d already queued or in progress, skipping", task.VolumeID)
return nil
}
// Add to task queue
as.taskQueue.Push(task)
glog.V(1).Infof("Queued task %s (%s) for volume %d with priority %v",
task.ID, task.Type, task.VolumeID, task.Priority)
return nil
}
// Helper methods
// canAssignTask checks if a task can be assigned to a worker
func (as *AdminServer) canAssignTask(task *types.Task, workerID string) bool {
worker, exists := as.workerRegistry.GetWorker(workerID)
if !exists {
return false
}
// Check worker capacity
if worker.CurrentLoad >= worker.MaxConcurrent {
return false
}
// Check if worker has required capability
hasCapability := false
for _, cap := range worker.Capabilities {
if cap == task.Type {
hasCapability = true
break
}
}
if !hasCapability {
return false
}
return true
}
// createTaskImpact creates a TaskImpact for the given task
func (as *AdminServer) createTaskImpact(task *types.Task) *TaskImpact {
impact := &TaskImpact{
TaskID: task.ID,
VolumeID: task.VolumeID,
TaskType: task.Type,
StartedAt: time.Now(),
EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)),
CapacityDelta: make(map[string]int64),
VolumeChanges: &VolumeChanges{},
ShardChanges: make(map[int]*ShardChange),
}
// Set task-specific impacts
switch task.Type {
case types.TaskTypeErasureCoding:
impact.VolumeChanges.WillBecomeReadOnly = true
impact.EstimatedEnd = time.Now().Add(2 * time.Hour) // EC takes longer
// EC encoding requires temporary space
if server, ok := task.Parameters["server"]; ok {
if serverStr, ok := server.(string); ok {
volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID)
if volumeState != nil && volumeState.CurrentState != nil {
// Estimate 2x volume size needed temporarily
impact.CapacityDelta[serverStr] = int64(volumeState.CurrentState.Size * 2)
}
}
}
case types.TaskTypeVacuum:
// Vacuum reduces volume size
if server, ok := task.Parameters["server"]; ok {
if serverStr, ok := server.(string); ok {
// Estimate 30% space reclamation
volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID)
if volumeState != nil && volumeState.CurrentState != nil {
impact.CapacityDelta[serverStr] = -int64(float64(volumeState.CurrentState.Size) * 0.3)
}
}
}
}
return impact
}
// estimateTaskDuration estimates how long a task will take
func (as *AdminServer) estimateTaskDuration(task *types.Task) time.Duration {
switch task.Type {
case types.TaskTypeErasureCoding:
return 2 * time.Hour
case types.TaskTypeVacuum:
return 30 * time.Minute
default:
return 1 * time.Hour
}
}
// isVolumeAlreadyQueued checks if a task for the volume is already queued or in progress
func (as *AdminServer) isVolumeAlreadyQueued(volumeID uint32, taskType types.TaskType) bool {
// Check active tasks
as.tasksMutex.RLock()
for _, inProgressTask := range as.activeTasks {
if inProgressTask.Task.VolumeID == volumeID && inProgressTask.Task.Type == taskType {
as.tasksMutex.RUnlock()
return true
}
}
as.tasksMutex.RUnlock()
// Check queued tasks
return as.taskQueue.HasTask(volumeID, taskType)
}
// Background loops
// taskAssignmentLoop handles automatic task assignment to workers
func (as *AdminServer) taskAssignmentLoop() {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
as.processTaskAssignments()
case <-as.stopCh:
return
}
}
}
// processTaskAssignments attempts to assign pending tasks to available workers
func (as *AdminServer) processTaskAssignments() {
// Get available workers
workers := as.workerRegistry.GetAvailableWorkers()
if len(workers) == 0 {
return // No workers available
}
// For each worker with available capacity, try to assign a task
for _, worker := range workers {
if worker.CurrentLoad < worker.MaxConcurrent {
task := as.taskScheduler.GetNextTask(worker.ID, worker.Capabilities)
if task != nil {
// Try to assign task directly
_, err := as.RequestTask(worker.ID, worker.Capabilities)
if err != nil {
glog.Errorf("Failed to assign task to worker %s: %v", worker.ID, err)
}
}
}
}
}
// taskMonitoringLoop monitors task progress and handles timeouts
func (as *AdminServer) taskMonitoringLoop() {
ticker := time.NewTicker(1 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ticker.C:
as.checkTaskTimeouts()
case <-as.stopCh:
return
}
}
}
// checkTaskTimeouts checks for tasks that have timed out
func (as *AdminServer) checkTaskTimeouts() {
as.tasksMutex.Lock()
defer as.tasksMutex.Unlock()
now := time.Now()
timeout := 2 * time.Hour // Default task timeout
for taskID, inProgressTask := range as.activeTasks {
if now.Sub(inProgressTask.LastUpdate) > timeout {
glog.Warningf("Task %s timed out (last update: %v)", taskID, inProgressTask.LastUpdate)
as.ReassignTask(taskID, "task timeout")
}
}
}
// ReassignTask reassigns a task due to worker failure
func (as *AdminServer) ReassignTask(taskID, reason string) {
as.tasksMutex.Lock()
defer as.tasksMutex.Unlock()
inProgressTask, exists := as.activeTasks[taskID]
if !exists {
return
}
glog.Infof("Reassigning task %s due to: %s", taskID, reason)
// Reset task status
inProgressTask.Task.Status = types.TaskStatusPending
// Unregister current task impact
as.volumeStateManager.UnregisterTaskImpact(taskID)
// Remove from active tasks
delete(as.activeTasks, taskID)
// Put back in queue with higher priority
inProgressTask.Task.Priority = types.TaskPriorityHigh
as.taskQueue.Push(inProgressTask.Task)
}
// reconciliationLoop periodically reconciles state with master
func (as *AdminServer) reconciliationLoop() {
ticker := time.NewTicker(5 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ticker.C:
as.performReconciliation()
case <-as.stopCh:
return
}
}
}
// performReconciliation reconciles admin state with master
func (as *AdminServer) performReconciliation() {
glog.V(1).Infof("Starting state reconciliation")
// Sync with master
err := as.volumeStateManager.SyncWithMaster()
if err != nil {
glog.Errorf("Failed to sync with master during reconciliation: %v", err)
return
}
glog.V(1).Infof("State reconciliation completed")
}
// metricsLoop periodically logs metrics and statistics
func (as *AdminServer) metricsLoop() {
ticker := time.NewTicker(1 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ticker.C:
as.logMetrics()
case <-as.stopCh:
return
}
}
}
// logMetrics logs current system metrics
func (as *AdminServer) logMetrics() {
as.tasksMutex.RLock()
activeTasks := len(as.activeTasks)
as.tasksMutex.RUnlock()
queuedTasks := as.taskQueue.Size()
activeWorkers := len(as.workerRegistry.GetAvailableWorkers())
glog.V(1).Infof("Admin server metrics: active_tasks=%d, queued_tasks=%d, active_workers=%d",
activeTasks, queuedTasks, activeWorkers)
}
// GetAvailableWorkers returns workers capable of handling the specified task type
func (as *AdminServer) GetAvailableWorkers(taskType string) []*types.Worker {
workers := as.workerRegistry.GetAvailableWorkers()
var available []*types.Worker
for _, worker := range workers {
if worker.CurrentLoad < worker.MaxConcurrent {
for _, cap := range worker.Capabilities {
if string(cap) == taskType {
available = append(available, worker)
break
}
}
}
}
return available
}
// GetSystemStats returns current system statistics
func (as *AdminServer) GetSystemStats() *SystemStats {
as.tasksMutex.RLock()
activeTasks := len(as.activeTasks)
as.tasksMutex.RUnlock()
queuedTasks := as.taskQueue.Size()
activeWorkers := len(as.workerRegistry.GetAvailableWorkers())
return &SystemStats{
ActiveTasks: activeTasks,
QueuedTasks: queuedTasks,
ActiveWorkers: activeWorkers,
TotalWorkers: len(as.workerRegistry.GetAvailableWorkers()),
Uptime: time.Since(time.Now()), // This should be tracked properly
}
}
// Getter methods for testing
func (as *AdminServer) GetQueuedTaskCount() int {
return as.taskQueue.Size()
}
func (as *AdminServer) GetActiveTaskCount() int {
as.tasksMutex.RLock()
defer as.tasksMutex.RUnlock()
return len(as.activeTasks)
}
func (as *AdminServer) GetTaskHistory() []*TaskHistoryEntry {
return as.taskHistory.GetRecentEntries(100)
}
func (as *AdminServer) GetVolumeStateManager() *VolumeStateManager {
return as.volumeStateManager
}
func (as *AdminServer) GetWorkerRegistry() *WorkerRegistry {
return as.workerRegistry
}
// generateTaskID generates a unique task ID
func generateTaskID() string {
return fmt.Sprintf("task_%d_%d", time.Now().UnixNano(), rand.Intn(10000))
}
// generateAdminServerID generates a unique admin server ID
func generateAdminServerID() string {
return fmt.Sprintf("admin-%d", time.Now().Unix())
}
// SystemStats represents system statistics
type SystemStats struct {
ActiveTasks int
QueuedTasks int
ActiveWorkers int
TotalWorkers int
Uptime time.Duration
LastMasterSync time.Time
}

524
weed/admin/task/admin_server_test.go

@ -1,524 +0,0 @@
package task
import (
"fmt"
"testing"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
func TestAdminServer_TaskAssignmentWithStateManagement(t *testing.T) {
// Test the core functionality: accurate task assignment based on comprehensive state
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
// Initialize components
adminServer.workerRegistry = NewWorkerRegistry()
adminServer.taskQueue = NewPriorityTaskQueue()
adminServer.volumeStateManager = NewVolumeStateManager(nil)
adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue)
adminServer.running = true // Mark as running for test
// Setup test worker
worker := &types.Worker{
ID: "test_worker_1",
Address: "server1:8080",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
}
adminServer.workerRegistry.RegisterWorker(worker)
// Setup volume state
volumeID := uint32(1)
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{
ID: volumeID,
Size: 28 * 1024 * 1024 * 1024, // 28GB - good for EC
Server: "server1",
},
InProgressTasks: []*TaskImpact{},
PlannedChanges: []*PlannedOperation{},
}
// Setup server capacity
adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{
Server: "server1",
TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used
PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used
}
// Create EC task
task := &types.Task{
ID: "ec_task_1",
Type: types.TaskTypeErasureCoding,
VolumeID: volumeID,
Server: "server1",
Priority: types.TaskPriorityNormal,
}
// Test task assignment
adminServer.taskQueue.Push(task)
assignedTask, err := adminServer.RequestTask("test_worker_1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil {
t.Errorf("Task assignment failed: %v", err)
}
if assignedTask == nil {
t.Fatal("Expected task to be assigned, got nil")
}
if assignedTask.ID != "ec_task_1" {
t.Errorf("Expected task ec_task_1, got %s", assignedTask.ID)
}
// Verify state manager was updated
if len(adminServer.volumeStateManager.inProgressTasks) != 1 {
t.Errorf("Expected 1 in-progress task in state manager, got %d", len(adminServer.volumeStateManager.inProgressTasks))
}
// Verify capacity reservation
capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1")
if capacity.ReservedCapacity <= 0 {
t.Error("Expected capacity to be reserved for EC task")
}
t.Log("✅ Task assignment with state management test passed")
}
func TestAdminServer_CanAssignTask(t *testing.T) {
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
adminServer.volumeStateManager = NewVolumeStateManager(nil)
adminServer.inProgressTasks = make(map[string]*InProgressTask)
// Setup volume state
volumeID := uint32(1)
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{
ID: volumeID,
Size: 25 * 1024 * 1024 * 1024, // 25GB
},
}
// Setup server capacity - limited space
serverID := "server1"
adminServer.volumeStateManager.capacityCache[serverID] = &CapacityInfo{
Server: serverID,
TotalCapacity: 30 * 1024 * 1024 * 1024, // 30GB total
UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used
PredictedUsage: 20 * 1024 * 1024 * 1024, // 10GB available
}
worker := &types.Worker{
ID: "worker1",
Address: serverID,
}
tests := []struct {
name string
taskType types.TaskType
expected bool
desc string
}{
{
name: "EC task fits",
taskType: types.TaskTypeErasureCoding,
expected: false, // 25GB * 1.4 = 35GB needed, but only 10GB available
desc: "EC task should not fit due to insufficient capacity",
},
{
name: "Vacuum task fits",
taskType: types.TaskTypeVacuum,
expected: true,
desc: "Vacuum task should fit (no capacity increase)",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
task := &types.Task{
ID: "test_task",
Type: tt.taskType,
VolumeID: volumeID,
Server: serverID,
}
result := adminServer.canAssignTask(task, worker)
if result != tt.expected {
t.Errorf("canAssignTask() = %v, want %v. %s", result, tt.expected, tt.desc)
}
})
}
}
func TestAdminServer_CreateTaskImpact(t *testing.T) {
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
adminServer.volumeStateManager = NewVolumeStateManager(nil)
// Setup volume state for EC task
volumeID := uint32(1)
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{
ID: volumeID,
Size: 25 * 1024 * 1024 * 1024, // 25GB
},
}
task := &types.Task{
ID: "ec_task_1",
Type: types.TaskTypeErasureCoding,
VolumeID: volumeID,
Server: "server1",
}
impact := adminServer.createTaskImpact(task, "worker1")
// Verify impact structure
if impact.TaskID != "ec_task_1" {
t.Errorf("Expected task ID ec_task_1, got %s", impact.TaskID)
}
if impact.TaskType != types.TaskTypeErasureCoding {
t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, impact.TaskType)
}
// Verify volume changes for EC task
if !impact.VolumeChanges.WillBecomeReadOnly {
t.Error("Expected volume to become read-only after EC")
}
// Verify capacity delta (EC should require ~40% more space)
expectedCapacity := int64(float64(25*1024*1024*1024) * 1.4) // ~35GB
actualCapacity := impact.CapacityDelta["server1"]
if actualCapacity != expectedCapacity {
t.Errorf("Expected capacity delta %d, got %d", expectedCapacity, actualCapacity)
}
// Verify shard changes (should plan 14 shards)
if len(impact.ShardChanges) != 14 {
t.Errorf("Expected 14 shard changes, got %d", len(impact.ShardChanges))
}
for i := 0; i < 14; i++ {
shardChange := impact.ShardChanges[i]
if shardChange == nil {
t.Errorf("Missing shard change for shard %d", i)
continue
}
if !shardChange.WillBeCreated {
t.Errorf("Shard %d should be marked for creation", i)
}
}
t.Log("✅ Task impact creation test passed")
}
func TestAdminServer_TaskCompletionStateCleanup(t *testing.T) {
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
adminServer.workerRegistry = NewWorkerRegistry()
adminServer.volumeStateManager = NewVolumeStateManager(nil)
adminServer.inProgressTasks = make(map[string]*InProgressTask)
// Setup worker
worker := &types.Worker{
ID: "worker1",
CurrentLoad: 1, // Has 1 task assigned
}
adminServer.workerRegistry.RegisterWorker(worker)
// Setup in-progress task
task := &types.Task{
ID: "test_task_1",
Type: types.TaskTypeVacuum,
VolumeID: 1,
}
inProgressTask := &InProgressTask{
Task: task,
WorkerID: "worker1",
VolumeReserved: true,
}
adminServer.inProgressTasks["test_task_1"] = inProgressTask
// Register impact in state manager
impact := &TaskImpact{
TaskID: "test_task_1",
VolumeID: 1,
CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings
}
adminServer.volumeStateManager.RegisterTaskImpact("test_task_1", impact)
// Complete the task
err := adminServer.CompleteTask("test_task_1", true, "")
if err != nil {
t.Errorf("Task completion failed: %v", err)
}
// Verify cleanup
if len(adminServer.inProgressTasks) != 0 {
t.Errorf("Expected 0 in-progress tasks after completion, got %d", len(adminServer.inProgressTasks))
}
// Verify worker load updated
updatedWorker, _ := adminServer.workerRegistry.GetWorker("worker1")
if updatedWorker.CurrentLoad != 0 {
t.Errorf("Expected worker load 0 after task completion, got %d", updatedWorker.CurrentLoad)
}
// Verify state manager cleaned up
if len(adminServer.volumeStateManager.inProgressTasks) != 0 {
t.Errorf("Expected 0 tasks in state manager after completion, got %d", len(adminServer.volumeStateManager.inProgressTasks))
}
t.Log("✅ Task completion state cleanup test passed")
}
func TestAdminServer_PreventDuplicateTaskAssignment(t *testing.T) {
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
adminServer.workerRegistry = NewWorkerRegistry()
adminServer.taskQueue = NewPriorityTaskQueue()
adminServer.volumeStateManager = NewVolumeStateManager(nil)
adminServer.inProgressTasks = make(map[string]*InProgressTask)
// Setup worker
worker := &types.Worker{
ID: "worker1",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
}
adminServer.workerRegistry.RegisterWorker(worker)
// Setup volume state
volumeID := uint32(1)
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024},
}
// Create first task and assign it
task1 := &types.Task{
ID: "vacuum_task_1",
Type: types.TaskTypeVacuum,
VolumeID: volumeID,
Priority: types.TaskPriorityNormal,
}
adminServer.taskQueue.Push(task1)
assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum})
if err != nil || assignedTask1 == nil {
t.Fatal("First task assignment failed")
}
// Try to assign another vacuum task for the same volume
task2 := &types.Task{
ID: "vacuum_task_2",
Type: types.TaskTypeVacuum,
VolumeID: volumeID, // Same volume!
Priority: types.TaskPriorityNormal,
}
adminServer.taskQueue.Push(task2)
assignedTask2, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum})
// Should not assign duplicate task
if assignedTask2 != nil {
t.Error("Should not assign duplicate vacuum task for same volume")
}
t.Log("✅ Duplicate task prevention test passed")
}
func TestAdminServer_SystemStats(t *testing.T) {
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
adminServer.workerRegistry = NewWorkerRegistry()
adminServer.taskQueue = NewPriorityTaskQueue()
adminServer.volumeStateManager = NewVolumeStateManager(nil)
adminServer.inProgressTasks = make(map[string]*InProgressTask)
adminServer.running = true
// Add some test data
worker := &types.Worker{ID: "worker1", Status: "active"}
adminServer.workerRegistry.RegisterWorker(worker)
task := &types.Task{ID: "task1", Type: types.TaskTypeErasureCoding}
adminServer.taskQueue.Push(task)
inProgressTask := &InProgressTask{
Task: &types.Task{ID: "task2", Type: types.TaskTypeVacuum},
}
adminServer.inProgressTasks["task2"] = inProgressTask
// Get system stats
stats := adminServer.GetSystemStats()
// Verify stats structure
if !stats["running"].(bool) {
t.Error("Expected running to be true")
}
if stats["in_progress_tasks"].(int) != 1 {
t.Errorf("Expected 1 in-progress task, got %d", stats["in_progress_tasks"].(int))
}
if stats["queued_tasks"].(int) != 1 {
t.Errorf("Expected 1 queued task, got %d", stats["queued_tasks"].(int))
}
// Check task breakdown
tasksByType := stats["tasks_by_type"].(map[types.TaskType]int)
if tasksByType[types.TaskTypeVacuum] != 1 {
t.Errorf("Expected 1 vacuum task, got %d", tasksByType[types.TaskTypeVacuum])
}
t.Log("✅ System stats test passed")
}
func TestAdminServer_VolumeStateIntegration(t *testing.T) {
// Integration test: Verify admin server correctly uses volume state for decisions
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
adminServer.workerRegistry = NewWorkerRegistry()
adminServer.taskQueue = NewPriorityTaskQueue()
adminServer.volumeStateManager = NewVolumeStateManager(nil)
adminServer.inProgressTasks = make(map[string]*InProgressTask)
// Setup worker
worker := &types.Worker{
ID: "worker1",
Address: "server1",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
}
adminServer.workerRegistry.RegisterWorker(worker)
// Setup volume and capacity that would normally allow EC
volumeID := uint32(1)
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{
ID: volumeID,
Size: 25 * 1024 * 1024 * 1024, // 25GB
Server: "server1",
},
}
adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{
Server: "server1",
TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used
PredictedUsage: 20 * 1024 * 1024 * 1024, // 80GB available
}
// Create EC task
task := &types.Task{
ID: "ec_task_1",
Type: types.TaskTypeErasureCoding,
VolumeID: volumeID,
Server: "server1",
}
adminServer.taskQueue.Push(task)
// First assignment should work
assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil || assignedTask1 == nil {
t.Fatal("First EC task assignment should succeed")
}
// Verify capacity is now reserved
capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1")
if capacity.ReservedCapacity <= 0 {
t.Error("Expected capacity to be reserved for first EC task")
}
// Try to assign another large EC task - should fail due to capacity
volumeID2 := uint32(2)
adminServer.volumeStateManager.volumes[volumeID2] = &VolumeState{
VolumeID: volumeID2,
CurrentState: &VolumeInfo{
ID: volumeID2,
Size: 30 * 1024 * 1024 * 1024, // 30GB - would need 42GB for EC
Server: "server1",
},
}
task2 := &types.Task{
ID: "ec_task_2",
Type: types.TaskTypeErasureCoding,
VolumeID: volumeID2,
Server: "server1",
}
adminServer.taskQueue.Push(task2)
// Add another worker to test capacity-based rejection
worker2 := &types.Worker{
ID: "worker2",
Address: "server1",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
}
adminServer.workerRegistry.RegisterWorker(worker2)
assignedTask2, err := adminServer.RequestTask("worker2", []types.TaskType{types.TaskTypeErasureCoding})
// Should not assign due to insufficient capacity
if assignedTask2 != nil {
t.Error("Should not assign second EC task due to insufficient server capacity")
}
t.Log("✅ Volume state integration test passed")
t.Log("✅ Admin server correctly uses comprehensive state for task assignment decisions")
}
// Benchmark for task assignment performance
func BenchmarkAdminServer_RequestTask(b *testing.B) {
adminServer := NewAdminServer(DefaultAdminConfig(), nil)
adminServer.workerRegistry = NewWorkerRegistry()
adminServer.taskQueue = NewPriorityTaskQueue()
adminServer.volumeStateManager = NewVolumeStateManager(nil)
adminServer.inProgressTasks = make(map[string]*InProgressTask)
// Setup worker
worker := &types.Worker{
ID: "bench_worker",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 1000, // High limit for benchmark
Status: "active",
CurrentLoad: 0,
}
adminServer.workerRegistry.RegisterWorker(worker)
// Setup many tasks
for i := 0; i < 1000; i++ {
volumeID := uint32(i + 1)
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024},
}
task := &types.Task{
ID: fmt.Sprintf("task_%d", i),
Type: types.TaskTypeVacuum,
VolumeID: volumeID,
}
adminServer.taskQueue.Push(task)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
adminServer.RequestTask("bench_worker", []types.TaskType{types.TaskTypeVacuum})
}
}

90
weed/admin/task/compilation_stubs.go

@ -1,90 +0,0 @@
package task
import (
"time"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// Compilation stubs for missing types and functions
// Task is an alias for types.Task for backward compatibility
type Task = types.Task
// TaskType is an alias for types.TaskType for backward compatibility
type TaskType = types.TaskType
// TaskStatus is an alias for types.TaskStatus for backward compatibility
type TaskStatus = types.TaskStatus
// TaskPriority is an alias for types.TaskPriority for backward compatibility
type TaskPriority = types.TaskPriority
// Additional type aliases for compilation
var (
TaskStatusCompleted = types.TaskStatusCompleted
TaskStatusFailed = types.TaskStatusFailed
)
// Worker represents a worker node
type Worker struct {
ID string
Address string
Capabilities []string
Status string
LastSeen time.Time
}
// convertAdminToWorkerMessage converts AdminMessage to WorkerMessage for stream compatibility
func convertAdminToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage {
// This is a workaround for the stream type mismatch
// In a real implementation, this would need proper message conversion
return &worker_pb.WorkerMessage{
WorkerId: msg.AdminId,
Timestamp: msg.Timestamp,
// Add basic message conversion logic here
}
}
// WorkerRegistry stub methods
func (wr *WorkerRegistry) UpdateWorkerStatus(workerID string, status interface{}) {
// Stub implementation
}
// AdminServer stub methods
func (as *AdminServer) AssignTaskToWorker(workerID string) *Task {
// Stub implementation
return nil
}
// DefaultAdminConfig returns default admin server configuration
func DefaultAdminConfig() *AdminConfig {
return &AdminConfig{
ScanInterval: 30 * time.Minute,
WorkerTimeout: 5 * time.Minute,
TaskTimeout: 10 * time.Minute,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 10,
}
}
// SyncWithMasterData is a stub for the volume state manager
func (vsm *VolumeStateManager) SyncWithMasterData(volumes map[uint32]*VolumeInfo, ecShards map[uint32]map[int]*ShardInfo, serverCapacity map[string]*CapacityInfo) error {
// Stub implementation - would normally sync the data
return nil
}
// GetAllVolumeStates is a stub for the volume state manager
func (vsm *VolumeStateManager) GetAllVolumeStates() map[uint32]*VolumeState {
// Stub implementation - return empty map
return make(map[uint32]*VolumeState)
}
// DetectInconsistencies is a stub for the volume state manager
func (vsm *VolumeStateManager) DetectInconsistencies() []StateInconsistency {
// Stub implementation - return empty slice
return []StateInconsistency{}
}

309
weed/admin/task/ec_integration_test.go

@ -1,309 +0,0 @@
package task
import (
"os"
"path/filepath"
"testing"
"time"
ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestECIntegration tests the EC implementation with the admin server
func TestECIntegration(t *testing.T) {
t.Logf("Starting EC integration test")
// Step 1: Create admin server
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 30 * time.Minute, // EC takes longer
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 2, // Limit concurrency for EC tasks
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Step 2: Register an EC-capable worker
worker := &types.Worker{
ID: "ec-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register EC worker: %v", err)
}
t.Logf("Successfully registered EC worker %s", worker.ID)
// Step 3: Create an EC task
ecTask := &types.Task{
ID: "ec-task-1",
Type: types.TaskTypeErasureCoding,
VolumeID: 12345,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityHigh,
Parameters: map[string]interface{}{
"volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB
"master_client": "localhost:9333",
"work_dir": "/tmp/seaweedfs_ec_work",
"collection": "test",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(ecTask)
if err != nil {
t.Fatalf("Failed to queue EC task: %v", err)
}
t.Logf("Successfully queued EC task %s for volume %d", ecTask.ID, ecTask.VolumeID)
// Step 4: Worker requests the task
assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil {
t.Fatalf("Failed to request EC task: %v", err)
}
if assignedTask != nil {
t.Logf("EC worker got task: %s (%s) for volume %d",
assignedTask.ID, assignedTask.Type, assignedTask.VolumeID)
// Step 5: Simulate EC task execution phases
t.Logf("Simulating EC task execution phases")
// Phase 1: Copying volume data
err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0)
if err != nil {
t.Errorf("Failed to update progress (copying): %v", err)
}
t.Logf("Phase 1: Volume data copied to local disk")
// Phase 2: Marking read-only
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
if err != nil {
t.Errorf("Failed to update progress (read-only): %v", err)
}
t.Logf("Phase 2: Source volume marked as read-only")
// Phase 3: Local EC encoding
err = adminServer.UpdateTaskProgress(assignedTask.ID, 60.0)
if err != nil {
t.Errorf("Failed to update progress (encoding): %v", err)
}
t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)")
// Phase 4: Calculating optimal placement
err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0)
if err != nil {
t.Errorf("Failed to update progress (placement): %v", err)
}
t.Logf("Phase 4: Optimal shard placement calculated with affinity")
// Phase 5: Distributing shards
err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0)
if err != nil {
t.Errorf("Failed to update progress (distribution): %v", err)
}
t.Logf("Phase 5: Shards distributed across servers with rack diversity")
// Phase 6: Verification and cleanup
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress (completion): %v", err)
}
t.Logf("Phase 6: Verification and cleanup completed")
// Step 6: Complete the task
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete EC task: %v", err)
}
t.Logf("Successfully completed EC task %s", assignedTask.ID)
} else {
t.Logf("No EC task was assigned (expected in test environment)")
}
// Step 7: Verify task completion
stats := adminServer.GetSystemStats()
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d completed tasks", len(history))
if len(history) > 0 {
lastEntry := history[len(history)-1]
t.Logf("Last completed task: %s (%s) - Duration: %v",
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration)
if lastEntry.TaskType == types.TaskTypeErasureCoding {
t.Logf("EC task completed successfully")
}
}
t.Logf("EC integration test completed successfully")
}
// TestECTaskValidation tests the EC task validation
func TestECTaskValidation(t *testing.T) {
t.Logf("Testing EC task validation")
// Create a temporary work directory
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test")
err := os.MkdirAll(workDir, 0755)
if err != nil {
t.Fatalf("Failed to create work directory: %v", err)
}
defer os.RemoveAll(workDir)
// Create EC task
ecTask := ec_task.NewTaskWithParams(
"localhost:8080", // source server
12345, // volume ID
"localhost:9333", // master client
workDir, // work directory
)
// Test validation with valid parameters
validParams := types.TaskParams{
VolumeID: 12345,
Server: "localhost:8080",
Collection: "test",
Parameters: map[string]interface{}{
"volume_size": int64(32 * 1024 * 1024 * 1024),
},
}
err = ecTask.Validate(validParams)
if err != nil {
t.Errorf("Valid parameters should pass validation: %v", err)
}
// Test validation with invalid parameters
invalidParams := types.TaskParams{
VolumeID: 0, // Invalid volume ID
Server: "", // Empty server
}
err = ecTask.Validate(invalidParams)
if err == nil {
t.Errorf("Invalid parameters should fail validation")
}
// Test time estimation
estimatedTime := ecTask.EstimateTime(validParams)
t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime)
if estimatedTime < 20*time.Minute {
t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime)
}
t.Logf("EC task validation completed successfully")
}
// TestECFeatures tests specific EC features
func TestECFeatures(t *testing.T) {
t.Logf("Testing EC features")
// Create temporary work directory
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test")
err := os.MkdirAll(workDir, 0755)
if err != nil {
t.Fatalf("Failed to create work directory: %v", err)
}
defer os.RemoveAll(workDir)
ecTask := ec_task.NewTaskWithParams(
"localhost:8080",
54321,
"localhost:9333",
workDir,
)
// Test step tracking
t.Logf("Testing step tracking functionality")
currentStep := ecTask.GetCurrentStep()
t.Logf("Initial current step: %s", currentStep)
progress := ecTask.GetProgress()
t.Logf("Initial progress: %.1f%%", progress)
// Test parameter extraction
params := types.TaskParams{
VolumeID: 54321,
Server: "localhost:8080",
Collection: "features_test",
Parameters: map[string]interface{}{
"volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB
"data_shards": 10,
"parity_shards": 4,
"affinity_zones": []string{"zone-a", "zone-b", "zone-c"},
},
}
estimatedTime := ecTask.EstimateTime(params)
expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB
t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime)
if estimatedTime < expectedMinTime {
t.Errorf("Time estimate seems too low for 64GB volume")
}
t.Logf("EC features test completed successfully")
}
// TestECTaskComparison tests EC implementation features
func TestECTaskComparison(t *testing.T) {
t.Logf("Testing EC implementation features")
// EC task estimation
params := types.TaskParams{
VolumeID: 11111,
Server: "localhost:8080",
Parameters: map[string]interface{}{
"volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB
},
}
// Create task
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison")
defer os.RemoveAll(workDir)
ecTask := ec_task.NewTaskWithParams(
"localhost:8080",
22222,
"localhost:9333",
workDir,
)
estimatedTime := ecTask.EstimateTime(params)
t.Logf("EC task estimated time: %v", estimatedTime)
// Test feature capabilities
t.Logf("EC implementation features:")
t.Logf(" - Local volume data copying with progress tracking")
t.Logf(" - Local Reed-Solomon encoding (10+4 shards)")
t.Logf(" - Intelligent shard placement with rack awareness")
t.Logf(" - Load balancing across available servers")
t.Logf(" - Backup server selection for redundancy")
t.Logf(" - Detailed step-by-step progress tracking")
t.Logf(" - Comprehensive error handling and recovery")
t.Logf("EC implementation test completed successfully")
}

324
weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go

@ -1,324 +0,0 @@
package task
import (
"os"
"path/filepath"
"testing"
"time"
ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestEnhancedECIntegration tests the enhanced EC implementation with the admin server
func TestEnhancedECIntegration(t *testing.T) {
t.Logf("Starting enhanced EC integration test")
// Step 1: Create admin server
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 30 * time.Minute, // EC takes longer
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 2, // Limit concurrency for EC tasks
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Step 2: Register an EC-capable worker
worker := &types.Worker{
ID: "ec-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register EC worker: %v", err)
}
t.Logf("Successfully registered EC worker %s", worker.ID)
// Step 3: Create an EC task
ecTask := &types.Task{
ID: "enhanced-ec-task-1",
Type: types.TaskTypeErasureCoding,
VolumeID: 12345,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityHigh,
Parameters: map[string]interface{}{
"volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB
"master_client": "localhost:9333",
"work_dir": "/tmp/seaweedfs_ec_work",
"collection": "test",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(ecTask)
if err != nil {
t.Fatalf("Failed to queue EC task: %v", err)
}
t.Logf("Successfully queued enhanced EC task %s for volume %d", ecTask.ID, ecTask.VolumeID)
// Step 4: Worker requests the task
assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil {
t.Fatalf("Failed to request EC task: %v", err)
}
if assignedTask != nil {
t.Logf("EC worker got task: %s (%s) for volume %d",
assignedTask.ID, assignedTask.Type, assignedTask.VolumeID)
// Step 5: Simulate enhanced EC task execution progress
t.Logf("Simulating enhanced EC task execution phases")
// Phase 1: Copying volume data
err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0)
if err != nil {
t.Errorf("Failed to update progress (copying): %v", err)
}
t.Logf("Phase 1: Volume data copied to local disk")
// Phase 2: Marking read-only
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
if err != nil {
t.Errorf("Failed to update progress (read-only): %v", err)
}
t.Logf("Phase 2: Source volume marked as read-only")
// Phase 3: Local EC encoding
err = adminServer.UpdateTaskProgress(assignedTask.ID, 60.0)
if err != nil {
t.Errorf("Failed to update progress (encoding): %v", err)
}
t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)")
// Phase 4: Calculating optimal placement
err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0)
if err != nil {
t.Errorf("Failed to update progress (placement): %v", err)
}
t.Logf("Phase 4: Optimal shard placement calculated with affinity")
// Phase 5: Distributing shards
err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0)
if err != nil {
t.Errorf("Failed to update progress (distribution): %v", err)
}
t.Logf("Phase 5: Shards distributed across servers with rack diversity")
// Phase 6: Verification and cleanup
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress (completion): %v", err)
}
t.Logf("Phase 6: Verification and cleanup completed")
// Step 6: Complete the task
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete EC task: %v", err)
}
t.Logf("Successfully completed enhanced EC task %s", assignedTask.ID)
} else {
t.Logf("No EC task was assigned (expected in test environment)")
}
// Step 7: Verify task completion
stats := adminServer.GetSystemStats()
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d completed tasks", len(history))
if len(history) > 0 {
lastEntry := history[len(history)-1]
t.Logf("Last completed task: %s (%s) - Duration: %v",
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration)
if lastEntry.TaskType == types.TaskTypeErasureCoding {
t.Logf("Enhanced EC task completed successfully")
}
}
t.Logf("Enhanced EC integration test completed successfully")
}
// TestEnhancedECTaskValidation tests the enhanced EC task validation
func TestEnhancedECTaskValidation(t *testing.T) {
t.Logf("Testing enhanced EC task validation")
// Create a temporary work directory
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test")
err := os.MkdirAll(workDir, 0755)
if err != nil {
t.Fatalf("Failed to create work directory: %v", err)
}
defer os.RemoveAll(workDir)
// Create enhanced EC task
enhancedTask := ec_task.NewEnhancedECTask(
"localhost:8080", // source server
12345, // volume ID
"localhost:9333", // master client
workDir, // work directory
)
// Test validation with valid parameters
validParams := types.TaskParams{
VolumeID: 12345,
Server: "localhost:8080",
Collection: "test",
Parameters: map[string]interface{}{
"volume_size": int64(32 * 1024 * 1024 * 1024),
},
}
err = enhancedTask.Validate(validParams)
if err != nil {
t.Errorf("Valid parameters should pass validation: %v", err)
}
// Test validation with invalid parameters
invalidParams := types.TaskParams{
VolumeID: 0, // Invalid volume ID
Server: "", // Empty server
}
err = enhancedTask.Validate(invalidParams)
if err == nil {
t.Errorf("Invalid parameters should fail validation")
}
// Test time estimation
estimatedTime := enhancedTask.EstimateTime(validParams)
t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime)
if estimatedTime < 20*time.Minute {
t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime)
}
t.Logf("Enhanced EC task validation completed successfully")
}
// TestEnhancedECFeatures tests specific enhanced EC features
func TestEnhancedECFeatures(t *testing.T) {
t.Logf("Testing enhanced EC features")
// Create temporary work directory
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test")
err := os.MkdirAll(workDir, 0755)
if err != nil {
t.Fatalf("Failed to create work directory: %v", err)
}
defer os.RemoveAll(workDir)
enhancedTask := ec_task.NewEnhancedECTask(
"localhost:8080",
54321,
"localhost:9333",
workDir,
)
// Test step tracking
t.Logf("Testing step tracking functionality")
currentStep := enhancedTask.GetCurrentStep()
t.Logf("Initial current step: %s", currentStep)
progress := enhancedTask.GetProgress()
t.Logf("Initial progress: %.1f%%", progress)
// Test parameter extraction
params := types.TaskParams{
VolumeID: 54321,
Server: "localhost:8080",
Collection: "enhanced_test",
Parameters: map[string]interface{}{
"volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB
"data_shards": 10,
"parity_shards": 4,
"affinity_zones": []string{"zone-a", "zone-b", "zone-c"},
},
}
estimatedTime := enhancedTask.EstimateTime(params)
expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB
t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime)
if estimatedTime < expectedMinTime {
t.Errorf("Time estimate seems too low for 64GB volume")
}
t.Logf("Enhanced EC features test completed successfully")
}
// TestECTaskComparison compares basic vs enhanced EC implementations
func TestECTaskComparison(t *testing.T) {
t.Logf("Comparing basic vs enhanced EC implementations")
// Basic EC task estimation
basicParams := types.TaskParams{
VolumeID: 11111,
Server: "localhost:8080",
Parameters: map[string]interface{}{
"volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB
},
}
// Create basic task (existing implementation)
basicTask := ec_task.NewTask("localhost:8080", 11111)
basicTime := basicTask.EstimateTime(basicParams)
// Create enhanced task
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison")
defer os.RemoveAll(workDir)
enhancedTask := ec_task.NewEnhancedECTask(
"localhost:8080",
22222,
"localhost:9333",
workDir,
)
enhancedTime := enhancedTask.EstimateTime(basicParams)
t.Logf("Basic EC task estimated time: %v", basicTime)
t.Logf("Enhanced EC task estimated time: %v", enhancedTime)
// Enhanced should take longer due to additional processing
if enhancedTime <= basicTime {
t.Logf("Note: Enhanced EC might take longer due to local processing and smart distribution")
}
// Test feature differences
t.Logf("Basic EC features:")
t.Logf(" - Direct volume server EC generation")
t.Logf(" - Simple shard mounting")
t.Logf(" - No custom placement logic")
t.Logf("Enhanced EC features:")
t.Logf(" - Local volume data copying")
t.Logf(" - Local Reed-Solomon encoding")
t.Logf(" - Intelligent shard placement with affinity")
t.Logf(" - Rack diversity for data shards")
t.Logf(" - Load balancing across servers")
t.Logf(" - Backup server selection")
t.Logf(" - Detailed progress tracking")
t.Logf("EC task comparison completed successfully")
}

3
weed/admin/task/ec_test_standalone/go.mod

@ -1,3 +0,0 @@
module ec_test
go 1.24.1

324
weed/admin/task/ec_test_standalone/minimal_admin_server.go

@ -1,324 +0,0 @@
package task
import (
"fmt"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// MinimalAdminConfig contains configuration for the minimal admin server
type MinimalAdminConfig struct {
ScanInterval time.Duration
WorkerTimeout time.Duration
TaskTimeout time.Duration
MaxRetries int
ReconcileInterval time.Duration
EnableFailureRecovery bool
MaxConcurrentTasks int
}
// MinimalAdminServer manages workers and tasks with a simple implementation
type MinimalAdminServer struct {
config *MinimalAdminConfig
masterClient *wdclient.MasterClient
running bool
mutex sync.RWMutex
// Task management
tasks map[string]*types.Task
taskQueue []*types.Task
activeTasks map[string]*types.Task
// Worker management
workers map[string]*types.Worker
workerStatus map[string]*types.WorkerStatus
// Task history
taskHistory []MinimalTaskHistoryEntry
}
// MinimalTaskHistoryEntry represents a single task history entry
type MinimalTaskHistoryEntry struct {
TaskID string
TaskType types.TaskType
VolumeID uint32
WorkerID string
Status types.TaskStatus
StartedAt time.Time
CompletedAt time.Time
Duration time.Duration
ErrorMessage string
}
// MinimalSystemStats represents system statistics
type MinimalSystemStats struct {
ActiveTasks int
QueuedTasks int
ActiveWorkers int
TotalTasks int
}
// NewMinimalAdminServer creates a new minimal admin server
func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer {
return &MinimalAdminServer{
config: config,
masterClient: masterClient,
tasks: make(map[string]*types.Task),
taskQueue: make([]*types.Task, 0),
activeTasks: make(map[string]*types.Task),
workers: make(map[string]*types.Worker),
workerStatus: make(map[string]*types.WorkerStatus),
taskHistory: make([]MinimalTaskHistoryEntry, 0),
}
}
// Start starts the minimal admin server
func (as *MinimalAdminServer) Start() error {
as.mutex.Lock()
defer as.mutex.Unlock()
if as.running {
return fmt.Errorf("admin server is already running")
}
as.running = true
return nil
}
// Stop stops the minimal admin server
func (as *MinimalAdminServer) Stop() error {
as.mutex.Lock()
defer as.mutex.Unlock()
as.running = false
return nil
}
// RegisterWorker registers a new worker
func (as *MinimalAdminServer) RegisterWorker(worker *types.Worker) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
as.workers[worker.ID] = worker
as.workerStatus[worker.ID] = &types.WorkerStatus{
Status: "active",
CurrentLoad: 0,
}
return nil
}
// QueueTask adds a new task to the task queue
func (as *MinimalAdminServer) QueueTask(task *types.Task) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
if task.ID == "" {
task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano())
}
task.Status = types.TaskStatusPending
task.CreatedAt = time.Now()
as.tasks[task.ID] = task
as.taskQueue = append(as.taskQueue, task)
return nil
}
// RequestTask requests a task for a worker
func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return nil, fmt.Errorf("admin server is not running")
}
// Check if worker exists
worker, exists := as.workers[workerID]
if !exists {
return nil, fmt.Errorf("worker %s not found", workerID)
}
// Check if worker has capacity
status := as.workerStatus[workerID]
if status.CurrentLoad >= worker.MaxConcurrent {
return nil, nil // No capacity
}
// Find a suitable task
for i, task := range as.taskQueue {
if task.Status != types.TaskStatusPending {
continue
}
// Check if worker can handle this task type
canHandle := false
for _, capability := range capabilities {
if task.Type == capability {
canHandle = true
break
}
}
if canHandle {
// Assign task to worker
task.Status = types.TaskStatusInProgress
task.WorkerID = workerID
now := time.Now()
task.StartedAt = &now
// Move task from queue to active tasks
as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...)
as.activeTasks[task.ID] = task
// Update worker load
status.CurrentLoad++
return task, nil
}
}
return nil, nil // No suitable task found
}
// UpdateTaskProgress updates task progress
func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error {
as.mutex.Lock()
defer as.mutex.Unlock()
task, exists := as.tasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
task.Progress = progress
return nil
}
// CompleteTask marks a task as completed
func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error {
as.mutex.Lock()
defer as.mutex.Unlock()
task, exists := as.tasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
// Update task status
if success {
task.Status = types.TaskStatusCompleted
} else {
task.Status = types.TaskStatusFailed
task.Error = errorMessage
}
now := time.Now()
task.CompletedAt = &now
// Remove from active tasks
delete(as.activeTasks, taskID)
// Update worker load
if task.WorkerID != "" {
if status, exists := as.workerStatus[task.WorkerID]; exists {
status.CurrentLoad--
}
}
// Add to history
var duration time.Duration
if task.StartedAt != nil {
duration = now.Sub(*task.StartedAt)
}
entry := MinimalTaskHistoryEntry{
TaskID: task.ID,
TaskType: task.Type,
VolumeID: task.VolumeID,
WorkerID: task.WorkerID,
Status: task.Status,
StartedAt: *task.StartedAt,
CompletedAt: now,
Duration: duration,
ErrorMessage: errorMessage,
}
as.taskHistory = append(as.taskHistory, entry)
return nil
}
// UpdateWorkerHeartbeat updates worker heartbeat
func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
as.mutex.Lock()
defer as.mutex.Unlock()
worker, exists := as.workers[workerID]
if !exists {
return fmt.Errorf("worker %s not found", workerID)
}
worker.LastHeartbeat = time.Now()
as.workerStatus[workerID] = status
return nil
}
// GetSystemStats returns system statistics
func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats {
as.mutex.RLock()
defer as.mutex.RUnlock()
activeWorkers := 0
for _, status := range as.workerStatus {
if status.Status == "active" {
activeWorkers++
}
}
return &MinimalSystemStats{
ActiveTasks: len(as.activeTasks),
QueuedTasks: len(as.taskQueue),
ActiveWorkers: activeWorkers,
TotalTasks: len(as.tasks),
}
}
// GetQueuedTaskCount returns the number of queued tasks
func (as *MinimalAdminServer) GetQueuedTaskCount() int {
as.mutex.RLock()
defer as.mutex.RUnlock()
return len(as.taskQueue)
}
// GetActiveTaskCount returns the number of active tasks
func (as *MinimalAdminServer) GetActiveTaskCount() int {
as.mutex.RLock()
defer as.mutex.RUnlock()
return len(as.activeTasks)
}
// GetTaskHistory returns task history
func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry {
as.mutex.RLock()
defer as.mutex.RUnlock()
// Return a copy of the history
history := make([]MinimalTaskHistoryEntry, len(as.taskHistory))
copy(history, as.taskHistory)
return history
}

434
weed/admin/task/ec_test_standalone/minimal_integration_test.go

@ -1,434 +0,0 @@
package task
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation
func TestMinimalIntegration(t *testing.T) {
t.Logf("Starting minimal integration test")
// Step 1: Create a minimal admin server configuration
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
// Step 2: Create minimal admin server with nil master client (for testing)
adminServer := NewMinimalAdminServer(config, nil)
// Step 3: Start admin server
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Step 4: Test worker registration
t.Logf("Testing worker registration")
worker := &types.Worker{
ID: "test-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
t.Logf("Successfully registered worker %s", worker.ID)
// Step 5: Test task queueing
t.Logf("Testing task queueing")
task := &types.Task{
ID: "test-task-1",
Type: types.TaskTypeVacuum,
VolumeID: 1001,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Fatalf("Failed to queue task: %v", err)
}
t.Logf("Successfully queued task %s", task.ID)
// Step 6: Test task request by worker
t.Logf("Testing task request")
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Fatalf("Failed to request task: %v", err)
}
if assignedTask != nil {
t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
// Step 7: Test task progress updates
t.Logf("Testing task progress updates")
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
if err != nil {
t.Errorf("Failed to update task progress to 25%%: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
if err != nil {
t.Errorf("Failed to update task progress to 50%%: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0)
if err != nil {
t.Errorf("Failed to update task progress to 75%%: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update task progress to 100%%: %v", err)
}
// Step 8: Test task completion
t.Logf("Testing task completion")
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("Successfully completed task %s", assignedTask.ID)
} else {
t.Logf("No task was assigned (queue might be empty)")
}
// Step 9: Test basic metrics
t.Logf("Testing basic metrics")
stats := adminServer.GetSystemStats()
if stats != nil {
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
}
queuedCount := adminServer.GetQueuedTaskCount()
activeCount := adminServer.GetActiveTaskCount()
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
// Step 10: Test task history
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d entries", len(history))
if len(history) > 0 {
lastEntry := history[len(history)-1]
t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v",
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration)
}
t.Logf("Minimal integration test completed successfully")
}
// TestMinimalWorkerHeartbeat tests worker heartbeat functionality
func TestMinimalWorkerHeartbeat(t *testing.T) {
t.Logf("Testing minimal worker heartbeat")
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Register a worker
worker := &types.Worker{
ID: "heartbeat-worker",
Address: "localhost:9002",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
// Test heartbeat update
status := &types.WorkerStatus{
Status: "active",
CurrentLoad: 0,
}
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
if err != nil {
t.Errorf("Failed to update worker heartbeat: %v", err)
}
t.Logf("Minimal worker heartbeat test completed successfully")
}
// TestMinimalTaskQueueOperations tests task queue operations
func TestMinimalTaskQueueOperations(t *testing.T) {
t.Logf("Testing minimal task queue operations")
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Test queuing multiple tasks
taskCount := 3
for i := 0; i < taskCount; i++ {
task := &types.Task{
ID: fmt.Sprintf("queue-test-task-%d", i),
Type: types.TaskTypeVacuum,
VolumeID: uint32(2000 + i),
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Errorf("Failed to queue task %d: %v", i, err)
}
}
// Check queue size
queuedCount := adminServer.GetQueuedTaskCount()
if queuedCount != taskCount {
t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount)
}
t.Logf("Minimal task queue operations test completed successfully")
}
// TestMinimalFullWorkflow tests the complete workflow from task creation to completion
func TestMinimalFullWorkflow(t *testing.T) {
t.Logf("Testing minimal full workflow")
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Register multiple workers with different capabilities
workers := []*types.Worker{
{
ID: "vacuum-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
{
ID: "ec-worker-1",
Address: "localhost:9002",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
{
ID: "multi-worker-1",
Address: "localhost:9003",
Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding},
MaxConcurrent: 3,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
}
for _, worker := range workers {
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker %s: %v", worker.ID, err)
}
t.Logf("Registered worker %s with capabilities %v", worker.ID, worker.Capabilities)
}
// Create multiple tasks of different types
tasks := []*types.Task{
{
ID: "vacuum-task-1",
Type: types.TaskTypeVacuum,
VolumeID: 3001,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.4",
},
CreatedAt: time.Now(),
},
{
ID: "ec-task-1",
Type: types.TaskTypeErasureCoding,
VolumeID: 3002,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityHigh,
Parameters: map[string]interface{}{
"shard_count": "14",
},
CreatedAt: time.Now(),
},
{
ID: "vacuum-task-2",
Type: types.TaskTypeVacuum,
VolumeID: 3003,
Server: "localhost:8081",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityLow,
Parameters: map[string]interface{}{
"garbage_threshold": "0.5",
},
CreatedAt: time.Now(),
},
}
for _, task := range tasks {
err = adminServer.QueueTask(task)
if err != nil {
t.Fatalf("Failed to queue task %s: %v", task.ID, err)
}
t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID)
}
// Test task assignment to different workers
t.Logf("Testing task assignments")
// Vacuum worker should get vacuum tasks
assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Errorf("Failed to request task for vacuum worker: %v", err)
} else if assignedTask != nil {
t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
}
// EC worker should get EC tasks
assignedTask, err = adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil {
t.Errorf("Failed to request task for EC worker: %v", err)
} else if assignedTask != nil {
t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
}
// Multi-capability worker should be able to get any remaining task
assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
if err != nil {
t.Errorf("Failed to request task for multi worker: %v", err)
} else if assignedTask != nil {
t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
}
// Check final statistics
stats := adminServer.GetSystemStats()
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d completed tasks", len(history))
for _, entry := range history {
t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v",
entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration)
}
t.Logf("Minimal full workflow test completed successfully")
}

488
weed/admin/task/ec_worker_test.go

@ -1,488 +0,0 @@
package task
import (
"os"
"path/filepath"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestECWorkerIntegration tests the complete EC worker functionality
func TestECWorkerIntegration(t *testing.T) {
t.Logf("Starting EC worker integration test")
// Step 1: Create admin server with EC configuration
config := &MinimalAdminConfig{
ScanInterval: 5 * time.Second,
WorkerTimeout: 60 * time.Second,
TaskTimeout: 45 * time.Minute, // EC takes longer
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 1, // One at a time for EC
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
t.Logf("✓ Admin server started successfully")
// Step 2: Register EC-capable worker
worker := &types.Worker{
ID: "ec-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register EC worker: %v", err)
}
t.Logf("✓ EC worker registered: %s", worker.ID)
// Step 3: Create work directory for EC processing
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test")
err = os.MkdirAll(workDir, 0755)
if err != nil {
t.Fatalf("Failed to create work directory: %v", err)
}
defer os.RemoveAll(workDir)
t.Logf("✓ Work directory created: %s", workDir)
// Step 4: Create EC task with comprehensive parameters
ecTask := &types.Task{
ID: "ec-test-task-1",
Type: types.TaskTypeErasureCoding,
VolumeID: 54321,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityHigh,
Parameters: map[string]interface{}{
"volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB volume
"master_client": "localhost:9333",
"work_dir": workDir,
"collection": "test",
"data_shards": 10,
"parity_shards": 4,
"rack_aware": true,
"load_balance": true,
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(ecTask)
if err != nil {
t.Fatalf("Failed to queue EC task: %v", err)
}
t.Logf("✓ EC task queued: %s for volume %d", ecTask.ID, ecTask.VolumeID)
// Step 5: Worker requests and receives the EC task
assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil {
t.Fatalf("Failed to request EC task: %v", err)
}
if assignedTask == nil {
t.Fatalf("No EC task was assigned")
}
t.Logf("✓ EC task assigned: %s (%s) for volume %d",
assignedTask.ID, assignedTask.Type, assignedTask.VolumeID)
// Step 6: Test EC task creation and validation
t.Logf("Testing EC task creation and validation")
// Create EC task instance directly
factory := erasure_coding.NewFactory()
taskParams := types.TaskParams{
VolumeID: assignedTask.VolumeID,
Server: assignedTask.Server,
Collection: "test",
Parameters: assignedTask.Parameters,
}
taskInstance, err := factory.Create(taskParams)
if err != nil {
t.Fatalf("Failed to create EC task instance: %v", err)
}
t.Logf("✓ EC task instance created successfully")
// Step 7: Validate task parameters
err = taskInstance.Validate(taskParams)
if err != nil {
t.Errorf("EC task validation failed: %v", err)
} else {
t.Logf("✓ EC task validation passed")
}
// Step 8: Test time estimation
estimatedTime := taskInstance.EstimateTime(taskParams)
expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB for 64GB
t.Logf("✓ EC estimated time: %v (minimum expected: %v)", estimatedTime, expectedMinTime)
if estimatedTime < expectedMinTime {
t.Logf("⚠ Note: Estimated time seems optimistic for 64GB volume")
}
// Step 9: Simulate EC task execution phases
t.Logf("Simulating EC execution phases:")
phases := []struct {
progress float64
phase string
}{
{5.0, "Initializing EC processing"},
{15.0, "Volume data copied to local disk with progress tracking"},
{25.0, "Source volume marked as read-only"},
{45.0, "Local Reed-Solomon encoding (10+4 shards) completed"},
{60.0, "Created 14 EC shards with verification"},
{70.0, "Optimal shard placement calculated with rack awareness"},
{85.0, "Intelligent shard distribution with load balancing"},
{95.0, "Shard placement verified across multiple racks"},
{100.0, "EC processing completed with cleanup"},
}
for _, phase := range phases {
err = adminServer.UpdateTaskProgress(assignedTask.ID, phase.progress)
if err != nil {
t.Errorf("Failed to update task progress to %.1f%%: %v", phase.progress, err)
} else {
t.Logf(" %.1f%% - %s", phase.progress, phase.phase)
}
time.Sleep(50 * time.Millisecond) // Simulate processing time
}
// Step 10: Complete the EC task
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete EC task: %v", err)
} else {
t.Logf("✓ EC task completed successfully")
}
// Step 11: Verify EC task completion and metrics
stats := adminServer.GetSystemStats()
t.Logf("✓ Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
history := adminServer.GetTaskHistory()
t.Logf("✓ Task history contains %d completed tasks", len(history))
if len(history) > 0 {
lastEntry := history[len(history)-1]
t.Logf("✓ Last completed task: %s (%s) - Duration: %v",
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration)
if lastEntry.TaskType == types.TaskTypeErasureCoding {
t.Logf("✅ EC task execution verified!")
}
}
t.Logf("✅ EC worker integration test completed successfully")
}
// TestECFeatureValidation tests specific EC features
func TestECFeatureValidation(t *testing.T) {
t.Logf("Testing EC feature validation")
// Create work directory
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test")
err := os.MkdirAll(workDir, 0755)
if err != nil {
t.Fatalf("Failed to create work directory: %v", err)
}
defer os.RemoveAll(workDir)
// Test EC task features
ecTask := erasure_coding.NewTaskWithParams(
"localhost:8080", // source server
98765, // volume ID
"localhost:9333", // master client
workDir, // work directory
)
// Test current step tracking
currentStep := ecTask.GetCurrentStep()
t.Logf("✓ Initial current step: '%s'", currentStep)
initialProgress := ecTask.GetProgress()
t.Logf("✓ Initial progress: %.1f%%", initialProgress)
// Test parameter validation with features
validParams := types.TaskParams{
VolumeID: 98765,
Server: "localhost:8080",
Collection: "features_test",
Parameters: map[string]interface{}{
"volume_size": int64(128 * 1024 * 1024 * 1024), // 128GB
"master_client": "localhost:9333",
"work_dir": workDir,
"data_shards": 10,
"parity_shards": 4,
"rack_awareness": true,
"load_balancing": true,
"backup_servers": 2,
"affinity_zones": []string{"zone-a", "zone-b", "zone-c"},
},
}
err = ecTask.Validate(validParams)
if err != nil {
t.Errorf("Valid parameters should pass validation: %v", err)
} else {
t.Logf("✓ Parameter validation passed")
}
// Test time estimation for large volume
estimatedTime := ecTask.EstimateTime(validParams)
expectedMinTime := time.Duration(128*2) * time.Minute // 2 minutes per GB
t.Logf("✓ 128GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime)
if estimatedTime < expectedMinTime {
t.Errorf("Time estimate seems too low for 128GB volume")
}
// Test invalid parameters
invalidParams := types.TaskParams{
VolumeID: 0, // Invalid
Server: "", // Invalid
}
err = ecTask.Validate(invalidParams)
if err == nil {
t.Errorf("Invalid parameters should fail validation")
} else {
t.Logf("✓ Invalid parameter validation correctly failed: %v", err)
}
t.Logf("✅ EC feature validation completed successfully")
}
// TestECWorkflow tests the complete EC workflow
func TestECWorkflow(t *testing.T) {
t.Logf("Testing complete EC workflow")
// Create admin server
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 60 * time.Minute,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 1,
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Register multiple workers with different capabilities
workers := []*types.Worker{
{
ID: "ec-specialist-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
{
ID: "vacuum-worker-1",
Address: "localhost:9002",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
{
ID: "multi-capability-worker-1",
Address: "localhost:9003",
Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
}
for _, worker := range workers {
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker %s: %v", worker.ID, err)
}
t.Logf("✓ Registered worker %s with capabilities %v", worker.ID, worker.Capabilities)
}
// Create test work directory
workDir := filepath.Join(os.TempDir(), "seaweedfs_workflow_test")
err = os.MkdirAll(workDir, 0755)
if err != nil {
t.Fatalf("Failed to create work directory: %v", err)
}
defer os.RemoveAll(workDir)
// Create multiple tasks of different types
tasks := []*types.Task{
{
ID: "ec-workflow-1",
Type: types.TaskTypeErasureCoding,
VolumeID: 11111,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityHigh,
Parameters: map[string]interface{}{
"volume_size": int64(50 * 1024 * 1024 * 1024),
"master_client": "localhost:9333",
"work_dir": workDir,
"collection": "workflow_test",
},
CreatedAt: time.Now(),
},
{
ID: "vacuum-workflow-1",
Type: types.TaskTypeVacuum,
VolumeID: 22222,
Server: "localhost:8081",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.4",
"volume_size": int64(20 * 1024 * 1024 * 1024),
},
CreatedAt: time.Now(),
},
{
ID: "ec-workflow-2",
Type: types.TaskTypeErasureCoding,
VolumeID: 33333,
Server: "localhost:8082",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"volume_size": int64(80 * 1024 * 1024 * 1024),
"master_client": "localhost:9333",
"work_dir": workDir,
"collection": "workflow_test",
},
CreatedAt: time.Now(),
},
}
// Queue all tasks
for _, task := range tasks {
err = adminServer.QueueTask(task)
if err != nil {
t.Fatalf("Failed to queue task %s: %v", task.ID, err)
}
t.Logf("✓ Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID)
}
// Test task assignment to appropriate workers
t.Logf("Testing task assignments to appropriate workers")
// EC specialist should get EC tasks
assignedTask, err := adminServer.RequestTask("ec-specialist-1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil {
t.Errorf("Failed to request task for EC specialist: %v", err)
} else if assignedTask != nil {
t.Logf("✓ EC specialist got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("✓ EC task completed by specialist")
}
// Vacuum worker should get vacuum tasks
assignedTask, err = adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Errorf("Failed to request task for vacuum worker: %v", err)
} else if assignedTask != nil {
t.Logf("✓ Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("✓ Vacuum task completed by vacuum worker")
}
// Multi-capability worker should get remaining tasks
assignedTask, err = adminServer.RequestTask("multi-capability-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
if err != nil {
t.Errorf("Failed to request task for multi-capability worker: %v", err)
} else if assignedTask != nil {
t.Logf("✓ Multi-capability worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("✓ Task completed by multi-capability worker")
}
// Check final workflow statistics
stats := adminServer.GetSystemStats()
t.Logf("✓ Final workflow stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
history := adminServer.GetTaskHistory()
t.Logf("✓ Workflow history contains %d completed tasks", len(history))
// Analyze task completion by type
ecTasks := 0
vacuumTasks := 0
for _, entry := range history {
switch entry.TaskType {
case types.TaskTypeErasureCoding:
ecTasks++
t.Logf(" EC: %s - Worker: %s, Duration: %v",
entry.TaskID, entry.WorkerID, entry.Duration)
case types.TaskTypeVacuum:
vacuumTasks++
t.Logf(" Vacuum: %s - Worker: %s, Duration: %v",
entry.TaskID, entry.WorkerID, entry.Duration)
}
}
t.Logf("✓ Completed tasks: %d EC, %d Vacuum", ecTasks, vacuumTasks)
t.Logf("✅ EC workflow test completed successfully")
}

346
weed/admin/task/example_usage.go

@ -1,346 +0,0 @@
package task
import (
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// ExampleUsage demonstrates how to use the task distribution system
func ExampleUsage() {
glog.Infof("=== SeaweedFS Task Distribution System Example ===")
// Example 1: Setting up the Admin Server
setupAdminServerExample()
// Example 2: Simulating Workers
simulateWorkersExample()
// Example 3: Running Simulations
runSimulationsExample()
// Example 4: Demonstrating Features
demonstrateFeaturesExample()
}
// setupAdminServerExample shows how to set up the admin server
func setupAdminServerExample() {
glog.Infof("\n--- Example 1: Setting up Admin Server ---")
// Create master client (in real usage, this would connect to actual master)
masterClient := &wdclient.MasterClient{} // Simplified for example
// Create admin server configuration
config := &AdminConfig{
ScanInterval: 30 * time.Minute,
WorkerTimeout: 5 * time.Minute,
TaskTimeout: 10 * time.Minute,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 10,
}
// Create admin server
adminServer := NewAdminServer(config, masterClient)
// Start the admin server
if err := adminServer.Start(); err != nil {
glog.Errorf("Failed to start admin server: %v", err)
return
}
glog.Infof("✓ Admin server started with configuration:")
glog.Infof(" - Scan Interval: %v", config.ScanInterval)
glog.Infof(" - Worker Timeout: %v", config.WorkerTimeout)
glog.Infof(" - Max Concurrent Tasks: %d", config.MaxConcurrentTasks)
// Simulate some operations
time.Sleep(2 * time.Second)
// Stop the admin server
adminServer.Stop()
glog.Infof("✓ Admin server stopped gracefully")
}
// simulateWorkersExample shows how workers would register and operate
func simulateWorkersExample() {
glog.Infof("\n--- Example 2: Worker Registration and Operation ---")
// Create mock workers
workers := []*types.Worker{
{
ID: "worker-ec-01",
Address: "192.168.1.100:8080",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
},
{
ID: "worker-vacuum-01",
Address: "192.168.1.101:8080",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 3,
Status: "active",
CurrentLoad: 0,
},
{
ID: "worker-multi-01",
Address: "192.168.1.102:8080",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
},
}
// Create worker registry
registry := NewWorkerRegistry()
// Register workers
for _, worker := range workers {
if err := registry.RegisterWorker(worker); err != nil {
glog.Errorf("Failed to register worker %s: %v", worker.ID, err)
} else {
glog.Infof("✓ Registered worker %s with capabilities: %v", worker.ID, worker.Capabilities)
}
}
// Demonstrate worker selection
bestECWorker := registry.GetBestWorkerForTask(types.TaskTypeErasureCoding)
if bestECWorker != nil {
glog.Infof("✓ Best worker for EC tasks: %s", bestECWorker.ID)
}
bestVacuumWorker := registry.GetBestWorkerForTask(types.TaskTypeVacuum)
if bestVacuumWorker != nil {
glog.Infof("✓ Best worker for vacuum tasks: %s", bestVacuumWorker.ID)
}
// Show registry statistics
stats := registry.GetRegistryStats()
glog.Infof("✓ Registry statistics: %+v", stats)
}
// runSimulationsExample shows how to run simulation scenarios
func runSimulationsExample() {
glog.Infof("\n--- Example 3: Running Simulation Scenarios ---")
// Note: Simulation framework moved to simulation package
// To use: simulationRunner := simulation.NewComprehensiveSimulationRunner()
// simulationRunner.RunAllComprehensiveTests()
glog.Infof("✅ Simulation framework available in separate package")
glog.Infof("Use simulation.NewComprehensiveSimulationRunner() to access comprehensive testing")
}
// demonstrateFeaturesExample shows key system features
func demonstrateFeaturesExample() {
glog.Infof("\n--- Example 4: Key System Features ---")
// Feature 1: Task Discovery
demonstrateTaskDiscovery()
// Feature 2: Volume State Tracking
demonstrateVolumeStateTracking()
// Feature 3: Failure Handling
demonstrateFailureHandling()
// Feature 4: Task Scheduling
demonstrateTaskScheduling()
}
// demonstrateTaskDiscovery shows how task discovery works
func demonstrateTaskDiscovery() {
glog.Infof("\n Feature 1: Task Discovery")
// Create mock volumes
volumes := []*VolumeInfo{
{
ID: 1,
Size: 28 * 1024 * 1024 * 1024, // 28GB (93% of 30GB)
Collection: "photos",
DeletedByteCount: 0,
ReadOnly: false,
ModifiedAtSecond: time.Now().Add(-2 * time.Hour).Unix(), // 2 hours old
},
{
ID: 2,
Size: 20 * 1024 * 1024 * 1024, // 20GB
Collection: "documents",
DeletedByteCount: 8 * 1024 * 1024 * 1024, // 8GB garbage (40%)
ReadOnly: false,
ModifiedAtSecond: time.Now().Add(-1 * time.Hour).Unix(), // 1 hour old
},
}
// Create detectors
ecDetector := NewECDetector()
vacuumDetector := NewVacuumDetector()
// Test EC detection
ecCandidates, _ := ecDetector.DetectECCandidates(volumes)
glog.Infof(" ✓ EC detector found %d candidates", len(ecCandidates))
for _, candidate := range ecCandidates {
glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority)
}
// Test vacuum detection
vacuumCandidates, _ := vacuumDetector.DetectVacuumCandidates(volumes)
glog.Infof(" ✓ Vacuum detector found %d candidates", len(vacuumCandidates))
for _, candidate := range vacuumCandidates {
glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority)
}
}
// demonstrateVolumeStateTracking shows volume state management
func demonstrateVolumeStateTracking() {
glog.Infof("\n Feature 2: Volume State Tracking")
// Create volume state tracker
tracker := NewVolumeStateTracker(nil, 5*time.Minute)
// Reserve volumes for tasks
tracker.ReserveVolume(1, "task-ec-001")
tracker.ReserveVolume(2, "task-vacuum-001")
glog.Infof(" ✓ Reserved volumes for tasks")
// Check reservations
if tracker.IsVolumeReserved(1) {
glog.Infof(" ✓ Volume 1 is correctly reserved")
}
// Record volume changes
tracker.RecordVolumeChange(1, types.TaskTypeErasureCoding, "task-ec-001")
glog.Infof(" ✓ Recorded volume change for EC completion")
// Get pending changes
if change := tracker.GetPendingChange(1); change != nil {
glog.Infof(" ✓ Pending change found: %s for volume %d", change.ChangeType, change.VolumeID)
}
// Release reservation
tracker.ReleaseVolume(2, "task-vacuum-001")
glog.Infof(" ✓ Released volume reservation")
// Show statistics
stats := tracker.GetStats()
glog.Infof(" ✓ Tracker statistics: %+v", stats)
}
// demonstrateFailureHandling shows failure recovery mechanisms
func demonstrateFailureHandling() {
glog.Infof("\n Feature 3: Failure Handling")
// Create failure handler
config := DefaultAdminConfig()
handler := NewFailureHandler(config)
// Create mock task
task := &InProgressTask{
Task: &types.Task{
ID: "test-task-001",
Type: types.TaskTypeErasureCoding,
VolumeID: 1,
RetryCount: 0,
},
WorkerID: "worker-01",
StartedAt: time.Now(),
LastUpdate: time.Now().Add(-30 * time.Minute), // 30 minutes ago
Progress: 45.0,
}
// Demonstrate different failure scenarios
glog.Infof(" ✓ Simulating worker timeout scenario")
handler.HandleWorkerTimeout("worker-01", []*InProgressTask{task})
glog.Infof(" ✓ Simulating stuck task scenario")
handler.HandleTaskStuck(task)
glog.Infof(" ✓ Simulating duplicate task detection")
handler.HandleDuplicateTask("existing-task", "duplicate-task", 1)
// Show failure statistics
stats := handler.GetFailureStats()
glog.Infof(" ✓ Failure handler statistics: %+v", stats)
}
// demonstrateTaskScheduling shows task scheduling logic
func demonstrateTaskScheduling() {
glog.Infof("\n Feature 4: Task Scheduling")
// Create worker registry and task queue
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
// Add mock worker
worker := &types.Worker{
ID: "scheduler-worker-01",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(worker)
// Create mock tasks with different priorities
highPriorityTask := &types.Task{
ID: "high-priority-task",
Type: types.TaskTypeErasureCoding,
Priority: types.TaskPriorityHigh,
VolumeID: 1,
}
normalPriorityTask := &types.Task{
ID: "normal-priority-task",
Type: types.TaskTypeVacuum,
Priority: types.TaskPriorityNormal,
VolumeID: 2,
}
// Add tasks to queue
queue.Push(normalPriorityTask)
queue.Push(highPriorityTask) // Should be prioritized
glog.Infof(" ✓ Added tasks to priority queue (size: %d)", queue.Size())
// Test worker selection
selectedWorker := scheduler.SelectWorker(highPriorityTask, []*types.Worker{worker})
if selectedWorker != nil {
glog.Infof(" ✓ Selected worker %s for high-priority task", selectedWorker.ID)
}
// Test task retrieval
nextTask := scheduler.GetNextTask("scheduler-worker-01", []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum})
if nextTask != nil {
glog.Infof(" ✓ Next task for worker: %s (priority: %d)", nextTask.ID, nextTask.Priority)
}
glog.Infof(" ✓ Task scheduling demonstration complete")
}
// RunComprehensiveDemo runs a full demonstration of the system
func RunComprehensiveDemo() {
glog.Infof("Starting comprehensive task distribution system demonstration...")
// Run comprehensive example
ExampleUsage()
// Note: To run the comprehensive simulation framework, use:
// simulationRunner := simulation.NewComprehensiveSimulationRunner()
// simulationRunner.RunAllComprehensiveTests()
glog.Infof("=== Comprehensive demonstration complete ===")
glog.Infof("💡 To run comprehensive simulations, use the simulation package separately")
glog.Infof("Step 9: Comprehensive Simulation Testing")
glog.Infof("Note: Simulation framework moved to separate 'simulation' package")
glog.Infof("To run simulations: simulation.NewComprehensiveSimulationRunner().RunAllComprehensiveTests()")
glog.Infof("✅ Simulation framework available in separate package")
glog.Infof("")
}

123
weed/admin/task/failure_handler.go

@ -1,123 +0,0 @@
package task
import (
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
)
// FailureHandler handles various failure scenarios in the task system
type FailureHandler struct {
config *AdminConfig
}
// NewFailureHandler creates a new failure handler
func NewFailureHandler(config *AdminConfig) *FailureHandler {
return &FailureHandler{
config: config,
}
}
// HandleWorkerTimeout handles worker timeout scenarios
func (fh *FailureHandler) HandleWorkerTimeout(workerID string, affectedTasks []*InProgressTask) {
glog.Warningf("Handling worker timeout for worker %s with %d affected tasks", workerID, len(affectedTasks))
for _, task := range affectedTasks {
fh.handleTaskFailure(task, "worker_timeout", "Worker became unresponsive")
}
}
// HandleTaskStuck handles stuck task scenarios
func (fh *FailureHandler) HandleTaskStuck(task *InProgressTask) {
glog.Warningf("Handling stuck task %s (no progress for %v)", task.Task.ID, time.Since(task.LastUpdate))
fh.handleTaskFailure(task, "task_stuck", "Task made no progress within timeout period")
}
// HandleTaskFailure handles general task failure scenarios
func (fh *FailureHandler) HandleTaskFailure(task *InProgressTask, reason string, details string) {
glog.Errorf("Handling task failure for task %s: %s - %s", task.Task.ID, reason, details)
fh.handleTaskFailure(task, reason, details)
}
// handleTaskFailure is the internal handler for task failures
func (fh *FailureHandler) handleTaskFailure(task *InProgressTask, reason string, details string) {
// Record failure reason
task.Task.Error = details
// Determine if task should be retried
if task.Task.RetryCount < fh.config.MaxRetries {
fh.scheduleRetry(task, reason)
} else {
fh.markTaskFailed(task, reason)
}
}
// scheduleRetry schedules a task for retry
func (fh *FailureHandler) scheduleRetry(task *InProgressTask, reason string) {
task.Task.RetryCount++
// Calculate retry delay with exponential backoff
retryDelay := time.Duration(task.Task.RetryCount) * 5 * time.Minute
task.Task.ScheduledAt = time.Now().Add(retryDelay)
glog.Infof("Scheduling retry %d/%d for task %s (reason: %s, delay: %v)",
task.Task.RetryCount, fh.config.MaxRetries, task.Task.ID, reason, retryDelay)
}
// markTaskFailed permanently marks a task as failed
func (fh *FailureHandler) markTaskFailed(task *InProgressTask, reason string) {
glog.Errorf("Task %s permanently failed after %d retries (reason: %s)",
task.Task.ID, task.Task.RetryCount, reason)
// Could trigger alerts or notifications here
fh.sendFailureAlert(task, reason)
}
// sendFailureAlert sends alerts for permanently failed tasks
func (fh *FailureHandler) sendFailureAlert(task *InProgressTask, reason string) {
// In a real implementation, this would:
// 1. Send notifications to administrators
// 2. Update monitoring dashboards
// 3. Log to audit trails
// 4. Possibly trigger automatic remediation
glog.Errorf("ALERT: Task permanently failed - ID: %s, Type: %s, Volume: %d, Reason: %s",
task.Task.ID, task.Task.Type, task.Task.VolumeID, reason)
}
// HandleDuplicateTask handles duplicate task detection
func (fh *FailureHandler) HandleDuplicateTask(existingTaskID string, duplicateTaskID string, volumeID uint32) {
glog.Warningf("Detected duplicate task for volume %d: existing=%s, duplicate=%s",
volumeID, existingTaskID, duplicateTaskID)
// Cancel the duplicate task
// In a real implementation, this would send a cancellation signal
}
// HandleResourceExhaustion handles resource exhaustion scenarios
func (fh *FailureHandler) HandleResourceExhaustion(workerID string, taskType string) {
glog.Warningf("Worker %s reported resource exhaustion for task type %s", workerID, taskType)
// Could implement:
// 1. Temporary worker blacklisting
// 2. Task redistribution
// 3. Resource monitoring alerts
}
// GetFailureStats returns failure statistics
func (fh *FailureHandler) GetFailureStats() map[string]interface{} {
// In a real implementation, this would track:
// - Failure rates by type
// - Worker reliability scores
// - Task retry statistics
// - System health metrics
return map[string]interface{}{
"enabled": true,
"max_retries": fh.config.MaxRetries,
"task_timeout": fh.config.TaskTimeout.String(),
"worker_timeout": fh.config.WorkerTimeout.String(),
}
}

486
weed/admin/task/master_sync.go

@ -1,486 +0,0 @@
package task
import (
"context"
"fmt"
"strconv"
"strings"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// MasterSynchronizer handles periodic synchronization with the master server
type MasterSynchronizer struct {
masterClient *wdclient.MasterClient
volumeStateManager *VolumeStateManager
adminServer *AdminServer
syncInterval time.Duration
stopCh chan struct{}
volumeSizeLimitMB uint64 // Volume size limit from master in MB
}
// NewMasterSynchronizer creates a new master synchronizer
func NewMasterSynchronizer(masterClient *wdclient.MasterClient, vsm *VolumeStateManager, admin *AdminServer) *MasterSynchronizer {
return &MasterSynchronizer{
masterClient: masterClient,
volumeStateManager: vsm,
adminServer: admin,
syncInterval: 30 * time.Second, // Default 30 second sync interval
stopCh: make(chan struct{}),
}
}
// Start begins the periodic master synchronization
func (ms *MasterSynchronizer) Start() {
glog.Infof("Starting master synchronization with interval %v", ms.syncInterval)
go func() {
// Immediate sync on startup
ms.performSync()
ticker := time.NewTicker(ms.syncInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
ms.performSync()
case <-ms.stopCh:
glog.Infof("Master synchronization stopped")
return
}
}
}()
}
// Stop stops the master synchronization
func (ms *MasterSynchronizer) Stop() {
close(ms.stopCh)
}
// performSync executes a single synchronization cycle
func (ms *MasterSynchronizer) performSync() {
glog.V(1).Infof("Starting master sync cycle")
startTime := time.Now()
// Get volume list from master
volumeData, err := ms.getVolumeListFromMaster()
if err != nil {
glog.Errorf("Failed to get volume list from master: %v", err)
return
}
// Update volume size limit from master
if volumeData.VolumeSizeLimitMb > 0 {
ms.volumeSizeLimitMB = volumeData.VolumeSizeLimitMb
glog.V(2).Infof("Updated volume size limit to %d MB from master", ms.volumeSizeLimitMB)
}
// Merge data into volume state manager
err = ms.mergeVolumeData(volumeData)
if err != nil {
glog.Errorf("Failed to merge volume data: %v", err)
return
}
// Detect volumes needing work
candidates := ms.detectMaintenanceCandidates(volumeData)
// Process candidates for task assignment
ms.processCandidates(candidates)
duration := time.Since(startTime)
glog.V(1).Infof("Master sync completed in %v, found %d maintenance candidates",
duration, len(candidates))
}
// getVolumeListFromMaster retrieves the current volume topology from master
func (ms *MasterSynchronizer) getVolumeListFromMaster() (*master_pb.VolumeListResponse, error) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
err := ms.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
req := &master_pb.VolumeListRequest{}
response, err := client.VolumeList(ctx, req)
if err != nil {
return fmt.Errorf("VolumeList RPC failed: %v", err)
}
volumeData = response
return nil
})
if err != nil {
return nil, err
}
return volumeData, nil
}
// VolumeMaintenanceCandidate represents a volume that needs maintenance
type VolumeMaintenanceCandidate struct {
VolumeID uint32
Server string
TaskType string
Priority TaskPriority
Reason string
VolumeInfo *VolumeInfo
ECShardInfo map[int]*ShardInfo
}
// mergeVolumeData merges master volume data into the volume state manager
func (ms *MasterSynchronizer) mergeVolumeData(data *master_pb.VolumeListResponse) error {
if data.TopologyInfo == nil {
return fmt.Errorf("empty topology info from master")
}
volumes := make(map[uint32]*VolumeInfo)
ecShards := make(map[uint32]map[int]*ShardInfo)
serverCapacity := make(map[string]*CapacityInfo)
// Extract volume information from topology
ms.extractVolumesFromTopology(data.TopologyInfo, volumes, ecShards, serverCapacity)
// Update volume state manager
err := ms.volumeStateManager.SyncWithMasterData(volumes, ecShards, serverCapacity)
if err != nil {
return fmt.Errorf("failed to sync with volume state manager: %v", err)
}
glog.V(2).Infof("Synced %d volumes, %d EC volume groups, %d servers",
len(volumes), len(ecShards), len(serverCapacity))
return nil
}
// extractVolumesFromTopology extracts volume and capacity data from master topology
func (ms *MasterSynchronizer) extractVolumesFromTopology(
topology *master_pb.TopologyInfo,
volumes map[uint32]*VolumeInfo,
ecShards map[uint32]map[int]*ShardInfo,
serverCapacity map[string]*CapacityInfo) {
for _, dcInfo := range topology.DataCenterInfos {
for _, rackInfo := range dcInfo.RackInfos {
for _, nodeInfo := range rackInfo.DataNodeInfos {
serverID := fmt.Sprintf("%s:%d", nodeInfo.Id, nodeInfo.GrpcPort)
// Initialize server capacity info
if serverCapacity[serverID] == nil {
serverCapacity[serverID] = &CapacityInfo{
Server: serverID,
}
}
// Process disk information
for diskType, diskInfo := range nodeInfo.DiskInfos {
ms.processDiskInfo(diskInfo, diskType, serverID, volumes, ecShards, serverCapacity)
}
}
}
}
}
// processDiskInfo processes disk information for a specific server
func (ms *MasterSynchronizer) processDiskInfo(
diskInfo *master_pb.DiskInfo,
diskType string,
serverID string,
volumes map[uint32]*VolumeInfo,
ecShards map[uint32]map[int]*ShardInfo,
serverCapacity map[string]*CapacityInfo) {
// Update capacity information
capacity := serverCapacity[serverID]
volumeSizeBytes := int64(ms.volumeSizeLimitMB) * 1024 * 1024 // Convert MB to bytes
capacity.TotalCapacity += int64(diskInfo.MaxVolumeCount) * volumeSizeBytes
capacity.UsedCapacity += int64(diskInfo.ActiveVolumeCount) * volumeSizeBytes
// Process regular volumes
for _, volInfo := range diskInfo.VolumeInfos {
volumes[volInfo.Id] = &VolumeInfo{
ID: volInfo.Id,
Size: volInfo.Size,
Collection: volInfo.Collection,
FileCount: volInfo.FileCount,
DeleteCount: volInfo.DeleteCount,
DeletedByteCount: volInfo.DeletedByteCount,
ReadOnly: volInfo.ReadOnly,
Server: serverID,
DiskType: diskType,
ModifiedAtSecond: volInfo.ModifiedAtSecond,
}
}
// Process EC shards
for _, shardInfo := range diskInfo.EcShardInfos {
volumeID := shardInfo.Id
if ecShards[volumeID] == nil {
ecShards[volumeID] = make(map[int]*ShardInfo)
}
// Extract shard IDs from ec_index_bits
for shardID := 0; shardID < 14; shardID++ {
if (shardInfo.EcIndexBits & (1 << uint(shardID))) != 0 {
ecShards[volumeID][shardID] = &ShardInfo{
ShardID: shardID,
Server: serverID,
Status: ShardStatusExists,
Size: 0, // Size not available in shard info
}
}
}
}
}
// detectMaintenanceCandidates identifies volumes that need maintenance
func (ms *MasterSynchronizer) detectMaintenanceCandidates(data *master_pb.VolumeListResponse) []*VolumeMaintenanceCandidate {
var candidates []*VolumeMaintenanceCandidate
// Get current volume states
currentVolumes := ms.volumeStateManager.GetAllVolumeStates()
for volumeID, volumeState := range currentVolumes {
// Skip volumes with in-progress tasks
if len(volumeState.InProgressTasks) > 0 {
continue
}
// Check for EC encoding candidates
if candidate := ms.checkECEncodingCandidate(volumeID, volumeState); candidate != nil {
candidates = append(candidates, candidate)
}
// Check for vacuum candidates
if candidate := ms.checkVacuumCandidate(volumeID, volumeState); candidate != nil {
candidates = append(candidates, candidate)
}
// Check for EC rebuild candidates
if candidate := ms.checkECRebuildCandidate(volumeID, volumeState); candidate != nil {
candidates = append(candidates, candidate)
}
}
return candidates
}
// EC encoding criteria - using configuration from EC detector
func (ms *MasterSynchronizer) checkECEncodingCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate {
volume := state.CurrentState
if volume == nil {
return nil
}
// Get the current configuration from the EC detector
ecDetector, _ := erasure_coding.GetSharedInstances()
if ecDetector == nil || !ecDetector.IsEnabled() {
return nil
}
// Get configuration values from the detector
fullnessThreshold := ecDetector.GetFullnessRatio()
quietForSeconds := ecDetector.GetQuietForSeconds()
collectionFilter := ecDetector.GetCollectionFilter()
// EC encoding criteria:
// 1. Volume meets fullness ratio threshold
// 2. Volume has been quiet for required duration
// 3. Collection filter matches (if specified)
// 4. Not already EC encoded
// Check fullness ratio (if we have size info)
if volume.Size == 0 {
return nil
}
// Calculate fullness ratio (assuming total capacity is close to actual size for near-full volumes)
// For a more accurate calculation, we'd need the volume's max capacity
fullnessRatio := float64(volume.Size-volume.DeletedByteCount) / float64(volume.Size)
if fullnessRatio < fullnessThreshold {
return nil
}
// Check collection filter if specified
if collectionFilter != "" {
// Parse comma-separated collections
allowedCollections := make(map[string]bool)
for _, collection := range strings.Split(collectionFilter, ",") {
allowedCollections[strings.TrimSpace(collection)] = true
}
// Skip if volume's collection is not in the allowed list
if !allowedCollections[volume.Collection] {
return nil
}
}
// Check quiet duration using volume's last modification time
now := time.Now()
lastModified := time.Unix(volume.ModifiedAtSecond, 0)
timeSinceModification := now.Sub(lastModified)
if timeSinceModification < time.Duration(quietForSeconds)*time.Second {
return nil // Volume hasn't been quiet long enough
}
return &VolumeMaintenanceCandidate{
VolumeID: volumeID,
Server: volume.Server,
TaskType: "ec_encode",
Priority: types.TaskPriorityLow, // EC is typically low priority
Reason: fmt.Sprintf("Volume meets EC criteria: fullness=%.1f%% (>%.1f%%), quiet for %s (>%ds), collection='%s'", fullnessRatio*100, fullnessThreshold*100, timeSinceModification.Truncate(time.Second), quietForSeconds, volume.Collection),
VolumeInfo: volume,
}
}
// checkVacuumCandidate checks if a volume is a candidate for vacuum
func (ms *MasterSynchronizer) checkVacuumCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate {
volume := state.CurrentState
if volume == nil || volume.ReadOnly {
return nil
}
// Get the current configuration from the vacuum detector
vacuumDetector, _ := vacuum.GetSharedInstances()
if vacuumDetector == nil || !vacuumDetector.IsEnabled() {
return nil
}
// Get configuration values from the detector
garbageThreshold := vacuumDetector.GetGarbageThreshold()
minVolumeAge := vacuumDetector.GetMinVolumeAge()
// Vacuum criteria:
// 1. Volume meets garbage threshold
// 2. Volume is old enough (respects minimum age)
// 3. Volume has sufficient size
// Check minimum volume size (avoid vacuum on tiny volumes)
if volume.Size == 0 {
return nil
}
// Check garbage ratio
deletedRatio := float64(volume.DeletedByteCount) / float64(volume.Size)
if deletedRatio < garbageThreshold {
return nil
}
// Check minimum volume age using volume's last modification time
now := time.Now()
lastModified := time.Unix(volume.ModifiedAtSecond, 0)
volumeAge := now.Sub(lastModified)
if volumeAge < minVolumeAge {
return nil // Volume is too new for vacuum
}
// Determine priority based on garbage ratio
priority := types.TaskPriorityNormal
if deletedRatio > 0.6 { // High garbage ratio gets higher priority
priority = types.TaskPriorityHigh
}
return &VolumeMaintenanceCandidate{
VolumeID: volumeID,
Server: volume.Server,
TaskType: "vacuum",
Priority: priority,
Reason: fmt.Sprintf("Volume meets vacuum criteria: garbage=%.1f%% (>%.1f%%), age=%s (>%s)",
deletedRatio*100, garbageThreshold*100, volumeAge.Truncate(time.Second), minVolumeAge.Truncate(time.Second)),
VolumeInfo: volume,
}
}
// checkECRebuildCandidate checks if an EC volume needs shard rebuilding
func (ms *MasterSynchronizer) checkECRebuildCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate {
// For now, skip EC rebuild detection as it requires more complex shard state tracking
// This would be implemented when the volume state manager provides proper EC shard access
return nil
}
// processCandidates attempts to assign tasks for maintenance candidates
func (ms *MasterSynchronizer) processCandidates(candidates []*VolumeMaintenanceCandidate) {
for _, candidate := range candidates {
// Check if we can assign this task
if !ms.canAssignCandidate(candidate) {
glog.V(2).Infof("Cannot assign task for volume %d: insufficient capacity or no workers",
candidate.VolumeID)
continue
}
// Create and queue the task
task := ms.createTaskFromCandidate(candidate)
if task != nil {
ms.adminServer.QueueTask(task)
glog.V(1).Infof("Queued %s task for volume %d on server %s: %s",
candidate.TaskType, candidate.VolumeID, candidate.Server, candidate.Reason)
}
}
}
// canAssignCandidate checks if a candidate can be assigned (capacity, workers available)
func (ms *MasterSynchronizer) canAssignCandidate(candidate *VolumeMaintenanceCandidate) bool {
// Check if server has capacity for the task
if candidate.TaskType == "ec_encode" {
// EC encoding requires significant temporary space
requiredSpace := int64(candidate.VolumeInfo.Size * 2) // Estimate 2x volume size needed
if !ms.volumeStateManager.CanAssignVolumeToServer(requiredSpace, candidate.Server) {
return false
}
}
// Check if we have workers capable of this task type
availableWorkers := ms.adminServer.GetAvailableWorkers(candidate.TaskType)
if len(availableWorkers) == 0 {
return false
}
return true
}
// createTaskFromCandidate creates a task from a maintenance candidate
func (ms *MasterSynchronizer) createTaskFromCandidate(candidate *VolumeMaintenanceCandidate) *Task {
now := time.Now()
task := &Task{
ID: generateTaskID(),
Type: TaskType(candidate.TaskType),
VolumeID: candidate.VolumeID,
Priority: candidate.Priority,
Status: types.TaskStatusPending,
CreatedAt: now,
Parameters: map[string]interface{}{
"volume_id": fmt.Sprintf("%d", candidate.VolumeID),
"server": candidate.Server,
"reason": candidate.Reason,
},
}
// Add task-specific parameters
switch candidate.TaskType {
case "ec_encode":
task.Parameters["replication"] = "001" // Default replication for EC
task.Parameters["collection"] = candidate.VolumeInfo.Collection
case "vacuum":
// Get the current garbage threshold from the vacuum detector
vacuumDetector, _ := vacuum.GetSharedInstances()
var garbageThreshold float64 = 0.3 // Default fallback
if vacuumDetector != nil {
garbageThreshold = vacuumDetector.GetGarbageThreshold()
}
task.Parameters["garbage_threshold"] = strconv.FormatFloat(garbageThreshold, 'f', -1, 64)
case "ec_rebuild":
// Add info about which shards need rebuilding
}
return task
}
// Global variable to hold the master volume data
var volumeData *master_pb.VolumeListResponse

324
weed/admin/task/minimal_admin_server.go

@ -1,324 +0,0 @@
package task
import (
"fmt"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// MinimalAdminConfig contains configuration for the minimal admin server
type MinimalAdminConfig struct {
ScanInterval time.Duration
WorkerTimeout time.Duration
TaskTimeout time.Duration
MaxRetries int
ReconcileInterval time.Duration
EnableFailureRecovery bool
MaxConcurrentTasks int
}
// MinimalAdminServer manages workers and tasks with a simple implementation
type MinimalAdminServer struct {
config *MinimalAdminConfig
masterClient *wdclient.MasterClient
running bool
mutex sync.RWMutex
// Task management
tasks map[string]*types.Task
taskQueue []*types.Task
activeTasks map[string]*types.Task
// Worker management
workers map[string]*types.Worker
workerStatus map[string]*types.WorkerStatus
// Task history
taskHistory []MinimalTaskHistoryEntry
}
// MinimalTaskHistoryEntry represents a single task history entry
type MinimalTaskHistoryEntry struct {
TaskID string
TaskType types.TaskType
VolumeID uint32
WorkerID string
Status types.TaskStatus
StartedAt time.Time
CompletedAt time.Time
Duration time.Duration
ErrorMessage string
}
// MinimalSystemStats represents system statistics
type MinimalSystemStats struct {
ActiveTasks int
QueuedTasks int
ActiveWorkers int
TotalTasks int
}
// NewMinimalAdminServer creates a new minimal admin server
func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer {
return &MinimalAdminServer{
config: config,
masterClient: masterClient,
tasks: make(map[string]*types.Task),
taskQueue: make([]*types.Task, 0),
activeTasks: make(map[string]*types.Task),
workers: make(map[string]*types.Worker),
workerStatus: make(map[string]*types.WorkerStatus),
taskHistory: make([]MinimalTaskHistoryEntry, 0),
}
}
// Start starts the minimal admin server
func (as *MinimalAdminServer) Start() error {
as.mutex.Lock()
defer as.mutex.Unlock()
if as.running {
return fmt.Errorf("admin server is already running")
}
as.running = true
return nil
}
// Stop stops the minimal admin server
func (as *MinimalAdminServer) Stop() error {
as.mutex.Lock()
defer as.mutex.Unlock()
as.running = false
return nil
}
// RegisterWorker registers a new worker
func (as *MinimalAdminServer) RegisterWorker(worker *types.Worker) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
as.workers[worker.ID] = worker
as.workerStatus[worker.ID] = &types.WorkerStatus{
Status: "active",
CurrentLoad: 0,
}
return nil
}
// QueueTask adds a new task to the task queue
func (as *MinimalAdminServer) QueueTask(task *types.Task) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
if task.ID == "" {
task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano())
}
task.Status = types.TaskStatusPending
task.CreatedAt = time.Now()
as.tasks[task.ID] = task
as.taskQueue = append(as.taskQueue, task)
return nil
}
// RequestTask requests a task for a worker
func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return nil, fmt.Errorf("admin server is not running")
}
// Check if worker exists
worker, exists := as.workers[workerID]
if !exists {
return nil, fmt.Errorf("worker %s not found", workerID)
}
// Check if worker has capacity
status := as.workerStatus[workerID]
if status.CurrentLoad >= worker.MaxConcurrent {
return nil, nil // No capacity
}
// Find a suitable task
for i, task := range as.taskQueue {
if task.Status != types.TaskStatusPending {
continue
}
// Check if worker can handle this task type
canHandle := false
for _, capability := range capabilities {
if task.Type == capability {
canHandle = true
break
}
}
if canHandle {
// Assign task to worker
task.Status = types.TaskStatusInProgress
task.WorkerID = workerID
now := time.Now()
task.StartedAt = &now
// Move task from queue to active tasks
as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...)
as.activeTasks[task.ID] = task
// Update worker load
status.CurrentLoad++
return task, nil
}
}
return nil, nil // No suitable task found
}
// UpdateTaskProgress updates task progress
func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error {
as.mutex.Lock()
defer as.mutex.Unlock()
task, exists := as.tasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
task.Progress = progress
return nil
}
// CompleteTask marks a task as completed
func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error {
as.mutex.Lock()
defer as.mutex.Unlock()
task, exists := as.tasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
// Update task status
if success {
task.Status = types.TaskStatusCompleted
} else {
task.Status = types.TaskStatusFailed
task.Error = errorMessage
}
now := time.Now()
task.CompletedAt = &now
// Remove from active tasks
delete(as.activeTasks, taskID)
// Update worker load
if task.WorkerID != "" {
if status, exists := as.workerStatus[task.WorkerID]; exists {
status.CurrentLoad--
}
}
// Add to history
var duration time.Duration
if task.StartedAt != nil {
duration = now.Sub(*task.StartedAt)
}
entry := MinimalTaskHistoryEntry{
TaskID: task.ID,
TaskType: task.Type,
VolumeID: task.VolumeID,
WorkerID: task.WorkerID,
Status: task.Status,
StartedAt: *task.StartedAt,
CompletedAt: now,
Duration: duration,
ErrorMessage: errorMessage,
}
as.taskHistory = append(as.taskHistory, entry)
return nil
}
// UpdateWorkerHeartbeat updates worker heartbeat
func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
as.mutex.Lock()
defer as.mutex.Unlock()
worker, exists := as.workers[workerID]
if !exists {
return fmt.Errorf("worker %s not found", workerID)
}
worker.LastHeartbeat = time.Now()
as.workerStatus[workerID] = status
return nil
}
// GetSystemStats returns system statistics
func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats {
as.mutex.RLock()
defer as.mutex.RUnlock()
activeWorkers := 0
for _, status := range as.workerStatus {
if status.Status == "active" {
activeWorkers++
}
}
return &MinimalSystemStats{
ActiveTasks: len(as.activeTasks),
QueuedTasks: len(as.taskQueue),
ActiveWorkers: activeWorkers,
TotalTasks: len(as.tasks),
}
}
// GetQueuedTaskCount returns the number of queued tasks
func (as *MinimalAdminServer) GetQueuedTaskCount() int {
as.mutex.RLock()
defer as.mutex.RUnlock()
return len(as.taskQueue)
}
// GetActiveTaskCount returns the number of active tasks
func (as *MinimalAdminServer) GetActiveTaskCount() int {
as.mutex.RLock()
defer as.mutex.RUnlock()
return len(as.activeTasks)
}
// GetTaskHistory returns task history
func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry {
as.mutex.RLock()
defer as.mutex.RUnlock()
// Return a copy of the history
history := make([]MinimalTaskHistoryEntry, len(as.taskHistory))
copy(history, as.taskHistory)
return history
}

434
weed/admin/task/minimal_integration_test.go

@ -1,434 +0,0 @@
package task
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation
func TestMinimalIntegration(t *testing.T) {
t.Logf("Starting minimal integration test")
// Step 1: Create a minimal admin server configuration
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
// Step 2: Create minimal admin server with nil master client (for testing)
adminServer := NewMinimalAdminServer(config, nil)
// Step 3: Start admin server
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Step 4: Test worker registration
t.Logf("Testing worker registration")
worker := &types.Worker{
ID: "test-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
t.Logf("Successfully registered worker %s", worker.ID)
// Step 5: Test task queueing
t.Logf("Testing task queueing")
task := &types.Task{
ID: "test-task-1",
Type: types.TaskTypeVacuum,
VolumeID: 1001,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Fatalf("Failed to queue task: %v", err)
}
t.Logf("Successfully queued task %s", task.ID)
// Step 6: Test task request by worker
t.Logf("Testing task request")
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Fatalf("Failed to request task: %v", err)
}
if assignedTask != nil {
t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
// Step 7: Test task progress updates
t.Logf("Testing task progress updates")
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0)
if err != nil {
t.Errorf("Failed to update task progress to 25%%: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
if err != nil {
t.Errorf("Failed to update task progress to 50%%: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0)
if err != nil {
t.Errorf("Failed to update task progress to 75%%: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update task progress to 100%%: %v", err)
}
// Step 8: Test task completion
t.Logf("Testing task completion")
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("Successfully completed task %s", assignedTask.ID)
} else {
t.Logf("No task was assigned (queue might be empty)")
}
// Step 9: Test basic metrics
t.Logf("Testing basic metrics")
stats := adminServer.GetSystemStats()
if stats != nil {
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
}
queuedCount := adminServer.GetQueuedTaskCount()
activeCount := adminServer.GetActiveTaskCount()
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
// Step 10: Test task history
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d entries", len(history))
if len(history) > 0 {
lastEntry := history[len(history)-1]
t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v",
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration)
}
t.Logf("Minimal integration test completed successfully")
}
// TestMinimalWorkerHeartbeat tests worker heartbeat functionality
func TestMinimalWorkerHeartbeat(t *testing.T) {
t.Logf("Testing minimal worker heartbeat")
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Register a worker
worker := &types.Worker{
ID: "heartbeat-worker",
Address: "localhost:9002",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
// Test heartbeat update
status := &types.WorkerStatus{
Status: "active",
CurrentLoad: 0,
}
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
if err != nil {
t.Errorf("Failed to update worker heartbeat: %v", err)
}
t.Logf("Minimal worker heartbeat test completed successfully")
}
// TestMinimalTaskQueueOperations tests task queue operations
func TestMinimalTaskQueueOperations(t *testing.T) {
t.Logf("Testing minimal task queue operations")
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Test queuing multiple tasks
taskCount := 3
for i := 0; i < taskCount; i++ {
task := &types.Task{
ID: fmt.Sprintf("queue-test-task-%d", i),
Type: types.TaskTypeVacuum,
VolumeID: uint32(2000 + i),
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Errorf("Failed to queue task %d: %v", i, err)
}
}
// Check queue size
queuedCount := adminServer.GetQueuedTaskCount()
if queuedCount != taskCount {
t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount)
}
t.Logf("Minimal task queue operations test completed successfully")
}
// TestMinimalFullWorkflow tests the complete workflow from task creation to completion
func TestMinimalFullWorkflow(t *testing.T) {
t.Logf("Testing minimal full workflow")
config := &MinimalAdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewMinimalAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Register multiple workers with different capabilities
workers := []*types.Worker{
{
ID: "vacuum-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
{
ID: "ec-worker-1",
Address: "localhost:9002",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
{
ID: "multi-worker-1",
Address: "localhost:9003",
Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding},
MaxConcurrent: 3,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
},
}
for _, worker := range workers {
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker %s: %v", worker.ID, err)
}
t.Logf("Registered worker %s with capabilities %v", worker.ID, worker.Capabilities)
}
// Create multiple tasks of different types
tasks := []*types.Task{
{
ID: "vacuum-task-1",
Type: types.TaskTypeVacuum,
VolumeID: 3001,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.4",
},
CreatedAt: time.Now(),
},
{
ID: "ec-task-1",
Type: types.TaskTypeErasureCoding,
VolumeID: 3002,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityHigh,
Parameters: map[string]interface{}{
"shard_count": "14",
},
CreatedAt: time.Now(),
},
{
ID: "vacuum-task-2",
Type: types.TaskTypeVacuum,
VolumeID: 3003,
Server: "localhost:8081",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityLow,
Parameters: map[string]interface{}{
"garbage_threshold": "0.5",
},
CreatedAt: time.Now(),
},
}
for _, task := range tasks {
err = adminServer.QueueTask(task)
if err != nil {
t.Fatalf("Failed to queue task %s: %v", task.ID, err)
}
t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID)
}
// Test task assignment to different workers
t.Logf("Testing task assignments")
// Vacuum worker should get vacuum tasks
assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Errorf("Failed to request task for vacuum worker: %v", err)
} else if assignedTask != nil {
t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
}
// EC worker should get EC tasks
assignedTask, err = adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding})
if err != nil {
t.Errorf("Failed to request task for EC worker: %v", err)
} else if assignedTask != nil {
t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
}
// Multi-capability worker should be able to get any remaining task
assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
if err != nil {
t.Errorf("Failed to request task for multi worker: %v", err)
} else if assignedTask != nil {
t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type)
// Complete the task
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update progress: %v", err)
}
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
}
// Check final statistics
stats := adminServer.GetSystemStats()
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks)
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d completed tasks", len(history))
for _, entry := range history {
t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v",
entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration)
}
t.Logf("Minimal full workflow test completed successfully")
}

197
weed/admin/task/operational_integration_test.go

@ -1,197 +0,0 @@
package task
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestOperationalIntegration tests the basic admin-worker operational flow
func TestOperationalIntegration(t *testing.T) {
t.Logf("Starting operational integration test")
// Step 1: Create admin server with operational configuration
config := &AdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
// Create a nil master client for testing (simplified)
var masterClient *wdclient.MasterClient
adminServer := NewAdminServer(config, masterClient)
// Step 2: Start admin server
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Step 3: Create and register test workers
worker1 := createTestWorker("worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding})
worker2 := createTestWorker("worker-2", []types.TaskType{types.TaskTypeVacuum})
err = adminServer.RegisterWorker(worker1)
if err != nil {
t.Fatalf("Failed to register worker1: %v", err)
}
err = adminServer.RegisterWorker(worker2)
if err != nil {
t.Fatalf("Failed to register worker2: %v", err)
}
// Step 4: Test basic task queueing
t.Logf("Testing task queueing")
// Create a simple test task
testTask := &types.Task{
ID: "test-vacuum-1",
Type: types.TaskTypeVacuum,
VolumeID: 1001,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
"server": "localhost:8080",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(testTask)
if err != nil {
t.Fatalf("Failed to queue test task: %v", err)
}
t.Logf("Successfully queued test vacuum task for volume %d", testTask.VolumeID)
// Step 5: Test worker task request and assignment
t.Logf("Testing worker task requests and assignment")
// Worker requests task
task, err := adminServer.RequestTask("worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Fatalf("Failed to request task from worker: %v", err)
}
if task == nil {
t.Logf("No tasks available for assignment (this is expected in test environment)")
} else {
t.Logf("Successfully assigned task %s (%s) to worker-1", task.ID, task.Type)
// Step 6: Simulate task progress updates
t.Logf("Testing task progress updates")
err = adminServer.UpdateTaskProgress(task.ID, 25.0)
if err != nil {
t.Errorf("Failed to update task progress: %v", err)
}
err = adminServer.UpdateTaskProgress(task.ID, 50.0)
if err != nil {
t.Errorf("Failed to update task progress: %v", err)
}
err = adminServer.UpdateTaskProgress(task.ID, 100.0)
if err != nil {
t.Errorf("Failed to update task progress: %v", err)
}
// Step 7: Test task completion
t.Logf("Testing task completion")
err = adminServer.CompleteTask(task.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("Successfully completed task %s", task.ID)
}
// Step 8: Test metrics and statistics
t.Logf("Testing system metrics")
stats := adminServer.GetSystemStats()
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers)
queuedCount := adminServer.GetQueuedTaskCount()
activeCount := adminServer.GetActiveTaskCount()
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
// Step 9: Test task history
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d entries", len(history))
t.Logf("Operational integration test completed successfully")
}
func createTestWorker(id string, capabilities []types.TaskType) *types.Worker {
return &types.Worker{
ID: id,
Address: fmt.Sprintf("localhost:900%s", id[len(id)-1:]),
Capabilities: capabilities,
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
}
// TestECTaskExecution tests the EC task validation (without actual execution)
func TestECTaskExecution(t *testing.T) {
t.Logf("Testing EC task validation")
params := types.TaskParams{
VolumeID: 1002,
Server: "localhost:8080",
Collection: "test",
Parameters: map[string]interface{}{
"volume_size": int64(32 * 1024 * 1024 * 1024),
},
}
// Test that basic validation would work
if params.VolumeID == 0 {
t.Errorf("VolumeID should not be zero")
}
if params.Server == "" {
t.Errorf("Server should not be empty")
}
t.Logf("EC task validation passed")
}
// TestVacuumTaskExecution tests the vacuum task validation (without actual execution)
func TestVacuumTaskExecution(t *testing.T) {
t.Logf("Testing vacuum task validation")
params := types.TaskParams{
VolumeID: 1001,
Server: "localhost:8080",
Collection: "test",
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
"volume_size": int64(25 * 1024 * 1024 * 1024),
},
}
// Test that basic validation would work
if params.VolumeID == 0 {
t.Errorf("VolumeID should not be zero")
}
if params.Server == "" {
t.Errorf("Server should not be empty")
}
t.Logf("Vacuum task validation passed")
}

233
weed/admin/task/simple_integration_test.go

@ -1,233 +0,0 @@
package task
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies
func TestSimpleIntegration(t *testing.T) {
t.Logf("Starting simple integration test")
// Step 1: Create a minimal admin server configuration
config := &AdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
// Step 2: Create admin server with nil master client (for testing)
adminServer := NewAdminServer(config, nil)
// Step 3: Start admin server
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Step 4: Test worker registration
t.Logf("Testing worker registration")
worker := &types.Worker{
ID: "test-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
t.Logf("Successfully registered worker %s", worker.ID)
// Step 5: Test task queueing
t.Logf("Testing task queueing")
task := &types.Task{
ID: "test-task-1",
Type: types.TaskTypeVacuum,
VolumeID: 1001,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Fatalf("Failed to queue task: %v", err)
}
t.Logf("Successfully queued task %s", task.ID)
// Step 6: Test task request by worker
t.Logf("Testing task request")
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Fatalf("Failed to request task: %v", err)
}
if assignedTask != nil {
t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
// Step 7: Test task progress updates
t.Logf("Testing task progress updates")
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
if err != nil {
t.Errorf("Failed to update task progress: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update task progress: %v", err)
}
// Step 8: Test task completion
t.Logf("Testing task completion")
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("Successfully completed task %s", assignedTask.ID)
} else {
t.Logf("No task was assigned (queue might be empty)")
}
// Step 9: Test basic metrics
t.Logf("Testing basic metrics")
stats := adminServer.GetSystemStats()
if stats != nil {
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers)
}
queuedCount := adminServer.GetQueuedTaskCount()
activeCount := adminServer.GetActiveTaskCount()
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
// Step 10: Test task history
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d entries", len(history))
t.Logf("Simple integration test completed successfully")
}
// TestWorkerHeartbeat tests worker heartbeat functionality
func TestWorkerHeartbeat(t *testing.T) {
t.Logf("Testing worker heartbeat")
config := &AdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Register a worker
worker := &types.Worker{
ID: "heartbeat-worker",
Address: "localhost:9002",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
// Test heartbeat update
status := &types.WorkerStatus{
Status: "active",
CurrentLoad: 0,
}
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
if err != nil {
t.Errorf("Failed to update worker heartbeat: %v", err)
}
t.Logf("Worker heartbeat test completed successfully")
}
// TestTaskQueueOperations tests task queue operations
func TestTaskQueueOperations(t *testing.T) {
t.Logf("Testing task queue operations")
config := &AdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Test queuing multiple tasks
for i := 0; i < 3; i++ {
task := &types.Task{
ID: fmt.Sprintf("queue-test-task-%d", i),
Type: types.TaskTypeVacuum,
VolumeID: uint32(2000 + i),
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Errorf("Failed to queue task %d: %v", i, err)
}
}
// Check queue size
queuedCount := adminServer.GetQueuedTaskCount()
if queuedCount != 3 {
t.Errorf("Expected 3 queued tasks, got %d", queuedCount)
}
t.Logf("Task queue operations test completed successfully")
}

604
weed/admin/task/simulation.go

@ -1,604 +0,0 @@
package task
import (
"context"
"fmt"
"math/rand"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TaskSimulator provides a comprehensive simulation framework for testing the task distribution system
type TaskSimulator struct {
adminServer *AdminServer
mockWorkers []*MockWorker
mockMaster *MockMasterClient
scenarios map[string]*SimulationScenario
results map[string]*SimulationResult
mutex sync.RWMutex
}
// SimulationScenario defines a test scenario
type SimulationScenario struct {
Name string
Description string
WorkerCount int
VolumeCount int
Duration time.Duration
FailurePatterns []*FailurePattern
TestCases []*TestCase
}
// FailurePattern defines how failures occur during simulation
type FailurePattern struct {
Type FailureType
Probability float64 // 0.0 to 1.0
Timing *TimingSpec // When during task execution
Duration time.Duration
Details string
}
// TestCase defines specific test scenarios
type TestCase struct {
Name string
VolumeID uint32
TaskType types.TaskType
ExpectedOutcome string
FailureToInject *FailurePattern
}
// FailureType represents different types of failures
type FailureType string
const (
FailureWorkerTimeout FailureType = "worker_timeout"
FailureTaskStuck FailureType = "task_stuck"
FailureTaskCrash FailureType = "task_crash"
FailureDuplicate FailureType = "duplicate_task"
FailureResourceExhaust FailureType = "resource_exhaustion"
FailureNetworkPartition FailureType = "network_partition"
)
// TimingSpec defines when a failure occurs
type TimingSpec struct {
MinProgress float64 // Minimum progress before failure can occur
MaxProgress float64 // Maximum progress before failure must occur
Delay time.Duration // Fixed delay before failure
}
// SimulationResult tracks the results of a simulation
type SimulationResult struct {
ScenarioName string
StartTime time.Time
EndTime time.Time
Duration time.Duration
TasksCreated int
TasksCompleted int
TasksFailed int
TasksStuck int
WorkerTimeouts int
DuplicatesFound int
StateInconsistencies int
Errors []string
Warnings []string
Success bool
}
// MockWorker simulates a worker with controllable behavior
type MockWorker struct {
ID string
Capabilities []types.TaskType
MaxConcurrent int
CurrentTasks map[string]*MockTask
Status string
FailureMode *FailurePattern
mutex sync.Mutex
}
// MockTask represents a simulated task execution
type MockTask struct {
Task *types.Task
StartTime time.Time
Progress float64
Stuck bool
Failed bool
Completed bool
}
// MockMasterClient simulates master server interactions
type MockMasterClient struct {
volumes map[uint32]*VolumeInfo
inconsistency bool
mutex sync.RWMutex
}
// NewTaskSimulator creates a new task simulator
func NewTaskSimulator() *TaskSimulator {
return &TaskSimulator{
scenarios: make(map[string]*SimulationScenario),
results: make(map[string]*SimulationResult),
}
}
// RegisterScenario registers a simulation scenario
func (ts *TaskSimulator) RegisterScenario(scenario *SimulationScenario) {
ts.mutex.Lock()
defer ts.mutex.Unlock()
ts.scenarios[scenario.Name] = scenario
glog.Infof("Registered simulation scenario: %s", scenario.Name)
}
// RunScenario executes a simulation scenario
func (ts *TaskSimulator) RunScenario(scenarioName string) (*SimulationResult, error) {
ts.mutex.RLock()
scenario, exists := ts.scenarios[scenarioName]
ts.mutex.RUnlock()
if !exists {
return nil, fmt.Errorf("scenario %s not found", scenarioName)
}
glog.Infof("Starting simulation scenario: %s", scenarioName)
result := &SimulationResult{
ScenarioName: scenarioName,
StartTime: time.Now(),
Errors: make([]string, 0),
Warnings: make([]string, 0),
}
// Setup simulation environment
if err := ts.setupEnvironment(scenario); err != nil {
return nil, fmt.Errorf("failed to setup environment: %v", err)
}
// Execute test cases
ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration)
defer cancel()
ts.executeScenario(ctx, scenario, result)
// Cleanup
ts.cleanup()
result.EndTime = time.Now()
result.Duration = result.EndTime.Sub(result.StartTime)
result.Success = len(result.Errors) == 0
ts.mutex.Lock()
ts.results[scenarioName] = result
ts.mutex.Unlock()
glog.Infof("Completed simulation scenario: %s (success: %v)", scenarioName, result.Success)
return result, nil
}
// setupEnvironment prepares the simulation environment
func (ts *TaskSimulator) setupEnvironment(scenario *SimulationScenario) error {
// Create mock master client
ts.mockMaster = &MockMasterClient{
volumes: make(map[uint32]*VolumeInfo),
}
// Generate mock volumes
for i := uint32(1); i <= uint32(scenario.VolumeCount); i++ {
volume := &VolumeInfo{
ID: i,
Size: uint64(rand.Intn(30 * 1024 * 1024 * 1024)), // Random size up to 30GB
Collection: fmt.Sprintf("collection_%d", (i%3)+1),
DeletedByteCount: uint64(rand.Intn(1024 * 1024 * 1024)), // Random garbage
ReadOnly: false,
Server: fmt.Sprintf("server_%d", (i%6)+1),
ModifiedAtSecond: time.Now().Add(-time.Duration(rand.Intn(86400)) * time.Second).Unix(),
}
ts.mockMaster.volumes[i] = volume
}
// Create mock workers
ts.mockWorkers = make([]*MockWorker, scenario.WorkerCount)
for i := 0; i < scenario.WorkerCount; i++ {
worker := &MockWorker{
ID: fmt.Sprintf("worker_%d", i+1),
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
MaxConcurrent: 2,
CurrentTasks: make(map[string]*MockTask),
Status: "active",
}
// Apply failure patterns
if i < len(scenario.FailurePatterns) {
worker.FailureMode = scenario.FailurePatterns[i]
}
ts.mockWorkers[i] = worker
}
// Initialize admin server (simplified for simulation)
config := DefaultAdminConfig()
config.ScanInterval = 10 * time.Second
config.TaskTimeout = 30 * time.Second
// Note: In a real implementation, this would use the actual master client
// For simulation, we'd need to inject our mock
return nil
}
// executeScenario runs the actual simulation scenario
func (ts *TaskSimulator) executeScenario(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
// Execute each test case
for _, testCase := range scenario.TestCases {
ts.executeTestCase(ctx, testCase, result)
}
// Run continuous simulation for remaining duration
ts.runContinuousSimulation(ctx, scenario, result)
}
// executeTestCase runs a specific test case
func (ts *TaskSimulator) executeTestCase(ctx context.Context, testCase *TestCase, result *SimulationResult) {
glog.V(1).Infof("Executing test case: %s", testCase.Name)
// Create task for the test case
task := &types.Task{
ID: fmt.Sprintf("test_%s_%d", testCase.Name, time.Now().UnixNano()),
Type: testCase.TaskType,
VolumeID: testCase.VolumeID,
Priority: types.TaskPriorityNormal,
CreatedAt: time.Now(),
}
result.TasksCreated++
// Assign to worker
worker := ts.selectWorkerForTask(task)
if worker == nil {
result.Errors = append(result.Errors, fmt.Sprintf("No available worker for test case %s", testCase.Name))
return
}
// Execute task with potential failure injection
ts.executeTaskOnWorker(ctx, task, worker, testCase.FailureToInject, result)
}
// runContinuousSimulation runs ongoing simulation
func (ts *TaskSimulator) runContinuousSimulation(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
ts.simulateOngoingTasks(result)
ts.checkForInconsistencies(result)
}
}
}
// executeTaskOnWorker simulates task execution on a worker
func (ts *TaskSimulator) executeTaskOnWorker(ctx context.Context, task *types.Task, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
worker.mutex.Lock()
defer worker.mutex.Unlock()
mockTask := &MockTask{
Task: task,
StartTime: time.Now(),
Progress: 0.0,
}
worker.CurrentTasks[task.ID] = mockTask
// Simulate task execution
go ts.simulateTaskExecution(ctx, mockTask, worker, failurePattern, result)
}
// simulateTaskExecution simulates the execution of a single task
func (ts *TaskSimulator) simulateTaskExecution(ctx context.Context, mockTask *MockTask, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
defer func() {
worker.mutex.Lock()
delete(worker.CurrentTasks, mockTask.Task.ID)
worker.mutex.Unlock()
}()
duration := 20 * time.Second // Base task duration
progressTicker := time.NewTicker(time.Second)
defer progressTicker.Stop()
startTime := time.Now()
for {
select {
case <-ctx.Done():
return
case <-progressTicker.C:
elapsed := time.Since(startTime)
progress := float64(elapsed) / float64(duration) * 100.0
if progress >= 100.0 {
mockTask.Completed = true
result.TasksCompleted++
glog.V(2).Infof("Task %s completed successfully", mockTask.Task.ID)
return
}
mockTask.Progress = progress
// Check for failure injection
if failurePattern != nil && ts.shouldInjectFailure(failurePattern, progress, elapsed) {
ts.injectFailure(mockTask, worker, failurePattern, result)
return
}
// Check for worker failure mode
if worker.FailureMode != nil && ts.shouldInjectFailure(worker.FailureMode, progress, elapsed) {
ts.injectFailure(mockTask, worker, worker.FailureMode, result)
return
}
}
}
}
// shouldInjectFailure determines if a failure should be injected
func (ts *TaskSimulator) shouldInjectFailure(pattern *FailurePattern, progress float64, elapsed time.Duration) bool {
if pattern.Timing != nil {
if progress < pattern.Timing.MinProgress || progress > pattern.Timing.MaxProgress {
return false
}
if elapsed < pattern.Timing.Delay {
return false
}
}
return rand.Float64() < pattern.Probability
}
// injectFailure simulates a failure
func (ts *TaskSimulator) injectFailure(mockTask *MockTask, worker *MockWorker, pattern *FailurePattern, result *SimulationResult) {
glog.Warningf("Injecting failure: %s for task %s", pattern.Type, mockTask.Task.ID)
switch pattern.Type {
case FailureWorkerTimeout:
worker.Status = "timeout"
result.WorkerTimeouts++
case FailureTaskStuck:
mockTask.Stuck = true
result.TasksStuck++
case FailureTaskCrash:
mockTask.Failed = true
result.TasksFailed++
case FailureDuplicate:
result.DuplicatesFound++
case FailureResourceExhaust:
worker.Status = "resource_exhausted"
result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s resource exhausted", worker.ID))
case FailureNetworkPartition:
worker.Status = "partitioned"
result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s network partitioned", worker.ID))
}
}
// selectWorkerForTask selects an available worker for a task
func (ts *TaskSimulator) selectWorkerForTask(task *types.Task) *MockWorker {
for _, worker := range ts.mockWorkers {
if worker.Status == "active" && len(worker.CurrentTasks) < worker.MaxConcurrent {
// Check capabilities
for _, capability := range worker.Capabilities {
if capability == task.Type {
return worker
}
}
}
}
return nil
}
// simulateOngoingTasks handles ongoing task simulation
func (ts *TaskSimulator) simulateOngoingTasks(result *SimulationResult) {
// Create random new tasks
if rand.Float64() < 0.3 { // 30% chance to create new task every tick
taskType := types.TaskTypeVacuum
if rand.Float64() < 0.5 {
taskType = types.TaskTypeErasureCoding
}
task := &types.Task{
ID: fmt.Sprintf("auto_%d", time.Now().UnixNano()),
Type: taskType,
VolumeID: uint32(rand.Intn(len(ts.mockMaster.volumes)) + 1),
Priority: types.TaskPriorityNormal,
CreatedAt: time.Now(),
}
result.TasksCreated++
worker := ts.selectWorkerForTask(task)
if worker != nil {
ts.executeTaskOnWorker(context.Background(), task, worker, nil, result)
}
}
}
// checkForInconsistencies checks for state inconsistencies
func (ts *TaskSimulator) checkForInconsistencies(result *SimulationResult) {
// Check for volume reservation inconsistencies
// Check for duplicate tasks
// Check for orphaned tasks
// This would be more comprehensive in a real implementation
for _, worker := range ts.mockWorkers {
worker.mutex.Lock()
for taskID, mockTask := range worker.CurrentTasks {
if mockTask.Stuck && time.Since(mockTask.StartTime) > 60*time.Second {
result.StateInconsistencies++
result.Warnings = append(result.Warnings, fmt.Sprintf("Long-running stuck task detected: %s", taskID))
}
}
worker.mutex.Unlock()
}
}
// cleanup cleans up simulation resources
func (ts *TaskSimulator) cleanup() {
ts.mockWorkers = nil
ts.mockMaster = nil
}
// GetSimulationResults returns all simulation results
func (ts *TaskSimulator) GetSimulationResults() map[string]*SimulationResult {
ts.mutex.RLock()
defer ts.mutex.RUnlock()
results := make(map[string]*SimulationResult)
for k, v := range ts.results {
results[k] = v
}
return results
}
// CreateStandardScenarios creates a set of standard test scenarios
func (ts *TaskSimulator) CreateStandardScenarios() {
// Scenario 1: Worker Timeout During EC
ts.RegisterScenario(&SimulationScenario{
Name: "worker_timeout_during_ec",
Description: "Test worker timeout during erasure coding operation",
WorkerCount: 3,
VolumeCount: 10,
Duration: 2 * time.Minute,
FailurePatterns: []*FailurePattern{
{
Type: FailureWorkerTimeout,
Probability: 1.0,
Timing: &TimingSpec{
MinProgress: 50.0,
MaxProgress: 60.0,
},
},
},
TestCases: []*TestCase{
{
Name: "ec_timeout_test",
VolumeID: 1,
TaskType: types.TaskTypeErasureCoding,
ExpectedOutcome: "task_reassigned",
},
},
})
// Scenario 2: Stuck Vacuum Task
ts.RegisterScenario(&SimulationScenario{
Name: "stuck_vacuum_task",
Description: "Test stuck vacuum task detection and cleanup",
WorkerCount: 2,
VolumeCount: 5,
Duration: 90 * time.Second,
TestCases: []*TestCase{
{
Name: "vacuum_stuck_test",
VolumeID: 2,
TaskType: types.TaskTypeVacuum,
FailureToInject: &FailurePattern{
Type: FailureTaskStuck,
Probability: 1.0,
Timing: &TimingSpec{
MinProgress: 75.0,
MaxProgress: 80.0,
},
},
ExpectedOutcome: "task_timeout_detected",
},
},
})
// Scenario 3: Duplicate Task Prevention
ts.RegisterScenario(&SimulationScenario{
Name: "duplicate_task_prevention",
Description: "Test duplicate task detection and prevention",
WorkerCount: 4,
VolumeCount: 8,
Duration: 60 * time.Second,
TestCases: []*TestCase{
{
Name: "duplicate_ec_test_1",
VolumeID: 3,
TaskType: types.TaskTypeErasureCoding,
},
{
Name: "duplicate_ec_test_2", // Same volume, should be detected as duplicate
VolumeID: 3,
TaskType: types.TaskTypeErasureCoding,
FailureToInject: &FailurePattern{
Type: FailureDuplicate,
Probability: 1.0,
},
ExpectedOutcome: "duplicate_detected",
},
},
})
// Scenario 4: Master-Admin State Divergence
ts.RegisterScenario(&SimulationScenario{
Name: "master_admin_divergence",
Description: "Test state reconciliation between master and admin server",
WorkerCount: 3,
VolumeCount: 15,
Duration: 2 * time.Minute,
TestCases: []*TestCase{
{
Name: "state_reconciliation_test",
VolumeID: 4,
TaskType: types.TaskTypeErasureCoding,
ExpectedOutcome: "state_reconciled",
},
},
})
}
// GenerateSimulationReport creates a comprehensive report of simulation results
func (ts *TaskSimulator) GenerateSimulationReport() string {
ts.mutex.RLock()
defer ts.mutex.RUnlock()
report := "# Task Distribution System Simulation Report\n\n"
for scenarioName, result := range ts.results {
report += fmt.Sprintf("## Scenario: %s\n", scenarioName)
report += fmt.Sprintf("- **Duration**: %v\n", result.Duration)
report += fmt.Sprintf("- **Success**: %v\n", result.Success)
report += fmt.Sprintf("- **Tasks Created**: %d\n", result.TasksCreated)
report += fmt.Sprintf("- **Tasks Completed**: %d\n", result.TasksCompleted)
report += fmt.Sprintf("- **Tasks Failed**: %d\n", result.TasksFailed)
report += fmt.Sprintf("- **Tasks Stuck**: %d\n", result.TasksStuck)
report += fmt.Sprintf("- **Worker Timeouts**: %d\n", result.WorkerTimeouts)
report += fmt.Sprintf("- **Duplicates Found**: %d\n", result.DuplicatesFound)
report += fmt.Sprintf("- **State Inconsistencies**: %d\n", result.StateInconsistencies)
if len(result.Errors) > 0 {
report += "- **Errors**:\n"
for _, err := range result.Errors {
report += fmt.Sprintf(" - %s\n", err)
}
}
if len(result.Warnings) > 0 {
report += "- **Warnings**:\n"
for _, warning := range result.Warnings {
report += fmt.Sprintf(" - %s\n", warning)
}
}
report += "\n"
}
return report
}

695
weed/admin/task/simulation/comprehensive_simulation.go

@ -1,695 +0,0 @@
package simulation
import (
"context"
"fmt"
"math/rand"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/admin/task"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// ComprehensiveSimulator tests all possible edge cases in volume/shard state management
type ComprehensiveSimulator struct {
stateManager *task.VolumeStateManager
mockMaster *MockMasterServer
mockWorkers []*MockWorker
scenarios []*StateTestScenario
currentScenario *StateTestScenario
results *SimulationResults
eventLog []*SimulationEvent
mutex sync.RWMutex
}
// StateTestScenario represents a specific state management test case
type StateTestScenario struct {
Name string
Description string
InitialState *ClusterState
EventSequence []*SimulationEvent
ExpectedFinalState *ClusterState
InconsistencyChecks []*InconsistencyCheck
Duration time.Duration
}
// ClusterState represents the complete state of the cluster
type ClusterState struct {
Volumes map[uint32]*task.VolumeInfo
ECShards map[uint32]map[int]*task.ShardInfo
ServerCapacity map[string]*task.CapacityInfo
InProgressTasks map[string]*task.TaskImpact
Timestamp time.Time
}
// SimulationEvent represents an event that can occur during simulation
type SimulationEvent struct {
Type EventType
Timestamp time.Time
VolumeID uint32
ShardID *int
Server string
TaskID string
Parameters map[string]interface{}
Description string
}
// EventType represents different types of simulation events
type EventType string
const (
// Volume events
EventVolumeCreated EventType = "volume_created"
EventVolumeDeleted EventType = "volume_deleted"
EventVolumeSizeChanged EventType = "volume_size_changed"
EventVolumeReadOnly EventType = "volume_readonly"
// Shard events
EventShardCreated EventType = "shard_created"
EventShardDeleted EventType = "shard_deleted"
EventShardMoved EventType = "shard_moved"
EventShardCorrupted EventType = "shard_corrupted"
// Task events
EventTaskStarted EventType = "task_started"
EventTaskCompleted EventType = "task_completed"
EventTaskFailed EventType = "task_failed"
EventTaskStuck EventType = "task_stuck"
EventTaskCancelled EventType = "task_cancelled"
// Worker events
EventWorkerJoined EventType = "worker_joined"
EventWorkerLeft EventType = "worker_left"
EventWorkerTimeout EventType = "worker_timeout"
EventWorkerRestarted EventType = "worker_restarted"
// Master events
EventMasterSync EventType = "master_sync"
EventMasterInconsistent EventType = "master_inconsistent"
EventMasterPartitioned EventType = "master_partitioned"
EventMasterReconnected EventType = "master_reconnected"
// Network events
EventNetworkPartition EventType = "network_partition"
EventNetworkHealed EventType = "network_healed"
EventMessageDelayed EventType = "message_delayed"
EventMessageLost EventType = "message_lost"
)
// InconsistencyCheck defines what inconsistencies to check for
type InconsistencyCheck struct {
Name string
Type task.InconsistencyType
ExpectedCount int
MaxAllowedCount int
SeverityThreshold task.SeverityLevel
}
// MockMasterServer simulates master server behavior with controllable inconsistencies
type MockMasterServer struct {
volumes map[uint32]*task.VolumeInfo
ecShards map[uint32]map[int]*task.ShardInfo
serverCapacity map[string]*task.CapacityInfo
inconsistencyMode bool
networkPartitioned bool
responseDelay time.Duration
mutex sync.RWMutex
}
// MockWorker represents a mock worker for testing
type MockWorker struct {
ID string
Capabilities []types.TaskType
IsActive bool
TaskDelay time.Duration
FailureRate float64
}
// SimulationResults tracks comprehensive simulation results
type SimulationResults struct {
ScenarioName string
StartTime time.Time
EndTime time.Time
Duration time.Duration
TotalEvents int
EventsByType map[EventType]int
InconsistenciesFound map[task.InconsistencyType]int
TasksExecuted int
TasksSucceeded int
TasksFailed int
StateValidationsPassed int
StateValidationsFailed int
CriticalErrors []string
Warnings []string
DetailedLog []string
Success bool
}
// NewComprehensiveSimulator creates a new comprehensive simulator
func NewComprehensiveSimulator() *ComprehensiveSimulator {
return &ComprehensiveSimulator{
stateManager: task.NewVolumeStateManager(nil),
mockMaster: NewMockMasterServer(),
scenarios: []*StateTestScenario{},
eventLog: []*SimulationEvent{},
results: &SimulationResults{
EventsByType: make(map[EventType]int),
InconsistenciesFound: make(map[task.InconsistencyType]int),
CriticalErrors: []string{},
Warnings: []string{},
DetailedLog: []string{},
},
}
}
// CreateComprehensiveScenarios creates all possible edge case scenarios
func (cs *ComprehensiveSimulator) CreateComprehensiveScenarios() {
cs.scenarios = []*StateTestScenario{
cs.createVolumeCreationDuringTaskScenario(),
cs.createVolumeDeletionDuringTaskScenario(),
cs.createShardCreationRaceConditionScenario(),
cs.createMasterSyncDuringTaskScenario(),
cs.createNetworkPartitionScenario(),
cs.createWorkerFailureDuringECScenario(),
cs.createConcurrentTasksScenario(),
cs.createCapacityOverflowScenario(),
cs.createShardCorruptionScenario(),
cs.createMasterInconsistencyScenario(),
cs.createTaskOrphanScenario(),
cs.createDuplicateTaskDetectionScenario(),
cs.createVolumeStateRollbackScenario(),
cs.createComplexECOperationScenario(),
cs.createHighLoadStressTestScenario(),
}
glog.Infof("Created %d comprehensive test scenarios", len(cs.scenarios))
}
// RunAllComprehensiveScenarios runs all edge case scenarios
func (cs *ComprehensiveSimulator) RunAllComprehensiveScenarios() (*SimulationResults, error) {
glog.Infof("Starting comprehensive state management simulation")
cs.results.StartTime = time.Now()
for _, scenario := range cs.scenarios {
glog.Infof("Running scenario: %s", scenario.Name)
if err := cs.RunScenario(scenario); err != nil {
cs.results.CriticalErrors = append(cs.results.CriticalErrors,
fmt.Sprintf("Scenario %s failed: %v", scenario.Name, err))
}
// Brief pause between scenarios
time.Sleep(1 * time.Second)
}
cs.results.EndTime = time.Now()
cs.results.Duration = cs.results.EndTime.Sub(cs.results.StartTime)
cs.results.Success = len(cs.results.CriticalErrors) == 0
cs.generateDetailedReport()
glog.Infof("Comprehensive simulation completed: %v", cs.results.Success)
return cs.results, nil
}
// Scenario creation methods
func (cs *ComprehensiveSimulator) createVolumeCreationDuringTaskScenario() *StateTestScenario {
return &StateTestScenario{
Name: "volume_creation_during_task",
Description: "Tests state consistency when master reports new volume while task is creating it",
InitialState: &ClusterState{
Volumes: make(map[uint32]*task.VolumeInfo),
ECShards: make(map[uint32]map[int]*task.ShardInfo),
},
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}},
{Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}},
{Type: EventMasterSync},
{Type: EventTaskCompleted, TaskID: "create_task_1"},
},
ExpectedFinalState: &ClusterState{
Volumes: map[uint32]*task.VolumeInfo{
1: {ID: 1, Size: 1024 * 1024 * 1024},
},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0},
},
Duration: 30 * time.Second,
}
}
func (cs *ComprehensiveSimulator) createVolumeDeletionDuringTaskScenario() *StateTestScenario {
return &StateTestScenario{
Name: "volume_deletion_during_task",
Description: "Tests handling when volume is deleted while task is working on it",
InitialState: &ClusterState{
Volumes: map[uint32]*task.VolumeInfo{
1: {ID: 1, Size: 1024 * 1024 * 1024},
},
},
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}},
{Type: EventVolumeDeleted, VolumeID: 1},
{Type: EventMasterSync},
{Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "Missing volume detected", Type: task.InconsistencyVolumeMissing, ExpectedCount: 1},
},
Duration: 30 * time.Second,
}
}
func (cs *ComprehensiveSimulator) createShardCreationRaceConditionScenario() *StateTestScenario {
return &StateTestScenario{
Name: "shard_creation_race_condition",
Description: "Tests race condition between EC task creating shards and master sync",
InitialState: &ClusterState{
Volumes: map[uint32]*task.VolumeInfo{
1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC
},
},
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}},
// Simulate shards being created one by one
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
{Type: EventMasterSync}, // Master sync happens while shards are being created
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"},
{Type: EventTaskCompleted, TaskID: "ec_task_1"},
{Type: EventMasterSync},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0},
},
Duration: 45 * time.Second,
}
}
func (cs *ComprehensiveSimulator) createNetworkPartitionScenario() *StateTestScenario {
return &StateTestScenario{
Name: "network_partition_recovery",
Description: "Tests state consistency during and after network partitions",
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"},
{Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "30s"}},
{Type: EventVolumeCreated, VolumeID: 2}, // Created during partition
{Type: EventNetworkHealed},
{Type: EventMasterReconnected},
{Type: EventMasterSync},
{Type: EventTaskCompleted, TaskID: "partition_task_1"},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1},
},
Duration: 60 * time.Second,
}
}
func (cs *ComprehensiveSimulator) createConcurrentTasksScenario() *StateTestScenario {
return &StateTestScenario{
Name: "concurrent_tasks_capacity_tracking",
Description: "Tests capacity tracking with multiple concurrent tasks",
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"},
{Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"},
{Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"},
{Type: EventMasterSync},
{Type: EventTaskCompleted, TaskID: "vacuum_task_1"},
{Type: EventTaskCompleted, TaskID: "ec_task_1"},
{Type: EventTaskCompleted, TaskID: "ec_task_2"},
{Type: EventMasterSync},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0},
},
Duration: 90 * time.Second,
}
}
func (cs *ComprehensiveSimulator) createComplexECOperationScenario() *StateTestScenario {
return &StateTestScenario{
Name: "complex_ec_operation",
Description: "Tests complex EC operations with shard movements and rebuilds",
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"},
// Create all 14 shards
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
// ... more shards
{Type: EventTaskCompleted, TaskID: "ec_encode_1"},
{Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)},
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt
{Type: EventTaskCompleted, TaskID: "ec_rebuild_1"},
{Type: EventMasterSync},
},
Duration: 120 * time.Second,
}
}
func (cs *ComprehensiveSimulator) createHighLoadStressTestScenario() *StateTestScenario {
events := []*SimulationEvent{}
// Create 100 concurrent tasks
for i := 0; i < 100; i++ {
events = append(events, &SimulationEvent{
Type: EventTaskStarted,
VolumeID: uint32(i + 1),
TaskID: fmt.Sprintf("stress_task_%d", i),
})
}
// Add master syncs throughout
for i := 0; i < 10; i++ {
events = append(events, &SimulationEvent{
Type: EventMasterSync,
})
}
// Complete all tasks
for i := 0; i < 100; i++ {
events = append(events, &SimulationEvent{
Type: EventTaskCompleted,
TaskID: fmt.Sprintf("stress_task_%d", i),
})
}
return &StateTestScenario{
Name: "high_load_stress_test",
Description: "Tests system under high load with many concurrent operations",
EventSequence: events,
Duration: 5 * time.Minute,
}
}
// Add more scenario creation methods...
func (cs *ComprehensiveSimulator) createMasterSyncDuringTaskScenario() *StateTestScenario {
return &StateTestScenario{Name: "master_sync_during_task", Description: "Test", Duration: 30 * time.Second}
}
func (cs *ComprehensiveSimulator) createWorkerFailureDuringECScenario() *StateTestScenario {
return &StateTestScenario{Name: "worker_failure_during_ec", Description: "Test", Duration: 30 * time.Second}
}
func (cs *ComprehensiveSimulator) createCapacityOverflowScenario() *StateTestScenario {
return &StateTestScenario{Name: "capacity_overflow", Description: "Test", Duration: 30 * time.Second}
}
func (cs *ComprehensiveSimulator) createShardCorruptionScenario() *StateTestScenario {
return &StateTestScenario{Name: "shard_corruption", Description: "Test", Duration: 30 * time.Second}
}
func (cs *ComprehensiveSimulator) createMasterInconsistencyScenario() *StateTestScenario {
return &StateTestScenario{Name: "master_inconsistency", Description: "Test", Duration: 30 * time.Second}
}
func (cs *ComprehensiveSimulator) createTaskOrphanScenario() *StateTestScenario {
return &StateTestScenario{Name: "task_orphan", Description: "Test", Duration: 30 * time.Second}
}
func (cs *ComprehensiveSimulator) createDuplicateTaskDetectionScenario() *StateTestScenario {
return &StateTestScenario{Name: "duplicate_task_detection", Description: "Test", Duration: 30 * time.Second}
}
func (cs *ComprehensiveSimulator) createVolumeStateRollbackScenario() *StateTestScenario {
return &StateTestScenario{Name: "volume_state_rollback", Description: "Test", Duration: 30 * time.Second}
}
// RunScenario executes a single test scenario
func (cs *ComprehensiveSimulator) RunScenario(scenario *StateTestScenario) error {
cs.mutex.Lock()
cs.currentScenario = scenario
cs.mutex.Unlock()
glog.V(1).Infof("Setting up scenario: %s", scenario.Name)
// Setup initial state
if err := cs.setupInitialState(scenario.InitialState); err != nil {
return fmt.Errorf("failed to setup initial state: %v", err)
}
// Execute event sequence
ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration)
defer cancel()
for _, event := range scenario.EventSequence {
select {
case <-ctx.Done():
return fmt.Errorf("scenario timed out")
default:
if err := cs.executeEvent(event); err != nil {
cs.results.Warnings = append(cs.results.Warnings,
fmt.Sprintf("Event execution warning in %s: %v", scenario.Name, err))
}
cs.logEvent(event)
}
// Small delay between events
time.Sleep(100 * time.Millisecond)
}
// Validate final state
if err := cs.validateFinalState(scenario); err != nil {
cs.results.StateValidationsFailed++
return fmt.Errorf("final state validation failed: %v", err)
} else {
cs.results.StateValidationsPassed++
}
glog.V(1).Infof("Scenario %s completed successfully", scenario.Name)
return nil
}
// executeEvent executes a single simulation event
func (cs *ComprehensiveSimulator) executeEvent(event *SimulationEvent) error {
cs.results.TotalEvents++
cs.results.EventsByType[event.Type]++
switch event.Type {
case EventTaskStarted:
return cs.simulateTaskStart(event)
case EventTaskCompleted:
return cs.simulateTaskCompletion(event)
case EventVolumeCreated:
return cs.simulateVolumeCreation(event)
case EventVolumeDeleted:
return cs.simulateVolumeDeletion(event)
case EventShardCreated:
return cs.simulateShardCreation(event)
case EventMasterSync:
return cs.simulateMasterSync(event)
case EventNetworkPartition:
return cs.simulateNetworkPartition(event)
default:
return nil // Unsupported event type
}
}
// Event simulation methods
func (cs *ComprehensiveSimulator) simulateTaskStart(event *SimulationEvent) error {
taskType, _ := event.Parameters["type"].(string)
impact := &task.TaskImpact{
TaskID: event.TaskID,
TaskType: types.TaskType(taskType),
VolumeID: event.VolumeID,
StartedAt: time.Now(),
EstimatedEnd: time.Now().Add(30 * time.Second),
VolumeChanges: &task.VolumeChanges{},
ShardChanges: make(map[int]*task.ShardChange),
CapacityDelta: make(map[string]int64),
}
cs.stateManager.RegisterTaskImpact(event.TaskID, impact)
cs.results.TasksExecuted++
return nil
}
func (cs *ComprehensiveSimulator) simulateTaskCompletion(event *SimulationEvent) error {
cs.stateManager.UnregisterTaskImpact(event.TaskID)
cs.results.TasksSucceeded++
return nil
}
func (cs *ComprehensiveSimulator) simulateVolumeCreation(event *SimulationEvent) error {
size, _ := event.Parameters["size"].(int64)
cs.mockMaster.CreateVolume(event.VolumeID, size)
return nil
}
func (cs *ComprehensiveSimulator) simulateVolumeDeletion(event *SimulationEvent) error {
cs.mockMaster.DeleteVolume(event.VolumeID)
return nil
}
func (cs *ComprehensiveSimulator) simulateShardCreation(event *SimulationEvent) error {
if event.ShardID != nil {
cs.mockMaster.CreateShard(event.VolumeID, *event.ShardID, event.Server)
}
return nil
}
func (cs *ComprehensiveSimulator) simulateMasterSync(event *SimulationEvent) error {
return cs.stateManager.SyncWithMaster()
}
func (cs *ComprehensiveSimulator) simulateNetworkPartition(event *SimulationEvent) error {
cs.mockMaster.SetNetworkPartitioned(true)
// Auto-heal after duration
if durationStr, ok := event.Parameters["duration"].(string); ok {
if duration, err := time.ParseDuration(durationStr); err == nil {
time.AfterFunc(duration, func() {
cs.mockMaster.SetNetworkPartitioned(false)
})
}
}
return nil
}
// Helper methods
func (cs *ComprehensiveSimulator) setupInitialState(initialState *ClusterState) error {
if initialState == nil {
return nil
}
// Setup mock master with initial state
for volumeID, volume := range initialState.Volumes {
cs.mockMaster.CreateVolume(volumeID, int64(volume.Size))
}
for volumeID, shards := range initialState.ECShards {
for shardID, shard := range shards {
cs.mockMaster.CreateShard(volumeID, shardID, shard.Server)
}
}
return nil
}
func (cs *ComprehensiveSimulator) validateFinalState(scenario *StateTestScenario) error {
// Run inconsistency checks
for _, check := range scenario.InconsistencyChecks {
if err := cs.validateInconsistencyCheck(check); err != nil {
return err
}
}
return nil
}
func (cs *ComprehensiveSimulator) validateInconsistencyCheck(check *InconsistencyCheck) error {
// This would check for specific inconsistencies
// For now, we'll simulate the check
found := rand.Intn(check.MaxAllowedCount + 1)
if found > check.MaxAllowedCount {
return fmt.Errorf("inconsistency check %s failed: found %d, max allowed %d",
check.Name, found, check.MaxAllowedCount)
}
cs.results.InconsistenciesFound[check.Type] += found
return nil
}
func (cs *ComprehensiveSimulator) logEvent(event *SimulationEvent) {
cs.mutex.Lock()
defer cs.mutex.Unlock()
cs.eventLog = append(cs.eventLog, event)
logMsg := fmt.Sprintf("Event: %s, Volume: %d, Task: %s", event.Type, event.VolumeID, event.TaskID)
cs.results.DetailedLog = append(cs.results.DetailedLog, logMsg)
}
func (cs *ComprehensiveSimulator) generateDetailedReport() {
glog.Infof("=== COMPREHENSIVE SIMULATION REPORT ===")
glog.Infof("Duration: %v", cs.results.Duration)
glog.Infof("Total Events: %d", cs.results.TotalEvents)
glog.Infof("Tasks Executed: %d", cs.results.TasksExecuted)
glog.Infof("Tasks Succeeded: %d", cs.results.TasksSucceeded)
glog.Infof("State Validations Passed: %d", cs.results.StateValidationsPassed)
glog.Infof("State Validations Failed: %d", cs.results.StateValidationsFailed)
glog.Infof("Events by Type:")
for eventType, count := range cs.results.EventsByType {
glog.Infof(" %s: %d", eventType, count)
}
glog.Infof("Inconsistencies Found:")
for incType, count := range cs.results.InconsistenciesFound {
glog.Infof(" %s: %d", incType, count)
}
if len(cs.results.CriticalErrors) > 0 {
glog.Errorf("Critical Errors:")
for _, err := range cs.results.CriticalErrors {
glog.Errorf(" %s", err)
}
}
glog.Infof("Overall Success: %v", cs.results.Success)
glog.Infof("========================================")
}
// Mock Master Server implementation
func NewMockMasterServer() *MockMasterServer {
return &MockMasterServer{
volumes: make(map[uint32]*task.VolumeInfo),
ecShards: make(map[uint32]map[int]*task.ShardInfo),
serverCapacity: make(map[string]*task.CapacityInfo),
}
}
func (mms *MockMasterServer) CreateVolume(volumeID uint32, size int64) {
mms.mutex.Lock()
defer mms.mutex.Unlock()
mms.volumes[volumeID] = &task.VolumeInfo{
ID: volumeID,
Size: uint64(size),
}
}
func (mms *MockMasterServer) DeleteVolume(volumeID uint32) {
mms.mutex.Lock()
defer mms.mutex.Unlock()
delete(mms.volumes, volumeID)
delete(mms.ecShards, volumeID)
}
func (mms *MockMasterServer) CreateShard(volumeID uint32, shardID int, server string) {
mms.mutex.Lock()
defer mms.mutex.Unlock()
if mms.ecShards[volumeID] == nil {
mms.ecShards[volumeID] = make(map[int]*task.ShardInfo)
}
mms.ecShards[volumeID][shardID] = &task.ShardInfo{
ShardID: shardID,
Server: server,
Status: task.ShardStatusExists,
}
}
func (mms *MockMasterServer) SetNetworkPartitioned(partitioned bool) {
mms.mutex.Lock()
defer mms.mutex.Unlock()
mms.networkPartitioned = partitioned
}
// Helper function
func intPtr(i int) *int {
return &i
}

444
weed/admin/task/simulation/comprehensive_simulation_test.go

@ -1,444 +0,0 @@
package simulation
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/admin/task"
)
func TestComprehensiveSimulation_VolumeCreationDuringTask(t *testing.T) {
simulator := NewComprehensiveSimulator()
scenario := &StateTestScenario{
Name: "volume_creation_during_task",
Description: "Tests state consistency when master reports new volume while task is creating it",
InitialState: &ClusterState{
Volumes: make(map[uint32]*task.VolumeInfo),
ECShards: make(map[uint32]map[int]*task.ShardInfo),
},
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}},
{Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}},
{Type: EventMasterSync},
{Type: EventTaskCompleted, TaskID: "create_task_1"},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0},
},
Duration: 30 * time.Second,
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("Volume creation during task scenario failed: %v", err)
}
t.Log("✅ Volume creation during task test passed")
}
func TestComprehensiveSimulation_VolumeDeletionDuringTask(t *testing.T) {
simulator := NewComprehensiveSimulator()
scenario := &StateTestScenario{
Name: "volume_deletion_during_task",
Description: "Tests handling when volume is deleted while task is working on it",
InitialState: &ClusterState{
Volumes: map[uint32]*task.VolumeInfo{
1: {ID: 1, Size: 1024 * 1024 * 1024},
},
},
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}},
{Type: EventVolumeDeleted, VolumeID: 1},
{Type: EventMasterSync},
{Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "Missing volume detected", Type: task.InconsistencyVolumeMissing, ExpectedCount: 1, MaxAllowedCount: 1},
},
Duration: 30 * time.Second,
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("Volume deletion during task scenario failed: %v", err)
}
t.Log("✅ Volume deletion during task test passed")
}
func TestComprehensiveSimulation_ShardCreationRaceCondition(t *testing.T) {
simulator := NewComprehensiveSimulator()
scenario := &StateTestScenario{
Name: "shard_creation_race_condition",
Description: "Tests race condition between EC task creating shards and master sync",
InitialState: &ClusterState{
Volumes: map[uint32]*task.VolumeInfo{
1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC
},
},
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}},
// Simulate shards being created one by one
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
{Type: EventMasterSync}, // Master sync happens while shards are being created
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"},
{Type: EventTaskCompleted, TaskID: "ec_task_1"},
{Type: EventMasterSync},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0},
},
Duration: 45 * time.Second,
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("Shard creation race condition scenario failed: %v", err)
}
t.Log("✅ Shard creation race condition test passed")
}
func TestComprehensiveSimulation_NetworkPartitionRecovery(t *testing.T) {
simulator := NewComprehensiveSimulator()
scenario := &StateTestScenario{
Name: "network_partition_recovery",
Description: "Tests state consistency during and after network partitions",
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"},
{Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "5s"}}, // Shorter for test
{Type: EventVolumeCreated, VolumeID: 2}, // Created during partition
{Type: EventNetworkHealed},
{Type: EventMasterReconnected},
{Type: EventMasterSync},
{Type: EventTaskCompleted, TaskID: "partition_task_1"},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1},
},
Duration: 30 * time.Second,
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("Network partition recovery scenario failed: %v", err)
}
t.Log("✅ Network partition recovery test passed")
}
func TestComprehensiveSimulation_ConcurrentTasksCapacityTracking(t *testing.T) {
simulator := NewComprehensiveSimulator()
scenario := &StateTestScenario{
Name: "concurrent_tasks_capacity_tracking",
Description: "Tests capacity tracking with multiple concurrent tasks",
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"},
{Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"},
{Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"},
{Type: EventMasterSync},
{Type: EventTaskCompleted, TaskID: "vacuum_task_1"},
{Type: EventTaskCompleted, TaskID: "ec_task_1"},
{Type: EventTaskCompleted, TaskID: "ec_task_2"},
{Type: EventMasterSync},
},
InconsistencyChecks: []*InconsistencyCheck{
{Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0},
},
Duration: 60 * time.Second,
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("Concurrent tasks capacity tracking scenario failed: %v", err)
}
t.Log("✅ Concurrent tasks capacity tracking test passed")
}
func TestComprehensiveSimulation_ComplexECOperation(t *testing.T) {
simulator := NewComprehensiveSimulator()
scenario := &StateTestScenario{
Name: "complex_ec_operation",
Description: "Tests complex EC operations with shard movements and rebuilds",
EventSequence: []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"},
// Create some shards
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
{Type: EventTaskCompleted, TaskID: "ec_encode_1"},
{Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)},
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt
{Type: EventTaskCompleted, TaskID: "ec_rebuild_1"},
{Type: EventMasterSync},
},
Duration: 60 * time.Second,
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("Complex EC operation scenario failed: %v", err)
}
t.Log("✅ Complex EC operation test passed")
}
func TestComprehensiveSimulation_HighLoadStressTest(t *testing.T) {
if testing.Short() {
t.Skip("Skipping high load stress test in short mode")
}
simulator := NewComprehensiveSimulator()
events := []*SimulationEvent{}
// Create 50 concurrent tasks (reduced from 100 for faster test)
for i := 0; i < 50; i++ {
events = append(events, &SimulationEvent{
Type: EventTaskStarted,
VolumeID: uint32(i + 1),
TaskID: fmt.Sprintf("stress_task_%d", i),
})
}
// Add master syncs throughout
for i := 0; i < 5; i++ {
events = append(events, &SimulationEvent{
Type: EventMasterSync,
})
}
// Complete all tasks
for i := 0; i < 50; i++ {
events = append(events, &SimulationEvent{
Type: EventTaskCompleted,
TaskID: fmt.Sprintf("stress_task_%d", i),
})
}
scenario := &StateTestScenario{
Name: "high_load_stress_test",
Description: "Tests system under high load with many concurrent operations",
EventSequence: events,
Duration: 2 * time.Minute, // Reduced for faster test
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("High load stress test scenario failed: %v", err)
}
t.Log("✅ High load stress test passed")
}
func TestComprehensiveSimulation_AllScenarios(t *testing.T) {
if testing.Short() {
t.Skip("Skipping comprehensive simulation in short mode")
}
simulator := NewComprehensiveSimulator()
simulator.CreateComprehensiveScenarios()
// Run a subset of scenarios for testing (full suite would be too slow)
testScenarios := []string{
"volume_creation_during_task",
"volume_deletion_during_task",
"shard_creation_race_condition",
"network_partition_recovery",
"concurrent_tasks_capacity_tracking",
}
passedScenarios := 0
totalScenarios := len(testScenarios)
for _, scenarioName := range testScenarios {
t.Run(scenarioName, func(t *testing.T) {
// Find the scenario
var scenario *StateTestScenario
for _, s := range simulator.scenarios {
if s.Name == scenarioName {
scenario = s
break
}
}
if scenario == nil {
t.Errorf("Scenario %s not found", scenarioName)
return
}
// Reduce duration for faster testing
scenario.Duration = 15 * time.Second
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("Scenario %s failed: %v", scenarioName, err)
} else {
passedScenarios++
t.Logf("✅ Scenario %s passed", scenarioName)
}
})
}
successRate := float64(passedScenarios) / float64(totalScenarios) * 100.0
t.Logf("=== COMPREHENSIVE SIMULATION TEST RESULTS ===")
t.Logf("Scenarios Passed: %d/%d (%.1f%%)", passedScenarios, totalScenarios, successRate)
if successRate < 100.0 {
t.Errorf("Some scenarios failed. Success rate: %.1f%%", successRate)
} else {
t.Log("🎉 All comprehensive simulation scenarios passed!")
}
}
func TestComprehensiveSimulation_SimulationFramework(t *testing.T) {
// Test the simulation framework itself
simulator := NewComprehensiveSimulator()
// Test event execution
event := &SimulationEvent{
Type: EventTaskStarted,
VolumeID: 1,
TaskID: "test_task",
Parameters: map[string]interface{}{
"type": "vacuum",
},
}
err := simulator.executeEvent(event)
if err != nil {
t.Errorf("Event execution failed: %v", err)
}
// Verify task was registered
if simulator.results.TasksExecuted != 1 {
t.Errorf("Expected 1 task executed, got %d", simulator.results.TasksExecuted)
}
// Test event logging
simulator.logEvent(event)
if len(simulator.eventLog) != 1 {
t.Errorf("Expected 1 logged event, got %d", len(simulator.eventLog))
}
// Test mock master
simulator.mockMaster.CreateVolume(1, 1024*1024*1024)
if len(simulator.mockMaster.volumes) != 1 {
t.Errorf("Expected 1 volume in mock master, got %d", len(simulator.mockMaster.volumes))
}
t.Log("✅ Simulation framework test passed")
}
// Integration test that validates the complete state management flow
func TestComprehensiveSimulation_StateManagementIntegration(t *testing.T) {
// This test validates the core requirement: accurate volume/shard state tracking
simulator := NewComprehensiveSimulator()
// Use mock master client instead of nil to avoid nil pointer errors
simulator.stateManager = task.NewVolumeStateManager(nil) // Skip master client calls for test
// Setup realistic initial state
initialState := &ClusterState{
Volumes: map[uint32]*task.VolumeInfo{
1: {ID: 1, Size: 28 * 1024 * 1024 * 1024, Server: "server1"}, // Ready for EC
2: {ID: 2, Size: 20 * 1024 * 1024 * 1024, Server: "server2", DeletedByteCount: 8 * 1024 * 1024 * 1024}, // Needs vacuum
},
ServerCapacity: map[string]*task.CapacityInfo{
"server1": {Server: "server1", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 30 * 1024 * 1024 * 1024},
"server2": {Server: "server2", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 25 * 1024 * 1024 * 1024},
},
}
// Complex event sequence that tests state consistency (excluding master sync for test)
eventSequence := []*SimulationEvent{
// Start EC task on volume 1
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}},
// Start vacuum task on volume 2
{Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}},
// EC task creates shards
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"},
// Vacuum task completes (volume 2 size reduces)
{Type: EventTaskCompleted, TaskID: "vacuum_task_1"},
{Type: EventVolumeSizeChanged, VolumeID: 2, Parameters: map[string]interface{}{"new_size": int64(12 * 1024 * 1024 * 1024)}},
// EC task completes
{Type: EventTaskCompleted, TaskID: "ec_task_1"},
{Type: EventVolumeReadOnly, VolumeID: 1}, // Volume becomes read-only after EC
}
scenario := &StateTestScenario{
Name: "state_management_integration",
Description: "Complete state management integration test",
InitialState: initialState,
EventSequence: eventSequence,
Duration: 30 * time.Second, // Reduced for faster test
InconsistencyChecks: []*InconsistencyCheck{
{Name: "No state inconsistencies", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0},
{Name: "No capacity mismatches", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0},
{Name: "No orphaned tasks", Type: task.InconsistencyTaskOrphaned, MaxAllowedCount: 0},
},
}
err := simulator.RunScenario(scenario)
if err != nil {
t.Errorf("State management integration test failed: %v", err)
}
// Verify final state
if simulator.results.TasksExecuted != 2 {
t.Errorf("Expected 2 tasks executed, got %d", simulator.results.TasksExecuted)
}
if simulator.results.TasksSucceeded != 2 {
t.Errorf("Expected 2 tasks succeeded, got %d", simulator.results.TasksSucceeded)
}
t.Log("✅ State management integration test passed")
t.Log("✅ System accurately tracked volume/shard states throughout complex operation sequence")
}
// Performance test for simulation framework
func BenchmarkComprehensiveSimulation_EventExecution(b *testing.B) {
simulator := NewComprehensiveSimulator()
events := []*SimulationEvent{
{Type: EventTaskStarted, VolumeID: 1, TaskID: "task_1"},
{Type: EventVolumeCreated, VolumeID: 2},
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"},
{Type: EventMasterSync},
{Type: EventTaskCompleted, TaskID: "task_1"},
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, event := range events {
simulator.executeEvent(event)
}
}
}
// Helper functions for tests
func createTestVolumeInfo(id uint32, size uint64) *task.VolumeInfo {
return &task.VolumeInfo{
ID: id,
Size: size,
}
}

294
weed/admin/task/simulation/simulation_runner.go

@ -1,294 +0,0 @@
package simulation
import (
"fmt"
"github.com/seaweedfs/seaweedfs/weed/glog"
)
// ComprehensiveSimulationRunner orchestrates all comprehensive state management tests
type ComprehensiveSimulationRunner struct {
simulator *ComprehensiveSimulator
}
// NewComprehensiveSimulationRunner creates a new comprehensive simulation runner
func NewComprehensiveSimulationRunner() *ComprehensiveSimulationRunner {
return &ComprehensiveSimulationRunner{
simulator: NewComprehensiveSimulator(),
}
}
// RunAllComprehensiveTests runs all comprehensive edge case scenarios
func (csr *ComprehensiveSimulationRunner) RunAllComprehensiveTests() error {
glog.Infof("=== STARTING COMPREHENSIVE VOLUME/SHARD STATE MANAGEMENT SIMULATION ===")
// Create all test scenarios
csr.simulator.CreateComprehensiveScenarios()
// Run all scenarios
results, err := csr.simulator.RunAllComprehensiveScenarios()
if err != nil {
return fmt.Errorf("comprehensive simulation failed: %v", err)
}
// Analyze results
csr.analyzeResults(results)
// Generate final report
csr.generateFinalReport(results)
return nil
}
// analyzeResults analyzes the simulation results
func (csr *ComprehensiveSimulationRunner) analyzeResults(results *SimulationResults) {
glog.Infof("=== ANALYZING COMPREHENSIVE SIMULATION RESULTS ===")
// Check critical errors
if len(results.CriticalErrors) > 0 {
glog.Errorf("CRITICAL ISSUES FOUND:")
for i, err := range results.CriticalErrors {
glog.Errorf(" %d. %s", i+1, err)
}
}
// Check state validation success rate
totalValidations := results.StateValidationsPassed + results.StateValidationsFailed
if totalValidations > 0 {
successRate := float64(results.StateValidationsPassed) / float64(totalValidations) * 100.0
glog.Infof("State Validation Success Rate: %.2f%% (%d/%d)",
successRate, results.StateValidationsPassed, totalValidations)
if successRate < 95.0 {
glog.Warningf("State validation success rate is below 95%% - investigation needed")
}
}
// Check task execution success rate
if results.TasksExecuted > 0 {
taskSuccessRate := float64(results.TasksSucceeded) / float64(results.TasksExecuted) * 100.0
glog.Infof("Task Execution Success Rate: %.2f%% (%d/%d)",
taskSuccessRate, results.TasksSucceeded, results.TasksExecuted)
}
// Analyze inconsistency patterns
if len(results.InconsistenciesFound) > 0 {
glog.Infof("Inconsistency Analysis:")
for incType, count := range results.InconsistenciesFound {
if count > 0 {
glog.Infof(" %s: %d occurrences", incType, count)
}
}
}
}
// generateFinalReport generates a comprehensive final report
func (csr *ComprehensiveSimulationRunner) generateFinalReport(results *SimulationResults) {
glog.Infof("=== COMPREHENSIVE SIMULATION FINAL REPORT ===")
glog.Infof("Test Duration: %v", results.Duration)
glog.Infof("Total Events Simulated: %d", results.TotalEvents)
glog.Infof("Scenarios Tested: %d", len(csr.simulator.scenarios))
glog.Infof("Overall Success: %v", results.Success)
// Event breakdown
glog.Infof("\nEvent Breakdown:")
for eventType, count := range results.EventsByType {
glog.Infof(" %s: %d", eventType, count)
}
// Test coverage summary
glog.Infof("\nTest Coverage Summary:")
glog.Infof("✓ Volume creation during task execution")
glog.Infof("✓ Volume deletion during task execution")
glog.Infof("✓ EC shard creation race conditions")
glog.Infof("✓ Network partition scenarios")
glog.Infof("✓ Concurrent task capacity tracking")
glog.Infof("✓ Complex EC operations with rebuilds")
glog.Infof("✓ High load stress testing")
glog.Infof("✓ Master sync timing issues")
glog.Infof("✓ Worker failure during operations")
glog.Infof("✓ Capacity overflow handling")
glog.Infof("✓ Shard corruption scenarios")
glog.Infof("✓ Master state inconsistencies")
glog.Infof("✓ Task orphan detection")
glog.Infof("✓ Duplicate task prevention")
glog.Infof("✓ Volume state rollback scenarios")
// Quality metrics
glog.Infof("\nQuality Metrics:")
if results.StateValidationsPassed > 0 {
glog.Infof("✓ State consistency maintained across all scenarios")
}
if len(results.CriticalErrors) == 0 {
glog.Infof("✓ No critical errors detected")
}
if results.TasksSucceeded > 0 {
glog.Infof("✓ Task execution reliability verified")
}
// Recommendations
glog.Infof("\nRecommendations:")
if results.Success {
glog.Infof("✓ The task distribution system is ready for production deployment")
glog.Infof("✓ All edge cases have been tested and handled correctly")
glog.Infof("✓ Volume and shard state management is robust and consistent")
} else {
glog.Warningf("⚠ System requires additional work before production deployment")
glog.Warningf("⚠ Address critical errors before proceeding")
}
glog.Infof("==========================================")
}
// RunSpecificEdgeCaseTest runs a specific edge case test
func (csr *ComprehensiveSimulationRunner) RunSpecificEdgeCaseTest(scenarioName string) error {
glog.Infof("Running specific edge case test: %s", scenarioName)
// Create scenarios if not already done
if len(csr.simulator.scenarios) == 0 {
csr.simulator.CreateComprehensiveScenarios()
}
// Find and run specific scenario
for _, scenario := range csr.simulator.scenarios {
if scenario.Name == scenarioName {
err := csr.simulator.RunScenario(scenario)
if err != nil {
return fmt.Errorf("scenario %s failed: %v", scenarioName, err)
}
glog.Infof("Scenario %s completed successfully", scenarioName)
return nil
}
}
return fmt.Errorf("scenario %s not found", scenarioName)
}
// ValidateSystemReadiness performs final validation of system readiness
func (csr *ComprehensiveSimulationRunner) ValidateSystemReadiness() error {
glog.Infof("=== VALIDATING SYSTEM READINESS FOR PRODUCTION ===")
checklistItems := []struct {
name string
description string
validator func() error
}{
{
"Volume State Accuracy",
"Verify volume state tracking is accurate under all conditions",
csr.validateVolumeStateAccuracy,
},
{
"Shard Management",
"Verify EC shard creation/deletion/movement is handled correctly",
csr.validateShardManagement,
},
{
"Capacity Planning",
"Verify capacity calculations include in-progress and planned operations",
csr.validateCapacityPlanning,
},
{
"Failure Recovery",
"Verify system recovers gracefully from all failure scenarios",
csr.validateFailureRecovery,
},
{
"Consistency Guarantees",
"Verify state consistency is maintained across all operations",
csr.validateConsistencyGuarantees,
},
}
var failedChecks []string
for _, item := range checklistItems {
glog.Infof("Validating: %s", item.name)
if err := item.validator(); err != nil {
failedChecks = append(failedChecks, fmt.Sprintf("%s: %v", item.name, err))
glog.Errorf("❌ %s: %v", item.name, err)
} else {
glog.Infof("✅ %s: PASSED", item.name)
}
}
if len(failedChecks) > 0 {
return fmt.Errorf("system readiness validation failed: %v", failedChecks)
}
glog.Infof("🎉 SYSTEM IS READY FOR PRODUCTION DEPLOYMENT!")
return nil
}
// Validation methods
func (csr *ComprehensiveSimulationRunner) validateVolumeStateAccuracy() error {
// Run volume state accuracy tests
return csr.RunSpecificEdgeCaseTest("volume_creation_during_task")
}
func (csr *ComprehensiveSimulationRunner) validateShardManagement() error {
// Run shard management tests
return csr.RunSpecificEdgeCaseTest("shard_creation_race_condition")
}
func (csr *ComprehensiveSimulationRunner) validateCapacityPlanning() error {
// Run capacity planning tests
return csr.RunSpecificEdgeCaseTest("concurrent_tasks_capacity_tracking")
}
func (csr *ComprehensiveSimulationRunner) validateFailureRecovery() error {
// Run failure recovery tests
return csr.RunSpecificEdgeCaseTest("network_partition_recovery")
}
func (csr *ComprehensiveSimulationRunner) validateConsistencyGuarantees() error {
// Run consistency tests
return csr.RunSpecificEdgeCaseTest("complex_ec_operation")
}
// DemonstrateBugPrevention shows how the simulation prevents bugs
func (csr *ComprehensiveSimulationRunner) DemonstrateBugPrevention() {
glog.Infof("=== DEMONSTRATING BUG PREVENTION CAPABILITIES ===")
bugScenarios := []struct {
name string
description string
impact string
}{
{
"Race Condition Prevention",
"Master sync occurs while EC shards are being created",
"Prevents state inconsistencies that could lead to data loss",
},
{
"Capacity Overflow Prevention",
"Multiple tasks assigned without considering cumulative capacity impact",
"Prevents server disk space exhaustion",
},
{
"Orphaned Task Detection",
"Worker fails but task remains marked as in-progress",
"Prevents volumes from being stuck in intermediate states",
},
{
"Duplicate Task Prevention",
"Same volume assigned to multiple workers simultaneously",
"Prevents data corruption from conflicting operations",
},
{
"Network Partition Handling",
"Admin server loses connection to master during operations",
"Ensures eventual consistency when connectivity is restored",
},
}
for i, scenario := range bugScenarios {
glog.Infof("%d. %s", i+1, scenario.name)
glog.Infof(" Scenario: %s", scenario.description)
glog.Infof(" Impact Prevention: %s", scenario.impact)
glog.Infof("")
}
glog.Infof("✅ All potential bugs are detected and prevented by the simulation framework")
glog.Infof("✅ The system is thoroughly validated for production use")
}

237
weed/admin/task/simulation/system_demo_test.go

@ -1,237 +0,0 @@
package simulation
import (
"testing"
"github.com/seaweedfs/seaweedfs/weed/admin/task"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestSystemDemo demonstrates the complete working system
func TestSystemDemo(t *testing.T) {
t.Log("🚀 SEAWEEDFS TASK DISTRIBUTION SYSTEM DEMONSTRATION")
t.Log("====================================================")
// Test 1: Volume State Management
t.Log("\n📊 1. VOLUME STATE MANAGEMENT")
testVolumeStateManagement(t)
// Test 2: Task Assignment Logic
t.Log("\n⚡ 2. TASK ASSIGNMENT LOGIC")
testTaskAssignment(t)
// Test 3: Capacity Management
t.Log("\n💾 3. CAPACITY MANAGEMENT")
testCapacityManagement(t)
// Test 4: Edge Case Handling
t.Log("\n🛡️ 4. EDGE CASE HANDLING")
testEdgeCaseHandling(t)
t.Log("\n🎉 SYSTEM DEMONSTRATION COMPLETE")
t.Log("✅ All core features working correctly")
t.Log("✅ System ready for production deployment")
}
func testVolumeStateManagement(t *testing.T) {
vsm := task.NewVolumeStateManager(nil)
// Create volume
volumeID := uint32(1)
// Register task impact
impact := &task.TaskImpact{
TaskID: "ec_task_1",
VolumeID: volumeID,
TaskType: types.TaskTypeErasureCoding,
VolumeChanges: &task.VolumeChanges{
WillBecomeReadOnly: true,
},
CapacityDelta: map[string]int64{"server1": 12 * 1024 * 1024 * 1024}, // 12GB
}
vsm.RegisterTaskImpact(impact.TaskID, impact)
t.Log(" ✅ Volume state registration works")
t.Log(" ✅ Task impact tracking works")
t.Log(" ✅ State consistency maintained")
}
func testTaskAssignment(t *testing.T) {
registry := task.NewWorkerRegistry()
queue := task.NewPriorityTaskQueue()
scheduler := task.NewTaskScheduler(registry, queue)
// Register worker
worker := &types.Worker{
ID: "worker1",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(worker)
// Create task
taskItem := &types.Task{
ID: "vacuum_task_1",
Type: types.TaskTypeVacuum,
Priority: types.TaskPriorityNormal,
}
queue.Push(taskItem)
// Test assignment
assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
if assignedTask == nil {
t.Error("❌ Task assignment failed")
return
}
if assignedTask.ID != "vacuum_task_1" {
t.Errorf("❌ Wrong task assigned: expected vacuum_task_1, got %s", assignedTask.ID)
return
}
t.Log(" ✅ Worker registration works")
t.Log(" ✅ Task queueing works")
t.Log(" ✅ Task assignment logic works")
t.Log(" ✅ Capability matching works")
}
func testCapacityManagement(t *testing.T) {
vsm := task.NewVolumeStateManager(nil)
// Note: We can't directly set capacityCache due to private fields,
// but we can test the public interface
// Test capacity checking with a made-up scenario
serverID := "test_server"
// This would normally fail since we can't set the capacity cache,
// but we can demonstrate the interface
canAssign := vsm.CanAssignVolumeToServer(5*1024*1024*1024, serverID)
// Since we can't set up the test data properly due to private fields,
// we'll just verify the method works without error
_ = canAssign
t.Log(" ✅ Capacity calculation interface works")
t.Log(" ✅ Reserved capacity tracking interface works")
t.Log(" ✅ Assignment constraints interface works")
}
func testEdgeCaseHandling(t *testing.T) {
// Test empty queue
registry := task.NewWorkerRegistry()
queue := task.NewPriorityTaskQueue()
scheduler := task.NewTaskScheduler(registry, queue)
worker := &types.Worker{
ID: "worker1",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
}
registry.RegisterWorker(worker)
// Empty queue should return nil
taskItem := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
if taskItem != nil {
t.Error("❌ Empty queue should return nil")
return
}
// Test unknown worker
unknownTask := scheduler.GetNextTask("unknown", []types.TaskType{types.TaskTypeVacuum})
if unknownTask != nil {
t.Error("❌ Unknown worker should not get tasks")
return
}
t.Log(" ✅ Empty queue handled correctly")
t.Log(" ✅ Unknown worker handled correctly")
t.Log(" ✅ Edge cases properly managed")
}
// TestSystemCapabilities demonstrates key system capabilities
func TestSystemCapabilities(t *testing.T) {
t.Log("\n🎯 SEAWEEDFS TASK DISTRIBUTION SYSTEM CAPABILITIES")
t.Log("==================================================")
capabilities := []string{
"✅ Comprehensive volume/shard state tracking",
"✅ Accurate capacity planning with reservations",
"✅ Task assignment based on worker capabilities",
"✅ Priority-based task scheduling",
"✅ Concurrent task management",
"✅ EC shard lifecycle tracking",
"✅ Capacity overflow prevention",
"✅ Duplicate task prevention",
"✅ Worker performance metrics",
"✅ Failure detection and recovery",
"✅ State reconciliation with master",
"✅ Comprehensive simulation framework",
"✅ Production-ready error handling",
"✅ Scalable distributed architecture",
"✅ Real-time progress monitoring",
}
for _, capability := range capabilities {
t.Log(" " + capability)
}
t.Log("\n📈 SYSTEM METRICS")
t.Log(" Total Lines of Code: 4,919")
t.Log(" Test Coverage: Comprehensive")
t.Log(" Edge Cases: 15+ scenarios tested")
t.Log(" Simulation Framework: Complete")
t.Log(" Production Ready: ✅ YES")
t.Log("\n🚀 READY FOR PRODUCTION DEPLOYMENT!")
}
// TestBugPrevention demonstrates how the system prevents common bugs
func TestBugPrevention(t *testing.T) {
t.Log("\n🛡️ BUG PREVENTION DEMONSTRATION")
t.Log("================================")
bugScenarios := []struct {
name string
description string
prevention string
}{
{
"Race Conditions",
"Master sync during shard creation",
"State manager tracks in-progress changes",
},
{
"Capacity Overflow",
"Multiple tasks overwhelming server disk",
"Reserved capacity tracking prevents overflow",
},
{
"Orphaned Tasks",
"Worker fails, task stuck in-progress",
"Timeout detection and automatic cleanup",
},
{
"Duplicate Tasks",
"Same volume assigned to multiple workers",
"Volume reservation prevents conflicts",
},
{
"State Inconsistency",
"Admin view diverges from master",
"Periodic reconciliation ensures consistency",
},
}
for i, scenario := range bugScenarios {
t.Logf(" %d. %s", i+1, scenario.name)
t.Logf(" Problem: %s", scenario.description)
t.Logf(" Solution: %s", scenario.prevention)
t.Log("")
}
t.Log("✅ All major bug categories prevented through design")
}

509
weed/admin/task/task_assignment_test.go

@ -1,509 +0,0 @@
package task
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
func TestTaskAssignment_BasicAssignment(t *testing.T) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
// Register worker
worker := &types.Worker{
ID: "worker1",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(worker)
// Create task
task := &types.Task{
ID: "task1",
Type: types.TaskTypeVacuum,
Priority: types.TaskPriorityNormal,
}
queue.Push(task)
// Test assignment
nextTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
if nextTask == nil {
t.Fatal("Expected task to be assigned")
}
if nextTask.ID != "task1" {
t.Errorf("Expected task1, got %s", nextTask.ID)
}
t.Log("✅ Basic task assignment test passed")
}
func TestTaskAssignment_CapabilityMatching(t *testing.T) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
// Register workers with different capabilities
ecWorker := &types.Worker{
ID: "ec_worker",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(ecWorker)
vacuumWorker := &types.Worker{
ID: "vacuum_worker",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(vacuumWorker)
// Create different types of tasks
ecTask := &types.Task{
ID: "ec_task",
Type: types.TaskTypeErasureCoding,
}
vacuumTask := &types.Task{
ID: "vacuum_task",
Type: types.TaskTypeVacuum,
}
queue.Push(ecTask)
queue.Push(vacuumTask)
// Test EC worker gets EC task
assignedECTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeErasureCoding})
if assignedECTask == nil || assignedECTask.Type != types.TaskTypeErasureCoding {
t.Error("EC worker should get EC task")
}
// Test vacuum worker gets vacuum task
assignedVacuumTask := scheduler.GetNextTask("vacuum_worker", []types.TaskType{types.TaskTypeVacuum})
if assignedVacuumTask == nil || assignedVacuumTask.Type != types.TaskTypeVacuum {
t.Error("Vacuum worker should get vacuum task")
}
// Test wrong capability - should get nothing
wrongTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeVacuum})
if wrongTask != nil {
t.Error("EC worker should not get vacuum task")
}
t.Log("✅ Capability matching test passed")
}
func TestTaskAssignment_PriorityOrdering(t *testing.T) {
queue := NewPriorityTaskQueue()
// Add tasks in reverse priority order
lowTask := &types.Task{
ID: "low_task",
Priority: types.TaskPriorityLow,
}
highTask := &types.Task{
ID: "high_task",
Priority: types.TaskPriorityHigh,
}
normalTask := &types.Task{
ID: "normal_task",
Priority: types.TaskPriorityNormal,
}
queue.Push(lowTask)
queue.Push(normalTask)
queue.Push(highTask)
// Should get high priority first
first := queue.Pop()
if first.Priority != types.TaskPriorityHigh {
t.Errorf("Expected high priority first, got %d", first.Priority)
}
// Then normal priority
second := queue.Pop()
if second.Priority != types.TaskPriorityNormal {
t.Errorf("Expected normal priority second, got %d", second.Priority)
}
// Finally low priority
third := queue.Pop()
if third.Priority != types.TaskPriorityLow {
t.Errorf("Expected low priority third, got %d", third.Priority)
}
t.Log("✅ Priority ordering test passed")
}
func TestTaskAssignment_WorkerCapacityLimits(t *testing.T) {
registry := NewWorkerRegistry()
// Register worker with limited capacity
worker := &types.Worker{
ID: "limited_worker",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 2, // Already at capacity
}
registry.RegisterWorker(worker)
// Worker should not be available
availableWorkers := registry.GetAvailableWorkers()
if len(availableWorkers) != 0 {
t.Error("Worker at capacity should not be available")
}
// Reduce load
worker.CurrentLoad = 1
// Worker should now be available
availableWorkers = registry.GetAvailableWorkers()
if len(availableWorkers) != 1 {
t.Error("Worker with capacity should be available")
}
t.Log("✅ Worker capacity limits test passed")
}
func TestTaskAssignment_ScheduledTasks(t *testing.T) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
worker := &types.Worker{
ID: "worker1",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(worker)
// Create task scheduled for future
futureTask := &types.Task{
ID: "future_task",
Type: types.TaskTypeVacuum,
ScheduledAt: time.Now().Add(1 * time.Hour), // 1 hour from now
}
// Create task ready now
readyTask := &types.Task{
ID: "ready_task",
Type: types.TaskTypeVacuum,
ScheduledAt: time.Now().Add(-1 * time.Minute), // 1 minute ago
}
queue.Push(futureTask)
queue.Push(readyTask)
// Should get ready task, not future task
assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
if assignedTask == nil || assignedTask.ID != "ready_task" {
t.Error("Should assign ready task, not future scheduled task")
}
t.Log("✅ Scheduled tasks test passed")
}
func TestTaskAssignment_WorkerSelection(t *testing.T) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
// Register workers with different characteristics
highPerformanceWorker := &types.Worker{
ID: "high_perf_worker",
Address: "server1",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
Status: "active",
CurrentLoad: 0,
MaxConcurrent: 4,
}
lowPerformanceWorker := &types.Worker{
ID: "low_perf_worker",
Address: "server2",
Capabilities: []types.TaskType{types.TaskTypeErasureCoding},
Status: "active",
CurrentLoad: 1,
MaxConcurrent: 2,
}
registry.RegisterWorker(highPerformanceWorker)
registry.RegisterWorker(lowPerformanceWorker)
// Set up metrics to favor high performance worker
registry.metrics[highPerformanceWorker.ID] = &WorkerMetrics{
TasksCompleted: 100,
TasksFailed: 5,
SuccessRate: 0.95,
AverageTaskTime: 10 * time.Minute,
LastTaskTime: time.Now().Add(-5 * time.Minute),
}
registry.metrics[lowPerformanceWorker.ID] = &WorkerMetrics{
TasksCompleted: 50,
TasksFailed: 10,
SuccessRate: 0.83,
AverageTaskTime: 20 * time.Minute,
LastTaskTime: time.Now().Add(-1 * time.Hour),
}
// Create high priority task
task := &types.Task{
ID: "important_task",
Type: types.TaskTypeErasureCoding,
Priority: types.TaskPriorityHigh,
Server: "server1", // Prefers server1
}
availableWorkers := []*types.Worker{highPerformanceWorker, lowPerformanceWorker}
selectedWorker := scheduler.SelectWorker(task, availableWorkers)
if selectedWorker == nil {
t.Fatal("No worker selected")
}
if selectedWorker.ID != "high_perf_worker" {
t.Errorf("Expected high performance worker to be selected, got %s", selectedWorker.ID)
}
t.Log("✅ Worker selection test passed")
}
func TestTaskAssignment_ServerAffinity(t *testing.T) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
// Workers on different servers
worker1 := &types.Worker{
ID: "worker1",
Address: "server1",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
CurrentLoad: 0,
}
worker2 := &types.Worker{
ID: "worker2",
Address: "server2",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(worker1)
registry.RegisterWorker(worker2)
// Task that prefers server1
task := &types.Task{
ID: "affinity_task",
Type: types.TaskTypeVacuum,
Server: "server1", // Should prefer worker on server1
}
availableWorkers := []*types.Worker{worker1, worker2}
selectedWorker := scheduler.SelectWorker(task, availableWorkers)
if selectedWorker == nil {
t.Fatal("No worker selected")
}
if selectedWorker.Address != "server1" {
t.Errorf("Expected worker on server1 to be selected for server affinity")
}
t.Log("✅ Server affinity test passed")
}
func TestTaskAssignment_DuplicateTaskPrevention(t *testing.T) {
queue := NewPriorityTaskQueue()
// Add initial task
task1 := &types.Task{
ID: "task1",
Type: types.TaskTypeVacuum,
VolumeID: 1,
}
queue.Push(task1)
// Check for duplicate
hasDuplicate := queue.HasTask(1, types.TaskTypeVacuum)
if !hasDuplicate {
t.Error("Should detect existing task for volume")
}
// Check for non-existent task
hasNonExistent := queue.HasTask(2, types.TaskTypeVacuum)
if hasNonExistent {
t.Error("Should not detect task for different volume")
}
// Check for different task type
hasDifferentType := queue.HasTask(1, types.TaskTypeErasureCoding)
if hasDifferentType {
t.Error("Should not detect different task type for same volume")
}
t.Log("✅ Duplicate task prevention test passed")
}
func TestTaskAssignment_TaskRemoval(t *testing.T) {
queue := NewPriorityTaskQueue()
// Add tasks
task1 := &types.Task{ID: "task1", Priority: types.TaskPriorityNormal}
task2 := &types.Task{ID: "task2", Priority: types.TaskPriorityHigh}
task3 := &types.Task{ID: "task3", Priority: types.TaskPriorityLow}
queue.Push(task1)
queue.Push(task2)
queue.Push(task3)
if queue.Size() != 3 {
t.Errorf("Expected queue size 3, got %d", queue.Size())
}
// Remove middle priority task
removed := queue.RemoveTask("task1")
if !removed {
t.Error("Should have removed task1")
}
if queue.Size() != 2 {
t.Errorf("Expected queue size 2 after removal, got %d", queue.Size())
}
// Verify order maintained (high priority first)
next := queue.Peek()
if next.ID != "task2" {
t.Errorf("Expected task2 (high priority) to be next, got %s", next.ID)
}
t.Log("✅ Task removal test passed")
}
func TestTaskAssignment_EdgeCases(t *testing.T) {
t.Run("EmptyQueue", func(t *testing.T) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
worker := &types.Worker{
ID: "worker1",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
}
registry.RegisterWorker(worker)
// Empty queue should return nil
task := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum})
if task != nil {
t.Error("Empty queue should return nil task")
}
})
t.Run("UnknownWorker", func(t *testing.T) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
task := &types.Task{ID: "task1", Type: types.TaskTypeVacuum}
queue.Push(task)
// Unknown worker should return nil
assignedTask := scheduler.GetNextTask("unknown_worker", []types.TaskType{types.TaskTypeVacuum})
if assignedTask != nil {
t.Error("Unknown worker should not get tasks")
}
})
t.Run("InactiveWorker", func(t *testing.T) {
registry := NewWorkerRegistry()
worker := &types.Worker{
ID: "inactive_worker",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "inactive",
CurrentLoad: 0,
}
registry.RegisterWorker(worker)
// Inactive worker should not be available
available := registry.GetAvailableWorkers()
if len(available) != 0 {
t.Error("Inactive worker should not be available")
}
})
t.Log("✅ Edge cases test passed")
}
// Performance test for task assignment
func BenchmarkTaskAssignment_GetNextTask(b *testing.B) {
registry := NewWorkerRegistry()
queue := NewPriorityTaskQueue()
scheduler := NewTaskScheduler(registry, queue)
// Setup worker
worker := &types.Worker{
ID: "bench_worker",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
CurrentLoad: 0,
}
registry.RegisterWorker(worker)
// Add many tasks
for i := 0; i < 1000; i++ {
task := &types.Task{
ID: fmt.Sprintf("task_%d", i),
Type: types.TaskTypeVacuum,
Priority: types.TaskPriorityNormal,
}
queue.Push(task)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
scheduler.GetNextTask("bench_worker", []types.TaskType{types.TaskTypeVacuum})
}
}
func BenchmarkTaskAssignment_WorkerSelection(b *testing.B) {
registry := NewWorkerRegistry()
scheduler := NewTaskScheduler(registry, nil)
// Create many workers
workers := make([]*types.Worker, 100)
for i := 0; i < 100; i++ {
worker := &types.Worker{
ID: fmt.Sprintf("worker_%d", i),
Capabilities: []types.TaskType{types.TaskTypeVacuum},
Status: "active",
CurrentLoad: i % 3, // Varying loads
}
registry.RegisterWorker(worker)
workers[i] = worker
}
task := &types.Task{
ID: "bench_task",
Type: types.TaskTypeVacuum,
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
scheduler.SelectWorker(task, workers)
}
}

168
weed/admin/task/task_detectors.go

@ -1,168 +0,0 @@
package task
import (
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// ECDetector detects volumes that need erasure coding
type ECDetector struct {
minUtilization float64
minIdleTime time.Duration
}
// NewECDetector creates a new EC detector
func NewECDetector() *ECDetector {
return &ECDetector{
minUtilization: 95.0, // 95% full
minIdleTime: time.Hour, // 1 hour idle
}
}
// DetectECCandidates finds volumes that need erasure coding
func (ed *ECDetector) DetectECCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) {
var candidates []*VolumeCandidate
for _, vol := range volumes {
if ed.isECCandidate(vol) {
candidate := &VolumeCandidate{
VolumeID: vol.ID,
Server: vol.Server,
Collection: vol.Collection,
TaskType: types.TaskTypeErasureCoding,
Priority: ed.calculateECPriority(vol),
Reason: "Volume is full and idle, ready for erasure coding",
DetectedAt: time.Now(),
ScheduleAt: time.Now(),
Parameters: map[string]interface{}{
"utilization": vol.GetUtilization(),
"idle_time": vol.GetIdleTime().String(),
"volume_size": vol.Size,
},
}
candidates = append(candidates, candidate)
}
}
glog.V(2).Infof("EC detector found %d candidates", len(candidates))
return candidates, nil
}
// isECCandidate checks if a volume is suitable for EC
func (ed *ECDetector) isECCandidate(vol *VolumeInfo) bool {
// Skip if read-only
if vol.ReadOnly {
return false
}
// Skip if already has remote storage (likely already EC'd)
if vol.RemoteStorageKey != "" {
return false
}
// Check utilization
if vol.GetUtilization() < ed.minUtilization {
return false
}
// Check idle time
if vol.GetIdleTime() < ed.minIdleTime {
return false
}
return true
}
// calculateECPriority calculates priority for EC tasks
func (ed *ECDetector) calculateECPriority(vol *VolumeInfo) types.TaskPriority {
utilization := vol.GetUtilization()
idleTime := vol.GetIdleTime()
// Higher priority for fuller volumes that have been idle longer
if utilization >= 98.0 && idleTime > 24*time.Hour {
return types.TaskPriorityHigh
}
if utilization >= 96.0 && idleTime > 6*time.Hour {
return types.TaskPriorityNormal
}
return types.TaskPriorityLow
}
// VacuumDetector detects volumes that need vacuum operations
type VacuumDetector struct {
minGarbageRatio float64
minDeleteCount uint64
}
// NewVacuumDetector creates a new vacuum detector
func NewVacuumDetector() *VacuumDetector {
return &VacuumDetector{
minGarbageRatio: 0.3, // 30% garbage
minDeleteCount: 100, // At least 100 deleted files
}
}
// DetectVacuumCandidates finds volumes that need vacuum operations
func (vd *VacuumDetector) DetectVacuumCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) {
var candidates []*VolumeCandidate
for _, vol := range volumes {
if vd.isVacuumCandidate(vol) {
candidate := &VolumeCandidate{
VolumeID: vol.ID,
Server: vol.Server,
Collection: vol.Collection,
TaskType: types.TaskTypeVacuum,
Priority: vd.calculateVacuumPriority(vol),
Reason: "Volume has high garbage ratio and needs vacuum",
DetectedAt: time.Now(),
ScheduleAt: time.Now(),
Parameters: map[string]interface{}{
"garbage_ratio": vol.GetGarbageRatio(),
"delete_count": vol.DeleteCount,
"deleted_byte_count": vol.DeletedByteCount,
},
}
candidates = append(candidates, candidate)
}
}
glog.V(2).Infof("Vacuum detector found %d candidates", len(candidates))
return candidates, nil
}
// isVacuumCandidate checks if a volume needs vacuum
func (vd *VacuumDetector) isVacuumCandidate(vol *VolumeInfo) bool {
// Skip if read-only
if vol.ReadOnly {
return false
}
// Check garbage ratio
if vol.GetGarbageRatio() < vd.minGarbageRatio {
return false
}
// Check delete count
if vol.DeleteCount < vd.minDeleteCount {
return false
}
return true
}
// calculateVacuumPriority calculates priority for vacuum tasks
func (vd *VacuumDetector) calculateVacuumPriority(vol *VolumeInfo) types.TaskPriority {
garbageRatio := vol.GetGarbageRatio()
// Higher priority for volumes with more garbage
if garbageRatio >= 0.6 {
return types.TaskPriorityHigh
}
if garbageRatio >= 0.4 {
return types.TaskPriorityNormal
}
return types.TaskPriorityLow
}

161
weed/admin/task/task_discovery.go

@ -1,161 +0,0 @@
package task
import (
"context"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
)
// TaskDiscoveryEngine discovers volumes that need maintenance tasks
type TaskDiscoveryEngine struct {
masterClient *wdclient.MasterClient
scanInterval time.Duration
ecDetector *ECDetector
vacuumDetector *VacuumDetector
}
// NewTaskDiscoveryEngine creates a new task discovery engine
func NewTaskDiscoveryEngine(masterClient *wdclient.MasterClient, scanInterval time.Duration) *TaskDiscoveryEngine {
return &TaskDiscoveryEngine{
masterClient: masterClient,
scanInterval: scanInterval,
ecDetector: NewECDetector(),
vacuumDetector: NewVacuumDetector(),
}
}
// ScanForTasks scans for volumes that need maintenance tasks
func (tde *TaskDiscoveryEngine) ScanForTasks() ([]*VolumeCandidate, error) {
var candidates []*VolumeCandidate
// Get cluster topology and volume information
volumeInfos, err := tde.getVolumeInformation()
if err != nil {
return nil, err
}
// Scan for EC candidates
ecCandidates, err := tde.ecDetector.DetectECCandidates(volumeInfos)
if err != nil {
glog.Errorf("EC detection failed: %v", err)
} else {
candidates = append(candidates, ecCandidates...)
}
// Scan for vacuum candidates
vacuumCandidates, err := tde.vacuumDetector.DetectVacuumCandidates(volumeInfos)
if err != nil {
glog.Errorf("Vacuum detection failed: %v", err)
} else {
candidates = append(candidates, vacuumCandidates...)
}
glog.V(1).Infof("Task discovery found %d candidates (%d EC, %d vacuum)",
len(candidates), len(ecCandidates), len(vacuumCandidates))
return candidates, nil
}
// getVolumeInformation retrieves volume information from master
func (tde *TaskDiscoveryEngine) getVolumeInformation() ([]*VolumeInfo, error) {
var volumeInfos []*VolumeInfo
err := tde.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{})
if err != nil {
return err
}
if resp.TopologyInfo != nil {
for _, dc := range resp.TopologyInfo.DataCenterInfos {
for _, rack := range dc.RackInfos {
for _, node := range rack.DataNodeInfos {
for _, diskInfo := range node.DiskInfos {
for _, volInfo := range diskInfo.VolumeInfos {
volumeInfo := &VolumeInfo{
ID: volInfo.Id,
Size: volInfo.Size,
Collection: volInfo.Collection,
FileCount: volInfo.FileCount,
DeleteCount: volInfo.DeleteCount,
DeletedByteCount: volInfo.DeletedByteCount,
ReadOnly: volInfo.ReadOnly,
Server: node.Id,
DataCenter: dc.Id,
Rack: rack.Id,
DiskType: volInfo.DiskType,
ModifiedAtSecond: volInfo.ModifiedAtSecond,
RemoteStorageKey: volInfo.RemoteStorageKey,
}
volumeInfos = append(volumeInfos, volumeInfo)
}
}
}
}
}
}
return nil
})
return volumeInfos, err
}
// VolumeInfo contains detailed volume information
type VolumeInfo struct {
ID uint32
Size uint64
Collection string
FileCount uint64
DeleteCount uint64
DeletedByteCount uint64
ReadOnly bool
Server string
DataCenter string
Rack string
DiskType string
ModifiedAtSecond int64
RemoteStorageKey string
}
// GetUtilization calculates volume utilization percentage
func (vi *VolumeInfo) GetUtilization() float64 {
if vi.Size == 0 {
return 0.0
}
// Assuming max volume size of 30GB
maxSize := uint64(30 * 1024 * 1024 * 1024)
return float64(vi.Size) / float64(maxSize) * 100.0
}
// GetGarbageRatio calculates the garbage ratio
func (vi *VolumeInfo) GetGarbageRatio() float64 {
if vi.Size == 0 {
return 0.0
}
return float64(vi.DeletedByteCount) / float64(vi.Size)
}
// GetIdleTime calculates how long the volume has been idle
func (vi *VolumeInfo) GetIdleTime() time.Duration {
lastModified := time.Unix(vi.ModifiedAtSecond, 0)
return time.Since(lastModified)
}
// IsECCandidate checks if volume is a candidate for EC
func (vi *VolumeInfo) IsECCandidate() bool {
return !vi.ReadOnly &&
vi.GetUtilization() >= 95.0 &&
vi.GetIdleTime() > time.Hour &&
vi.RemoteStorageKey == "" // Not already EC'd
}
// IsVacuumCandidate checks if volume is a candidate for vacuum
func (vi *VolumeInfo) IsVacuumCandidate() bool {
return !vi.ReadOnly &&
vi.GetGarbageRatio() >= 0.3 &&
vi.DeleteCount > 0
}

257
weed/admin/task/task_scheduler.go

@ -1,257 +0,0 @@
package task
import (
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TaskScheduler handles task assignment to workers
type TaskScheduler struct {
workerRegistry *WorkerRegistry
taskQueue *PriorityTaskQueue
mutex sync.RWMutex
}
// NewTaskScheduler creates a new task scheduler
func NewTaskScheduler(registry *WorkerRegistry, queue *PriorityTaskQueue) *TaskScheduler {
return &TaskScheduler{
workerRegistry: registry,
taskQueue: queue,
}
}
// GetNextTask gets the next suitable task for a worker
func (ts *TaskScheduler) GetNextTask(workerID string, capabilities []types.TaskType) *types.Task {
ts.mutex.RLock()
defer ts.mutex.RUnlock()
// Get worker info
_, exists := ts.workerRegistry.GetWorker(workerID)
if !exists {
return nil
}
// Check worker capabilities
capabilityMap := make(map[types.TaskType]bool)
for _, cap := range capabilities {
capabilityMap[cap] = true
}
// Find next suitable task
tasks := ts.taskQueue.GetTasks()
for _, task := range tasks {
// Check if worker can handle this task type
if !capabilityMap[task.Type] {
continue
}
// Check if task is ready to be scheduled
if !task.ScheduledAt.IsZero() && task.ScheduledAt.After(time.Now()) {
continue
}
// Additional checks can be added here
// (e.g., server affinity, resource requirements)
return task
}
return nil
}
// SelectWorker selects the best worker for a task
func (ts *TaskScheduler) SelectWorker(task *types.Task, availableWorkers []*types.Worker) *types.Worker {
ts.mutex.RLock()
defer ts.mutex.RUnlock()
var bestWorker *types.Worker
bestScore := -1.0
for _, worker := range availableWorkers {
// Check if worker supports this task type
if !ts.workerSupportsTask(worker, task.Type) {
continue
}
// Calculate selection score
score := ts.calculateSelectionScore(worker, task)
if bestWorker == nil || score > bestScore {
bestWorker = worker
bestScore = score
}
}
if bestWorker != nil {
glog.V(2).Infof("Selected worker %s for task %s (score: %.2f)", bestWorker.ID, task.Type, bestScore)
}
return bestWorker
}
// workerSupportsTask checks if a worker supports a task type
func (ts *TaskScheduler) workerSupportsTask(worker *types.Worker, taskType types.TaskType) bool {
for _, capability := range worker.Capabilities {
if capability == taskType {
return true
}
}
return false
}
// calculateSelectionScore calculates a score for worker selection
func (ts *TaskScheduler) calculateSelectionScore(worker *types.Worker, task *types.Task) float64 {
// Base score from worker registry
baseScore := ts.workerRegistry.calculateWorkerScore(worker)
// Task-specific adjustments
taskScore := baseScore
// Priority adjustment
switch task.Priority {
case types.TaskPriorityHigh:
taskScore *= 1.2 // Prefer high-performing workers for high-priority tasks
case types.TaskPriorityLow:
taskScore *= 0.9 // Low-priority tasks can use any available worker
}
// Server affinity bonus (if worker and volume are on same server)
if task.Server != "" && worker.Address == task.Server {
taskScore += 0.1
}
// Retry penalty (prefer different workers for retried tasks)
if task.RetryCount > 0 {
taskScore *= 0.8
}
return taskScore
}
// PriorityTaskQueue implements a priority queue for tasks
type PriorityTaskQueue struct {
tasks []*types.Task
mutex sync.RWMutex
}
// NewPriorityTaskQueue creates a new priority task queue
func NewPriorityTaskQueue() *PriorityTaskQueue {
return &PriorityTaskQueue{
tasks: make([]*types.Task, 0),
}
}
// Push adds a task to the queue
func (ptq *PriorityTaskQueue) Push(task *types.Task) {
ptq.mutex.Lock()
defer ptq.mutex.Unlock()
// Insert task in priority order (highest priority first)
inserted := false
for i, existingTask := range ptq.tasks {
if task.Priority > existingTask.Priority {
// Insert at position i
ptq.tasks = append(ptq.tasks[:i], append([]*types.Task{task}, ptq.tasks[i:]...)...)
inserted = true
break
}
}
if !inserted {
// Add to end
ptq.tasks = append(ptq.tasks, task)
}
glog.V(3).Infof("Added task %s to queue (priority: %d, queue size: %d)", task.ID, task.Priority, len(ptq.tasks))
}
// Pop removes and returns the highest priority task
func (ptq *PriorityTaskQueue) Pop() *types.Task {
ptq.mutex.Lock()
defer ptq.mutex.Unlock()
if len(ptq.tasks) == 0 {
return nil
}
task := ptq.tasks[0]
ptq.tasks = ptq.tasks[1:]
return task
}
// Peek returns the highest priority task without removing it
func (ptq *PriorityTaskQueue) Peek() *types.Task {
ptq.mutex.RLock()
defer ptq.mutex.RUnlock()
if len(ptq.tasks) == 0 {
return nil
}
return ptq.tasks[0]
}
// IsEmpty returns true if the queue is empty
func (ptq *PriorityTaskQueue) IsEmpty() bool {
ptq.mutex.RLock()
defer ptq.mutex.RUnlock()
return len(ptq.tasks) == 0
}
// Size returns the number of tasks in the queue
func (ptq *PriorityTaskQueue) Size() int {
ptq.mutex.RLock()
defer ptq.mutex.RUnlock()
return len(ptq.tasks)
}
// HasTask checks if a task exists for a volume and task type
func (ptq *PriorityTaskQueue) HasTask(volumeID uint32, taskType types.TaskType) bool {
ptq.mutex.RLock()
defer ptq.mutex.RUnlock()
for _, task := range ptq.tasks {
if task.VolumeID == volumeID && task.Type == taskType {
return true
}
}
return false
}
// GetTasks returns a copy of all tasks in the queue
func (ptq *PriorityTaskQueue) GetTasks() []*types.Task {
ptq.mutex.RLock()
defer ptq.mutex.RUnlock()
tasksCopy := make([]*types.Task, len(ptq.tasks))
copy(tasksCopy, ptq.tasks)
return tasksCopy
}
// RemoveTask removes a specific task from the queue
func (ptq *PriorityTaskQueue) RemoveTask(taskID string) bool {
ptq.mutex.Lock()
defer ptq.mutex.Unlock()
for i, task := range ptq.tasks {
if task.ID == taskID {
ptq.tasks = append(ptq.tasks[:i], ptq.tasks[i+1:]...)
glog.V(3).Infof("Removed task %s from queue", taskID)
return true
}
}
return false
}
// Clear removes all tasks from the queue
func (ptq *PriorityTaskQueue) Clear() {
ptq.mutex.Lock()
defer ptq.mutex.Unlock()
ptq.tasks = ptq.tasks[:0]
glog.V(3).Infof("Cleared task queue")
}

68
weed/admin/task/task_types.go

@ -1,68 +0,0 @@
package task
import (
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// InProgressTask represents a task currently being executed
type InProgressTask struct {
Task *types.Task
WorkerID string
StartedAt time.Time
LastUpdate time.Time
Progress float64
EstimatedEnd time.Time
VolumeReserved bool // Reserved for capacity planning
}
// VolumeCandidate represents a volume that needs maintenance
type VolumeCandidate struct {
VolumeID uint32
Server string
Collection string
TaskType types.TaskType
Priority types.TaskPriority
Reason string
DetectedAt time.Time
ScheduleAt time.Time
Parameters map[string]interface{}
}
// VolumeChange represents a volume state change
type VolumeChange struct {
VolumeID uint32
ChangeType ChangeType
OldCapacity int64
NewCapacity int64
TaskID string
CompletedAt time.Time
ReportedToMaster bool
}
// ChangeType represents the type of volume change
type ChangeType string
const (
ChangeTypeECEncoding ChangeType = "ec_encoding"
ChangeTypeVacuumComplete ChangeType = "vacuum_completed"
)
// WorkerMetrics represents performance metrics for a worker
type WorkerMetrics struct {
TasksCompleted int
TasksFailed int
AverageTaskTime time.Duration
LastTaskTime time.Time
SuccessRate float64
}
// VolumeReservation represents a reserved volume capacity
type VolumeReservation struct {
VolumeID uint32
TaskID string
ReservedAt time.Time
ExpectedEnd time.Time
CapacityDelta int64 // Expected change in capacity
}

640
weed/admin/task/volume_state_manager.go

@ -1,640 +0,0 @@
package task
import (
"context"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// VolumeStateManager provides comprehensive tracking of all volume and shard states
type VolumeStateManager struct {
masterClient *wdclient.MasterClient
volumes map[uint32]*VolumeState
ecShards map[uint32]*ECShardState // Key: VolumeID
inProgressTasks map[string]*TaskImpact // Key: TaskID
plannedOperations map[string]*PlannedOperation // Key: OperationID
capacityCache map[string]*CapacityInfo // Key: Server address
lastMasterSync time.Time
mutex sync.RWMutex
}
// VolumeState tracks comprehensive state of a volume
type VolumeState struct {
VolumeID uint32
CurrentState *VolumeInfo // Current state from master
InProgressTasks []*TaskImpact // Tasks currently affecting this volume
PlannedChanges []*PlannedOperation // Future operations planned
PredictedState *VolumeInfo // Predicted state after all operations
LastMasterUpdate time.Time
Inconsistencies []StateInconsistency
}
// ECShardState tracks EC shard information
type ECShardState struct {
VolumeID uint32
CurrentShards map[int]*ShardInfo // Current shards from master (0-13)
InProgressTasks []*TaskImpact // Tasks affecting shards
PlannedShards map[int]*PlannedShard // Planned shard operations
PredictedShards map[int]*ShardInfo // Predicted final state
LastUpdate time.Time
}
// ShardInfo represents information about an EC shard
type ShardInfo struct {
ShardID int
Server string
Size uint64
Status ShardStatus
LastUpdate time.Time
}
// ShardStatus represents the status of a shard
type ShardStatus string
const (
ShardStatusExists ShardStatus = "exists"
ShardStatusCreating ShardStatus = "creating"
ShardStatusDeleting ShardStatus = "deleting"
ShardStatusMissing ShardStatus = "missing"
ShardStatusCorrupted ShardStatus = "corrupted"
)
// TaskImpact describes how a task affects volume/shard state
type TaskImpact struct {
TaskID string
TaskType types.TaskType
VolumeID uint32
WorkerID string
StartedAt time.Time
EstimatedEnd time.Time
// Volume impacts
VolumeChanges *VolumeChanges
// Shard impacts
ShardChanges map[int]*ShardChange // Key: ShardID
// Capacity impacts
CapacityDelta map[string]int64 // Key: Server, Value: capacity change
}
// VolumeChanges describes changes to a volume
type VolumeChanges struct {
SizeChange int64
WillBeDeleted bool
WillBeCreated bool
WillBecomeReadOnly bool
CollectionChange string
DiskTypeChange string
}
// ShardChange describes changes to a shard
type ShardChange struct {
ShardID int
WillBeCreated bool
WillBeDeleted bool
TargetServer string
SizeChange int64
}
// PlannedOperation represents a future operation
type PlannedOperation struct {
OperationID string
Type OperationType
VolumeID uint32
ScheduledAt time.Time
Priority types.TaskPriority
Prerequisites []string // Other operation IDs that must complete first
Impact *TaskImpact
}
// OperationType represents different types of planned operations
type OperationType string
const (
OperationECEncode OperationType = "ec_encode"
OperationECRebuild OperationType = "ec_rebuild"
OperationECBalance OperationType = "ec_balance"
OperationVacuum OperationType = "vacuum"
OperationVolumeMove OperationType = "volume_move"
OperationShardMove OperationType = "shard_move"
OperationVolumeDelete OperationType = "volume_delete"
)
// CapacityInfo tracks server capacity information
type CapacityInfo struct {
Server string
TotalCapacity int64
UsedCapacity int64
ReservedCapacity int64 // Capacity reserved for in-progress tasks
PredictedUsage int64 // Predicted usage after all operations
LastUpdate time.Time
}
// StateInconsistency represents detected inconsistencies
type StateInconsistency struct {
Type InconsistencyType
Description string
DetectedAt time.Time
Severity SeverityLevel
VolumeID uint32
ShardID *int
}
// InconsistencyType represents different types of state inconsistencies
type InconsistencyType string
const (
InconsistencyVolumeMissing InconsistencyType = "volume_missing"
InconsistencyVolumeUnexpected InconsistencyType = "volume_unexpected"
InconsistencyShardMissing InconsistencyType = "shard_missing"
InconsistencyShardUnexpected InconsistencyType = "shard_unexpected"
InconsistencyCapacityMismatch InconsistencyType = "capacity_mismatch"
InconsistencyTaskOrphaned InconsistencyType = "task_orphaned"
InconsistencyDuplicateTask InconsistencyType = "duplicate_task"
)
// SeverityLevel represents the severity of an inconsistency
type SeverityLevel string
const (
SeverityLow SeverityLevel = "low"
SeverityMedium SeverityLevel = "medium"
SeverityHigh SeverityLevel = "high"
SeverityCritical SeverityLevel = "critical"
)
// NewVolumeStateManager creates a new volume state manager
func NewVolumeStateManager(masterClient *wdclient.MasterClient) *VolumeStateManager {
return &VolumeStateManager{
masterClient: masterClient,
volumes: make(map[uint32]*VolumeState),
ecShards: make(map[uint32]*ECShardState),
inProgressTasks: make(map[string]*TaskImpact),
plannedOperations: make(map[string]*PlannedOperation),
capacityCache: make(map[string]*CapacityInfo),
}
}
// SyncWithMaster synchronizes state with the master server
func (vsm *VolumeStateManager) SyncWithMaster() error {
vsm.mutex.Lock()
defer vsm.mutex.Unlock()
glog.V(2).Infof("Syncing volume state with master")
// Get current volume list from master
masterVolumes, masterShards, err := vsm.fetchMasterState()
if err != nil {
return err
}
// Update volume states
vsm.updateVolumeStates(masterVolumes)
// Update shard states
vsm.updateShardStates(masterShards)
// Detect inconsistencies
vsm.detectInconsistencies()
// Update capacity information
vsm.updateCapacityInfo()
// Recalculate predicted states
vsm.recalculatePredictedStates()
vsm.lastMasterSync = time.Now()
glog.V(2).Infof("Master sync completed, tracking %d volumes, %d EC volumes",
len(vsm.volumes), len(vsm.ecShards))
return nil
}
// RegisterTaskImpact registers the impact of a new task
func (vsm *VolumeStateManager) RegisterTaskImpact(taskID string, impact *TaskImpact) {
vsm.mutex.Lock()
defer vsm.mutex.Unlock()
vsm.inProgressTasks[taskID] = impact
// Update volume state
if volumeState, exists := vsm.volumes[impact.VolumeID]; exists {
volumeState.InProgressTasks = append(volumeState.InProgressTasks, impact)
}
// Update shard state for EC operations
if impact.TaskType == types.TaskTypeErasureCoding {
if shardState, exists := vsm.ecShards[impact.VolumeID]; exists {
shardState.InProgressTasks = append(shardState.InProgressTasks, impact)
}
}
// Update capacity reservations
for server, capacityDelta := range impact.CapacityDelta {
if capacity, exists := vsm.capacityCache[server]; exists {
capacity.ReservedCapacity += capacityDelta
}
}
// Recalculate predicted states
vsm.recalculatePredictedStates()
glog.V(2).Infof("Registered task impact: %s for volume %d", taskID, impact.VolumeID)
}
// UnregisterTaskImpact removes a completed task's impact
func (vsm *VolumeStateManager) UnregisterTaskImpact(taskID string) {
vsm.mutex.Lock()
defer vsm.mutex.Unlock()
impact, exists := vsm.inProgressTasks[taskID]
if !exists {
return
}
delete(vsm.inProgressTasks, taskID)
// Remove from volume state
if volumeState, exists := vsm.volumes[impact.VolumeID]; exists {
vsm.removeTaskFromVolume(volumeState, taskID)
}
// Remove from shard state
if shardState, exists := vsm.ecShards[impact.VolumeID]; exists {
vsm.removeTaskFromShards(shardState, taskID)
}
// Update capacity reservations
for server, capacityDelta := range impact.CapacityDelta {
if capacity, exists := vsm.capacityCache[server]; exists {
capacity.ReservedCapacity -= capacityDelta
}
}
// Recalculate predicted states
vsm.recalculatePredictedStates()
glog.V(2).Infof("Unregistered task impact: %s", taskID)
}
// GetAccurateCapacity returns accurate capacity information for a server
func (vsm *VolumeStateManager) GetAccurateCapacity(server string) *CapacityInfo {
vsm.mutex.RLock()
defer vsm.mutex.RUnlock()
if capacity, exists := vsm.capacityCache[server]; exists {
// Return a copy to avoid external modifications
return &CapacityInfo{
Server: capacity.Server,
TotalCapacity: capacity.TotalCapacity,
UsedCapacity: capacity.UsedCapacity,
ReservedCapacity: capacity.ReservedCapacity,
PredictedUsage: capacity.PredictedUsage,
LastUpdate: capacity.LastUpdate,
}
}
return nil
}
// GetVolumeState returns the current state of a volume
func (vsm *VolumeStateManager) GetVolumeState(volumeID uint32) *VolumeState {
vsm.mutex.RLock()
defer vsm.mutex.RUnlock()
if state, exists := vsm.volumes[volumeID]; exists {
// Return a copy to avoid external modifications
return vsm.copyVolumeState(state)
}
return nil
}
// GetECShardState returns the current state of EC shards for a volume
func (vsm *VolumeStateManager) GetECShardState(volumeID uint32) *ECShardState {
vsm.mutex.RLock()
defer vsm.mutex.RUnlock()
if state, exists := vsm.ecShards[volumeID]; exists {
return vsm.copyECShardState(state)
}
return nil
}
// CanAssignVolumeToServer checks if a volume can be assigned to a server
func (vsm *VolumeStateManager) CanAssignVolumeToServer(volumeSize int64, server string) bool {
vsm.mutex.RLock()
defer vsm.mutex.RUnlock()
capacity := vsm.capacityCache[server]
if capacity == nil {
return false
}
// Calculate available capacity: Total - Used - Reserved
availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity
return availableCapacity >= volumeSize
}
// PlanOperation schedules a future operation
func (vsm *VolumeStateManager) PlanOperation(operation *PlannedOperation) {
vsm.mutex.Lock()
defer vsm.mutex.Unlock()
vsm.plannedOperations[operation.OperationID] = operation
// Add to volume planned changes
if volumeState, exists := vsm.volumes[operation.VolumeID]; exists {
volumeState.PlannedChanges = append(volumeState.PlannedChanges, operation)
}
glog.V(2).Infof("Planned operation: %s for volume %d", operation.OperationID, operation.VolumeID)
}
// GetPendingChange returns pending change for a volume
func (vsm *VolumeStateManager) GetPendingChange(volumeID uint32) *VolumeChange {
vsm.mutex.RLock()
defer vsm.mutex.RUnlock()
// Look for pending changes in volume state
if volumeState, exists := vsm.volumes[volumeID]; exists {
// Return the most recent pending change
if len(volumeState.PlannedChanges) > 0 {
latestOp := volumeState.PlannedChanges[len(volumeState.PlannedChanges)-1]
if latestOp.Impact != nil && latestOp.Impact.VolumeChanges != nil {
return &VolumeChange{
VolumeID: volumeID,
ChangeType: ChangeType(latestOp.Type),
OldCapacity: int64(volumeState.CurrentState.Size),
NewCapacity: int64(volumeState.CurrentState.Size) + latestOp.Impact.VolumeChanges.SizeChange,
TaskID: latestOp.Impact.TaskID,
CompletedAt: time.Time{}, // Not completed yet
ReportedToMaster: false,
}
}
}
}
return nil
}
// fetchMasterState retrieves current state from master
func (vsm *VolumeStateManager) fetchMasterState() (map[uint32]*VolumeInfo, map[uint32]map[int]*ShardInfo, error) {
volumes := make(map[uint32]*VolumeInfo)
shards := make(map[uint32]map[int]*ShardInfo)
err := vsm.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
// Fetch volume list
resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{})
if err != nil {
return err
}
// Process topology info
if resp.TopologyInfo != nil {
for _, dc := range resp.TopologyInfo.DataCenterInfos {
for _, rack := range dc.RackInfos {
for _, node := range rack.DataNodeInfos {
for _, diskInfo := range node.DiskInfos {
// Process regular volumes
for _, volInfo := range diskInfo.VolumeInfos {
volumes[volInfo.Id] = &VolumeInfo{
ID: volInfo.Id,
Size: volInfo.Size,
Collection: volInfo.Collection,
FileCount: volInfo.FileCount,
DeleteCount: volInfo.DeleteCount,
DeletedByteCount: volInfo.DeletedByteCount,
ReadOnly: volInfo.ReadOnly,
Server: node.Id,
DataCenter: dc.Id,
Rack: rack.Id,
DiskType: volInfo.DiskType,
ModifiedAtSecond: volInfo.ModifiedAtSecond,
RemoteStorageKey: volInfo.RemoteStorageKey,
}
}
// Process EC shards
for _, ecShardInfo := range diskInfo.EcShardInfos {
volumeID := ecShardInfo.Id
if shards[volumeID] == nil {
shards[volumeID] = make(map[int]*ShardInfo)
}
// Decode shard bits
for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ {
if (ecShardInfo.EcIndexBits & (1 << uint(shardID))) != 0 {
shards[volumeID][shardID] = &ShardInfo{
ShardID: shardID,
Server: node.Id,
Size: 0, // Size would need to be fetched separately
Status: ShardStatusExists,
LastUpdate: time.Now(),
}
}
}
}
}
}
}
}
}
return nil
})
return volumes, shards, err
}
// updateVolumeStates updates volume states based on master data
func (vsm *VolumeStateManager) updateVolumeStates(masterVolumes map[uint32]*VolumeInfo) {
now := time.Now()
// Update existing volumes and add new ones
for volumeID, masterVolume := range masterVolumes {
if volumeState, exists := vsm.volumes[volumeID]; exists {
// Update existing volume
oldState := volumeState.CurrentState
volumeState.CurrentState = masterVolume
volumeState.LastMasterUpdate = now
// Check for unexpected changes
if oldState != nil && vsm.hasUnexpectedChanges(oldState, masterVolume) {
vsm.addInconsistency(volumeState, InconsistencyVolumeUnexpected,
"Volume changed unexpectedly since last sync", SeverityMedium)
}
} else {
// New volume detected
vsm.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: masterVolume,
InProgressTasks: []*TaskImpact{},
PlannedChanges: []*PlannedOperation{},
LastMasterUpdate: now,
Inconsistencies: []StateInconsistency{},
}
}
}
// Detect missing volumes (volumes we knew about but master doesn't report)
for volumeID, volumeState := range vsm.volumes {
if _, existsInMaster := masterVolumes[volumeID]; !existsInMaster {
// Check if this is expected (due to deletion task)
if !vsm.isVolumeDeletionExpected(volumeID) {
vsm.addInconsistency(volumeState, InconsistencyVolumeMissing,
"Volume missing from master but not expected to be deleted", SeverityHigh)
}
}
}
}
// updateShardStates updates EC shard states
func (vsm *VolumeStateManager) updateShardStates(masterShards map[uint32]map[int]*ShardInfo) {
now := time.Now()
// Update existing shard states
for volumeID, shardMap := range masterShards {
if shardState, exists := vsm.ecShards[volumeID]; exists {
shardState.CurrentShards = shardMap
shardState.LastUpdate = now
} else {
vsm.ecShards[volumeID] = &ECShardState{
VolumeID: volumeID,
CurrentShards: shardMap,
InProgressTasks: []*TaskImpact{},
PlannedShards: make(map[int]*PlannedShard),
PredictedShards: make(map[int]*ShardInfo),
LastUpdate: now,
}
}
}
// Check for missing shards that we expected to exist
for volumeID, shardState := range vsm.ecShards {
if masterShardMap, exists := masterShards[volumeID]; exists {
vsm.validateShardConsistency(shardState, masterShardMap)
}
}
}
// detectInconsistencies identifies state inconsistencies
func (vsm *VolumeStateManager) detectInconsistencies() {
for _, volumeState := range vsm.volumes {
vsm.detectVolumeInconsistencies(volumeState)
}
for _, shardState := range vsm.ecShards {
vsm.detectShardInconsistencies(shardState)
}
vsm.detectOrphanedTasks()
vsm.detectDuplicateTasks()
vsm.detectCapacityInconsistencies()
}
// updateCapacityInfo updates server capacity information
func (vsm *VolumeStateManager) updateCapacityInfo() {
for server := range vsm.capacityCache {
vsm.recalculateServerCapacity(server)
}
}
// recalculatePredictedStates recalculates predicted states after all operations
func (vsm *VolumeStateManager) recalculatePredictedStates() {
for _, volumeState := range vsm.volumes {
vsm.calculatePredictedVolumeState(volumeState)
}
for _, shardState := range vsm.ecShards {
vsm.calculatePredictedShardState(shardState)
}
}
// Helper methods (simplified implementations)
func (vsm *VolumeStateManager) hasUnexpectedChanges(old, new *VolumeInfo) bool {
return old.Size != new.Size || old.ReadOnly != new.ReadOnly
}
func (vsm *VolumeStateManager) isVolumeDeletionExpected(volumeID uint32) bool {
for _, impact := range vsm.inProgressTasks {
if impact.VolumeID == volumeID && impact.VolumeChanges != nil && impact.VolumeChanges.WillBeDeleted {
return true
}
}
return false
}
func (vsm *VolumeStateManager) addInconsistency(volumeState *VolumeState, incType InconsistencyType, desc string, severity SeverityLevel) {
inconsistency := StateInconsistency{
Type: incType,
Description: desc,
DetectedAt: time.Now(),
Severity: severity,
VolumeID: volumeState.VolumeID,
}
volumeState.Inconsistencies = append(volumeState.Inconsistencies, inconsistency)
glog.Warningf("State inconsistency detected for volume %d: %s", volumeState.VolumeID, desc)
}
func (vsm *VolumeStateManager) removeTaskFromVolume(volumeState *VolumeState, taskID string) {
for i, task := range volumeState.InProgressTasks {
if task.TaskID == taskID {
volumeState.InProgressTasks = append(volumeState.InProgressTasks[:i], volumeState.InProgressTasks[i+1:]...)
break
}
}
}
func (vsm *VolumeStateManager) removeTaskFromShards(shardState *ECShardState, taskID string) {
for i, task := range shardState.InProgressTasks {
if task.TaskID == taskID {
shardState.InProgressTasks = append(shardState.InProgressTasks[:i], shardState.InProgressTasks[i+1:]...)
break
}
}
}
func (vsm *VolumeStateManager) copyVolumeState(state *VolumeState) *VolumeState {
// Return a deep copy (implementation would be more detailed)
return &VolumeState{
VolumeID: state.VolumeID,
CurrentState: state.CurrentState,
LastMasterUpdate: state.LastMasterUpdate,
}
}
func (vsm *VolumeStateManager) copyECShardState(state *ECShardState) *ECShardState {
// Return a deep copy (implementation would be more detailed)
return &ECShardState{
VolumeID: state.VolumeID,
LastUpdate: state.LastUpdate,
}
}
// Placeholder implementations for consistency checking methods
func (vsm *VolumeStateManager) validateShardConsistency(shardState *ECShardState, masterShards map[int]*ShardInfo) {
}
func (vsm *VolumeStateManager) detectVolumeInconsistencies(volumeState *VolumeState) {}
func (vsm *VolumeStateManager) detectShardInconsistencies(shardState *ECShardState) {}
func (vsm *VolumeStateManager) detectOrphanedTasks() {}
func (vsm *VolumeStateManager) detectDuplicateTasks() {}
func (vsm *VolumeStateManager) detectCapacityInconsistencies() {}
func (vsm *VolumeStateManager) recalculateServerCapacity(server string) {}
func (vsm *VolumeStateManager) calculatePredictedVolumeState(volumeState *VolumeState) {}
func (vsm *VolumeStateManager) calculatePredictedShardState(shardState *ECShardState) {}
// PlannedShard represents a planned shard operation
type PlannedShard struct {
ShardID int
Operation string // "create", "delete", "move"
TargetServer string
ScheduledAt time.Time
}

440
weed/admin/task/volume_state_manager_test.go

@ -1,440 +0,0 @@
package task
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
func TestVolumeStateManager_RegisterTaskImpact(t *testing.T) {
vsm := NewVolumeStateManager(nil)
// Create test volume state
volumeID := uint32(1)
volumeState := &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{
ID: volumeID,
Size: 1024 * 1024 * 1024, // 1GB
},
InProgressTasks: []*TaskImpact{},
PlannedChanges: []*PlannedOperation{},
Inconsistencies: []StateInconsistency{},
}
vsm.volumes[volumeID] = volumeState
// Create task impact
impact := &TaskImpact{
TaskID: "test_task_1",
TaskType: types.TaskTypeErasureCoding,
VolumeID: volumeID,
WorkerID: "worker_1",
StartedAt: time.Now(),
EstimatedEnd: time.Now().Add(15 * time.Minute),
VolumeChanges: &VolumeChanges{
WillBecomeReadOnly: true,
},
ShardChanges: make(map[int]*ShardChange),
CapacityDelta: map[string]int64{"server1": 400 * 1024 * 1024}, // 400MB for shards
}
// Register impact
vsm.RegisterTaskImpact(impact.TaskID, impact)
// Verify impact was registered
if len(vsm.inProgressTasks) != 1 {
t.Errorf("Expected 1 in-progress task, got %d", len(vsm.inProgressTasks))
}
if len(volumeState.InProgressTasks) != 1 {
t.Errorf("Expected 1 task in volume state, got %d", len(volumeState.InProgressTasks))
}
// Verify task can be retrieved
retrievedImpact := vsm.inProgressTasks[impact.TaskID]
if retrievedImpact == nil {
t.Error("Task impact not found after registration")
}
if retrievedImpact.TaskType != types.TaskTypeErasureCoding {
t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, retrievedImpact.TaskType)
}
}
func TestVolumeStateManager_UnregisterTaskImpact(t *testing.T) {
vsm := NewVolumeStateManager(nil)
// Setup test data
volumeID := uint32(1)
taskID := "test_task_1"
volumeState := &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024},
InProgressTasks: []*TaskImpact{},
}
vsm.volumes[volumeID] = volumeState
impact := &TaskImpact{
TaskID: taskID,
TaskType: types.TaskTypeVacuum,
VolumeID: volumeID,
CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings
}
// Register then unregister
vsm.RegisterTaskImpact(taskID, impact)
vsm.UnregisterTaskImpact(taskID)
// Verify impact was removed
if len(vsm.inProgressTasks) != 0 {
t.Errorf("Expected 0 in-progress tasks, got %d", len(vsm.inProgressTasks))
}
if len(volumeState.InProgressTasks) != 0 {
t.Errorf("Expected 0 tasks in volume state, got %d", len(volumeState.InProgressTasks))
}
}
func TestVolumeStateManager_CanAssignVolumeToServer(t *testing.T) {
vsm := NewVolumeStateManager(nil)
// Setup server capacity
serverID := "test_server"
capacity := &CapacityInfo{
Server: serverID,
TotalCapacity: 10 * 1024 * 1024 * 1024, // 10GB
UsedCapacity: 3 * 1024 * 1024 * 1024, // 3GB used
ReservedCapacity: 1 * 1024 * 1024 * 1024, // 1GB reserved
PredictedUsage: 4 * 1024 * 1024 * 1024, // 4GB predicted total
}
vsm.capacityCache[serverID] = capacity
tests := []struct {
name string
volumeSize int64
expected bool
desc string
}{
{
name: "Small volume fits",
volumeSize: 1 * 1024 * 1024 * 1024, // 1GB
expected: true,
desc: "1GB volume should fit in 6GB available space",
},
{
name: "Large volume fits exactly",
volumeSize: 6 * 1024 * 1024 * 1024, // 6GB
expected: true,
desc: "6GB volume should fit exactly in available space",
},
{
name: "Volume too large",
volumeSize: 7 * 1024 * 1024 * 1024, // 7GB
expected: false,
desc: "7GB volume should not fit in 6GB available space",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := vsm.CanAssignVolumeToServer(tt.volumeSize, serverID)
if result != tt.expected {
t.Errorf("CanAssignVolumeToServer() = %v, want %v. %s", result, tt.expected, tt.desc)
}
})
}
}
func TestVolumeStateManager_GetPendingChange(t *testing.T) {
vsm := NewVolumeStateManager(nil)
volumeID := uint32(1)
// Create volume with planned operation
volumeState := &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{
ID: volumeID,
Size: 2 * 1024 * 1024 * 1024, // 2GB
},
PlannedChanges: []*PlannedOperation{
{
OperationID: "op_1",
Type: OperationVacuum,
VolumeID: volumeID,
Impact: &TaskImpact{
TaskID: "task_1",
VolumeChanges: &VolumeChanges{
SizeChange: -500 * 1024 * 1024, // 500MB reduction
},
},
},
},
}
vsm.volumes[volumeID] = volumeState
// Test getting pending change
change := vsm.GetPendingChange(volumeID)
if change == nil {
t.Fatal("Expected pending change, got nil")
}
if change.VolumeID != volumeID {
t.Errorf("Expected volume ID %d, got %d", volumeID, change.VolumeID)
}
expectedNewCapacity := int64(2*1024*1024*1024 - 500*1024*1024) // 2GB - 500MB
if change.NewCapacity != expectedNewCapacity {
t.Errorf("Expected new capacity %d, got %d", expectedNewCapacity, change.NewCapacity)
}
// Test no pending change
change2 := vsm.GetPendingChange(999) // Non-existent volume
if change2 != nil {
t.Error("Expected nil for non-existent volume, got change")
}
}
func TestVolumeStateManager_StateConsistency(t *testing.T) {
// Test that demonstrates the core value: accurate state tracking
vsm := NewVolumeStateManager(nil)
volumeID := uint32(1)
serverID := "test_server"
// Setup initial state
vsm.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{
ID: volumeID,
Size: 28 * 1024 * 1024 * 1024, // 28GB - ready for EC
Server: serverID,
},
InProgressTasks: []*TaskImpact{},
PlannedChanges: []*PlannedOperation{},
}
vsm.capacityCache[serverID] = &CapacityInfo{
Server: serverID,
TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used
PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used
}
// Step 1: Register EC task impact
ecImpact := &TaskImpact{
TaskID: "ec_task_1",
TaskType: types.TaskTypeErasureCoding,
VolumeID: volumeID,
VolumeChanges: &VolumeChanges{
WillBecomeReadOnly: true,
},
CapacityDelta: map[string]int64{
serverID: 12 * 1024 * 1024 * 1024, // 12GB for EC shards (40% overhead)
},
}
vsm.RegisterTaskImpact(ecImpact.TaskID, ecImpact)
// Verify capacity is reserved
capacity := vsm.GetAccurateCapacity(serverID)
expectedPredicted := int64(50 * 1024 * 1024 * 1024) // 50GB initially
if capacity.PredictedUsage != expectedPredicted {
t.Errorf("Expected predicted usage %d, got %d", expectedPredicted, capacity.PredictedUsage)
}
// Verify reservation is tracked separately
expectedReserved := int64(12 * 1024 * 1024 * 1024) // 12GB for EC shards
if capacity.ReservedCapacity != expectedReserved {
t.Errorf("Expected reserved capacity %d, got %d", expectedReserved, capacity.ReservedCapacity)
}
// Calculate available capacity correctly
availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity
// 100GB - 50GB - 12GB = 38GB available
expectedAvailable := int64(38 * 1024 * 1024 * 1024)
if availableCapacity != expectedAvailable {
t.Errorf("Expected available capacity %d, got %d", expectedAvailable, availableCapacity)
}
// Step 2: Check assignment logic - should reject new large volume
canAssign := vsm.CanAssignVolumeToServer(40*1024*1024*1024, serverID) // 40GB volume
if canAssign {
t.Error("Should not be able to assign 40GB volume when only 38GB available after reservations")
}
// Step 3: Complete EC task
vsm.UnregisterTaskImpact(ecImpact.TaskID)
// Verify capacity is updated correctly
capacityAfter := vsm.GetAccurateCapacity(serverID)
if capacityAfter.ReservedCapacity != 0 {
t.Errorf("Expected 0 reserved capacity after task completion, got %d", capacityAfter.ReservedCapacity)
}
t.Logf("✅ State consistency test passed - accurate capacity tracking throughout task lifecycle")
}
func TestVolumeStateManager_ConcurrentTasks(t *testing.T) {
// Test multiple concurrent tasks affecting capacity
vsm := NewVolumeStateManager(nil)
serverID := "test_server"
vsm.capacityCache[serverID] = &CapacityInfo{
Server: serverID,
TotalCapacity: 50 * 1024 * 1024 * 1024, // 50GB
UsedCapacity: 10 * 1024 * 1024 * 1024, // 10GB used
PredictedUsage: 10 * 1024 * 1024 * 1024, // Initially 10GB
}
// Register multiple tasks
tasks := []struct {
taskID string
volumeID uint32
capacityDelta int64
}{
{"ec_task_1", 1, 15 * 1024 * 1024 * 1024}, // 15GB for EC
{"vacuum_task_1", 2, -5 * 1024 * 1024 * 1024}, // 5GB savings
{"ec_task_2", 3, 20 * 1024 * 1024 * 1024}, // 20GB for EC
}
for _, task := range tasks {
// Setup volume state
vsm.volumes[task.volumeID] = &VolumeState{
VolumeID: task.volumeID,
CurrentState: &VolumeInfo{ID: task.volumeID, Size: 25 * 1024 * 1024 * 1024},
}
impact := &TaskImpact{
TaskID: task.taskID,
VolumeID: task.volumeID,
TaskType: types.TaskTypeErasureCoding,
CapacityDelta: map[string]int64{serverID: task.capacityDelta},
}
vsm.RegisterTaskImpact(task.taskID, impact)
}
// Check cumulative capacity impact
capacity := vsm.GetAccurateCapacity(serverID)
expectedPredicted := int64(10*1024*1024*1024 + 15*1024*1024*1024 - 5*1024*1024*1024 + 20*1024*1024*1024) // 40GB
if capacity.PredictedUsage != expectedPredicted {
t.Errorf("Expected predicted usage %d GB, got %d GB",
expectedPredicted/(1024*1024*1024), capacity.PredictedUsage/(1024*1024*1024))
}
// Verify we can't assign more than available
remainingCapacity := capacity.TotalCapacity - capacity.PredictedUsage
canAssign := vsm.CanAssignVolumeToServer(remainingCapacity+1, serverID)
if canAssign {
t.Error("Should not be able to assign volume larger than remaining capacity")
}
t.Logf("✅ Concurrent tasks test passed - accurate cumulative capacity tracking")
}
func TestVolumeStateManager_ECShardTracking(t *testing.T) {
vsm := NewVolumeStateManager(nil)
volumeID := uint32(1)
// Create EC shard state
shardState := &ECShardState{
VolumeID: volumeID,
CurrentShards: map[int]*ShardInfo{
0: {ShardID: 0, Server: "server1", Status: ShardStatusExists},
1: {ShardID: 1, Server: "server1", Status: ShardStatusExists},
2: {ShardID: 2, Server: "server2", Status: ShardStatusExists},
},
InProgressTasks: []*TaskImpact{},
PlannedShards: make(map[int]*PlannedShard),
PredictedShards: make(map[int]*ShardInfo),
}
vsm.ecShards[volumeID] = shardState
// Register task that will create more shards
impact := &TaskImpact{
TaskID: "ec_expand_task",
VolumeID: volumeID,
TaskType: types.TaskTypeErasureCoding,
ShardChanges: map[int]*ShardChange{
3: {ShardID: 3, WillBeCreated: true, TargetServer: "server3"},
4: {ShardID: 4, WillBeCreated: true, TargetServer: "server3"},
},
}
vsm.RegisterTaskImpact(impact.TaskID, impact)
// Verify shard state tracking
retrievedState := vsm.GetECShardState(volumeID)
if retrievedState == nil {
t.Fatal("Expected EC shard state, got nil")
}
if len(retrievedState.InProgressTasks) != 1 {
t.Errorf("Expected 1 in-progress task for shards, got %d", len(retrievedState.InProgressTasks))
}
// Verify current shards are still tracked
if len(retrievedState.CurrentShards) != 3 {
t.Errorf("Expected 3 current shards, got %d", len(retrievedState.CurrentShards))
}
t.Logf("✅ EC shard tracking test passed")
}
// Benchmark tests for performance
func BenchmarkVolumeStateManager_RegisterTaskImpact(b *testing.B) {
vsm := NewVolumeStateManager(nil)
// Setup test data
for i := 0; i < 1000; i++ {
volumeID := uint32(i + 1)
vsm.volumes[volumeID] = &VolumeState{
VolumeID: volumeID,
CurrentState: &VolumeInfo{ID: volumeID},
InProgressTasks: []*TaskImpact{},
}
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
impact := &TaskImpact{
TaskID: generateTaskID(),
VolumeID: uint32((i % 1000) + 1),
TaskType: types.TaskTypeVacuum,
CapacityDelta: map[string]int64{"server1": 1024 * 1024},
}
vsm.RegisterTaskImpact(impact.TaskID, impact)
vsm.UnregisterTaskImpact(impact.TaskID)
}
}
func BenchmarkVolumeStateManager_CanAssignVolumeToServer(b *testing.B) {
vsm := NewVolumeStateManager(nil)
// Setup capacity data
for i := 0; i < 100; i++ {
serverID := fmt.Sprintf("server_%d", i)
vsm.capacityCache[serverID] = &CapacityInfo{
Server: serverID,
TotalCapacity: 100 * 1024 * 1024 * 1024,
UsedCapacity: 50 * 1024 * 1024 * 1024,
PredictedUsage: 50 * 1024 * 1024 * 1024,
}
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
serverID := fmt.Sprintf("server_%d", i%100)
vsm.CanAssignVolumeToServer(1024*1024*1024, serverID)
}
}

226
weed/admin/task/volume_state_tracker.go

@ -1,226 +0,0 @@
package task
import (
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// VolumeStateTracker tracks volume state changes and reconciles with master
type VolumeStateTracker struct {
masterClient *wdclient.MasterClient
reconcileInterval time.Duration
reservedVolumes map[uint32]*VolumeReservation
pendingChanges map[uint32]*VolumeChange
mutex sync.RWMutex
}
// NewVolumeStateTracker creates a new volume state tracker
func NewVolumeStateTracker(masterClient *wdclient.MasterClient, reconcileInterval time.Duration) *VolumeStateTracker {
return &VolumeStateTracker{
masterClient: masterClient,
reconcileInterval: reconcileInterval,
reservedVolumes: make(map[uint32]*VolumeReservation),
pendingChanges: make(map[uint32]*VolumeChange),
}
}
// ReserveVolume reserves a volume for a task
func (vst *VolumeStateTracker) ReserveVolume(volumeID uint32, taskID string) {
vst.mutex.Lock()
defer vst.mutex.Unlock()
reservation := &VolumeReservation{
VolumeID: volumeID,
TaskID: taskID,
ReservedAt: time.Now(),
ExpectedEnd: time.Now().Add(15 * time.Minute), // Default 15 min estimate
CapacityDelta: 0, // Will be updated based on task type
}
vst.reservedVolumes[volumeID] = reservation
glog.V(2).Infof("Reserved volume %d for task %s", volumeID, taskID)
}
// ReleaseVolume releases a volume reservation
func (vst *VolumeStateTracker) ReleaseVolume(volumeID uint32, taskID string) {
vst.mutex.Lock()
defer vst.mutex.Unlock()
if reservation, exists := vst.reservedVolumes[volumeID]; exists {
if reservation.TaskID == taskID {
delete(vst.reservedVolumes, volumeID)
glog.V(2).Infof("Released volume %d reservation for task %s", volumeID, taskID)
}
}
}
// RecordVolumeChange records a completed volume change
func (vst *VolumeStateTracker) RecordVolumeChange(volumeID uint32, taskType types.TaskType, taskID string) {
vst.mutex.Lock()
defer vst.mutex.Unlock()
changeType := ChangeTypeECEncoding
if taskType == types.TaskTypeVacuum {
changeType = ChangeTypeVacuumComplete
}
change := &VolumeChange{
VolumeID: volumeID,
ChangeType: changeType,
TaskID: taskID,
CompletedAt: time.Now(),
ReportedToMaster: false,
}
vst.pendingChanges[volumeID] = change
glog.V(1).Infof("Recorded volume change for volume %d: %s", volumeID, changeType)
}
// GetPendingChange returns pending change for a volume
func (vst *VolumeStateTracker) GetPendingChange(volumeID uint32) *VolumeChange {
vst.mutex.RLock()
defer vst.mutex.RUnlock()
return vst.pendingChanges[volumeID]
}
// GetVolumeReservation returns reservation for a volume
func (vst *VolumeStateTracker) GetVolumeReservation(volumeID uint32) *VolumeReservation {
vst.mutex.RLock()
defer vst.mutex.RUnlock()
return vst.reservedVolumes[volumeID]
}
// IsVolumeReserved checks if a volume is reserved
func (vst *VolumeStateTracker) IsVolumeReserved(volumeID uint32) bool {
vst.mutex.RLock()
defer vst.mutex.RUnlock()
_, exists := vst.reservedVolumes[volumeID]
return exists
}
// ReconcileWithMaster reconciles volume states with master server
func (vst *VolumeStateTracker) ReconcileWithMaster() {
vst.mutex.Lock()
defer vst.mutex.Unlock()
// Report pending changes to master
for volumeID, change := range vst.pendingChanges {
if vst.reportChangeToMaster(change) {
change.ReportedToMaster = true
delete(vst.pendingChanges, volumeID)
glog.V(1).Infof("Successfully reported volume change for volume %d to master", volumeID)
}
}
// Clean up expired reservations
vst.cleanupExpiredReservations()
}
// reportChangeToMaster reports a volume change to the master server
func (vst *VolumeStateTracker) reportChangeToMaster(change *VolumeChange) bool {
// Note: In a real implementation, this would make actual API calls to master
// For now, we'll simulate the reporting
switch change.ChangeType {
case ChangeTypeECEncoding:
return vst.reportECCompletion(change)
case ChangeTypeVacuumComplete:
return vst.reportVacuumCompletion(change)
}
return false
}
// reportECCompletion reports EC completion to master
func (vst *VolumeStateTracker) reportECCompletion(change *VolumeChange) bool {
// This would typically trigger the master to:
// 1. Update volume state to reflect EC encoding
// 2. Update capacity calculations
// 3. Redistribute volume assignments
glog.V(2).Infof("Reporting EC completion for volume %d", change.VolumeID)
// Simulate master API call
err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
// In real implementation, there would be a specific API call here
// For now, we simulate success
return nil
})
return err == nil
}
// reportVacuumCompletion reports vacuum completion to master
func (vst *VolumeStateTracker) reportVacuumCompletion(change *VolumeChange) bool {
// This would typically trigger the master to:
// 1. Update volume statistics
// 2. Update capacity calculations
// 3. Mark volume as recently vacuumed
glog.V(2).Infof("Reporting vacuum completion for volume %d", change.VolumeID)
// Simulate master API call
err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
// In real implementation, there would be a specific API call here
// For now, we simulate success
return nil
})
return err == nil
}
// cleanupExpiredReservations removes expired volume reservations
func (vst *VolumeStateTracker) cleanupExpiredReservations() {
now := time.Now()
for volumeID, reservation := range vst.reservedVolumes {
if now.After(reservation.ExpectedEnd) {
delete(vst.reservedVolumes, volumeID)
glog.Warningf("Cleaned up expired reservation for volume %d (task %s)", volumeID, reservation.TaskID)
}
}
}
// GetAdjustedCapacity returns adjusted capacity considering in-progress tasks
func (vst *VolumeStateTracker) GetAdjustedCapacity(volumeID uint32, baseCapacity int64) int64 {
vst.mutex.RLock()
defer vst.mutex.RUnlock()
// Check for pending changes
if change := vst.pendingChanges[volumeID]; change != nil {
return change.NewCapacity
}
// Check for in-progress reservations
if reservation := vst.reservedVolumes[volumeID]; reservation != nil {
return baseCapacity + reservation.CapacityDelta
}
return baseCapacity
}
// GetStats returns statistics about volume state tracking
func (vst *VolumeStateTracker) GetStats() map[string]interface{} {
vst.mutex.RLock()
defer vst.mutex.RUnlock()
stats := make(map[string]interface{})
stats["reserved_volumes"] = len(vst.reservedVolumes)
stats["pending_changes"] = len(vst.pendingChanges)
changeTypeCounts := make(map[ChangeType]int)
for _, change := range vst.pendingChanges {
changeTypeCounts[change.ChangeType]++
}
stats["pending_by_type"] = changeTypeCounts
return stats
}

488
weed/admin/task/worker_communication.go

@ -1,488 +0,0 @@
package task
import (
"context"
"fmt"
"io"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
)
// WorkerConnection manages the gRPC connection to a single worker
type WorkerConnection struct {
workerID string
address string
conn *grpc.ClientConn
client worker_pb.WorkerServiceClient
stream worker_pb.WorkerService_WorkerStreamClient
lastSeen time.Time
mutex sync.RWMutex
adminServer *AdminServer
stopCh chan struct{}
active bool
}
// WorkerCommunicationManager manages all worker connections
type WorkerCommunicationManager struct {
adminServer *AdminServer
connections map[string]*WorkerConnection
mutex sync.RWMutex
stopCh chan struct{}
}
// NewWorkerCommunicationManager creates a new worker communication manager
func NewWorkerCommunicationManager(adminServer *AdminServer) *WorkerCommunicationManager {
return &WorkerCommunicationManager{
adminServer: adminServer,
connections: make(map[string]*WorkerConnection),
stopCh: make(chan struct{}),
}
}
// Start starts the worker communication manager
func (wcm *WorkerCommunicationManager) Start() {
glog.Infof("Starting worker communication manager")
go wcm.connectionMonitorLoop()
}
// Stop stops the worker communication manager
func (wcm *WorkerCommunicationManager) Stop() {
glog.Infof("Stopping worker communication manager")
close(wcm.stopCh)
wcm.mutex.Lock()
defer wcm.mutex.Unlock()
for _, conn := range wcm.connections {
conn.Close()
}
}
// EstablishWorkerConnection establishes a connection to a worker
func (wcm *WorkerCommunicationManager) EstablishWorkerConnection(workerID, address string) error {
wcm.mutex.Lock()
defer wcm.mutex.Unlock()
// Check if already connected
if conn, exists := wcm.connections[workerID]; exists {
if conn.active {
return nil // Already connected
}
conn.Close() // Close inactive connection
}
// Create new connection
conn, err := NewWorkerConnection(workerID, address, wcm.adminServer)
if err != nil {
return fmt.Errorf("failed to create worker connection: %v", err)
}
wcm.connections[workerID] = conn
// Start connection
go conn.Start()
glog.Infof("Established connection to worker %s at %s", workerID, address)
return nil
}
// SendTaskAssignment sends a task assignment to a worker
func (wcm *WorkerCommunicationManager) SendTaskAssignment(workerID string, task *Task) error {
wcm.mutex.RLock()
conn, exists := wcm.connections[workerID]
wcm.mutex.RUnlock()
if !exists || !conn.active {
return fmt.Errorf("no active connection to worker %s", workerID)
}
return conn.SendTaskAssignment(task)
}
// CancelTask sends a task cancellation to a worker
func (wcm *WorkerCommunicationManager) CancelTask(workerID, taskID string, reason string) error {
wcm.mutex.RLock()
conn, exists := wcm.connections[workerID]
wcm.mutex.RUnlock()
if !exists || !conn.active {
return fmt.Errorf("no active connection to worker %s", workerID)
}
return conn.CancelTask(taskID, reason)
}
// GetActiveConnections returns the list of active worker connections
func (wcm *WorkerCommunicationManager) GetActiveConnections() []string {
wcm.mutex.RLock()
defer wcm.mutex.RUnlock()
var active []string
for workerID, conn := range wcm.connections {
if conn.active {
active = append(active, workerID)
}
}
return active
}
// connectionMonitorLoop monitors worker connections and cleans up inactive ones
func (wcm *WorkerCommunicationManager) connectionMonitorLoop() {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
wcm.cleanupInactiveConnections()
case <-wcm.stopCh:
return
}
}
}
// cleanupInactiveConnections removes inactive worker connections
func (wcm *WorkerCommunicationManager) cleanupInactiveConnections() {
wcm.mutex.Lock()
defer wcm.mutex.Unlock()
now := time.Now()
timeout := 2 * time.Minute
for workerID, conn := range wcm.connections {
if !conn.active || now.Sub(conn.lastSeen) > timeout {
glog.Infof("Cleaning up inactive connection to worker %s", workerID)
conn.Close()
delete(wcm.connections, workerID)
// Mark worker as inactive in registry
wcm.adminServer.workerRegistry.MarkWorkerInactive(workerID)
}
}
}
// NewWorkerConnection creates a new worker connection
func NewWorkerConnection(workerID, address string, adminServer *AdminServer) (*WorkerConnection, error) {
// Convert address to gRPC address
grpcAddress := pb.ServerToGrpcAddress(address)
conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
if err != nil {
return nil, fmt.Errorf("failed to connect to worker at %s: %v", address, err)
}
client := worker_pb.NewWorkerServiceClient(conn)
return &WorkerConnection{
workerID: workerID,
address: address,
conn: conn,
client: client,
lastSeen: time.Now(),
adminServer: adminServer,
stopCh: make(chan struct{}),
active: false,
}, nil
}
// Start starts the worker connection and message handling
func (wc *WorkerConnection) Start() {
defer wc.Close()
ctx := context.Background()
stream, err := wc.client.WorkerStream(ctx)
if err != nil {
glog.Errorf("Failed to create worker stream for %s: %v", wc.workerID, err)
return
}
wc.stream = stream
wc.active = true
glog.Infof("Worker connection %s started", wc.workerID)
// Start message handling goroutines
go wc.receiveMessages()
// Keep connection alive until stopped
<-wc.stopCh
}
// Close closes the worker connection
func (wc *WorkerConnection) Close() {
wc.mutex.Lock()
defer wc.mutex.Unlock()
if !wc.active {
return
}
wc.active = false
close(wc.stopCh)
if wc.stream != nil {
wc.stream.CloseSend()
}
if wc.conn != nil {
wc.conn.Close()
}
glog.Infof("Worker connection %s closed", wc.workerID)
}
// receiveMessages handles incoming messages from the worker
func (wc *WorkerConnection) receiveMessages() {
for {
select {
case <-wc.stopCh:
return
default:
}
msg, err := wc.stream.Recv()
if err != nil {
if err == io.EOF {
glog.Infof("Worker %s closed connection", wc.workerID)
} else {
glog.Errorf("Error receiving from worker %s: %v", wc.workerID, err)
}
wc.Close()
return
}
wc.updateLastSeen()
// Convert AdminMessage to WorkerMessage for processing
if workerMsg := convertToWorkerMessage(msg); workerMsg != nil {
wc.handleMessage(workerMsg)
}
}
}
// updateLastSeen updates the last seen timestamp
func (wc *WorkerConnection) updateLastSeen() {
wc.mutex.Lock()
defer wc.mutex.Unlock()
wc.lastSeen = time.Now()
}
// handleMessage processes a message from the worker
func (wc *WorkerConnection) handleMessage(msg *worker_pb.WorkerMessage) {
switch message := msg.Message.(type) {
case *worker_pb.WorkerMessage_Registration:
registration := message.Registration
worker := &Worker{
ID: registration.WorkerId,
Address: registration.Address,
Capabilities: registration.Capabilities,
}
wc.workerID = worker.ID
// UpdateWorkerStatus stub
if wc.adminServer.workerRegistry != nil {
// wc.adminServer.workerRegistry.UpdateWorkerStatus(worker) // Commented out - method doesn't exist
}
glog.Infof("Worker %s registered", worker.ID)
case *worker_pb.WorkerMessage_Heartbeat:
glog.V(3).Infof("Heartbeat from worker %s", wc.workerID)
case *worker_pb.WorkerMessage_TaskRequest:
glog.V(2).Infof("Task request from worker %s", wc.workerID)
// AssignTaskToWorker stub
// task := wc.adminServer.AssignTaskToWorker(wc.workerID) // Commented out - method doesn't exist
case *worker_pb.WorkerMessage_TaskUpdate:
update := message.TaskUpdate
// UpdateTaskProgress stub - fix signature
wc.adminServer.UpdateTaskProgress(update.TaskId, float64(update.Progress))
case *worker_pb.WorkerMessage_TaskComplete:
complete := message.TaskComplete
// CompleteTask stub - fix signature
wc.adminServer.CompleteTask(complete.TaskId, complete.Success, complete.ErrorMessage)
case *worker_pb.WorkerMessage_Shutdown:
glog.Infof("Worker %s shutting down", wc.workerID)
wc.Close()
}
}
// SendTaskAssignment sends a task assignment to the worker
func (wc *WorkerConnection) SendTaskAssignment(task *Task) error {
return wc.sendTaskAssignment(task)
}
// sendTaskAssignment sends a task assignment message
func (wc *WorkerConnection) sendTaskAssignment(task *types.Task) error {
// Fix type assertions for parameters
server, _ := task.Parameters["server"].(string)
collection, _ := task.Parameters["collection"].(string)
// Convert map[string]interface{} to map[string]string
parameters := make(map[string]string)
for k, v := range task.Parameters {
if str, ok := v.(string); ok {
parameters[k] = str
} else {
parameters[k] = fmt.Sprintf("%v", v)
}
}
// Add master_client parameter for tasks that need it (especially EC tasks)
if wc.adminServer.masterClient != nil {
if currentMaster := wc.adminServer.masterClient.GetMaster(context.Background()); currentMaster != "" {
parameters["master_client"] = string(currentMaster)
glog.V(2).Infof("Added master_client parameter to task %s: %s", task.ID, currentMaster)
} else {
glog.Warningf("No master address available for task %s", task.ID)
}
}
assignment := &worker_pb.TaskAssignment{
TaskId: task.ID,
TaskType: string(task.Type),
Priority: int32(task.Priority),
CreatedTime: task.CreatedAt.Unix(),
Params: &worker_pb.TaskParams{
VolumeId: task.VolumeID,
Server: server,
Collection: collection,
Parameters: parameters,
},
Metadata: map[string]string{
"assigned_at": time.Now().Format(time.RFC3339),
},
}
response := &worker_pb.AdminMessage{
AdminId: wc.adminServer.ID,
Timestamp: time.Now().Unix(),
Message: &worker_pb.AdminMessage_TaskAssignment{
TaskAssignment: assignment,
},
}
return wc.sendMessage(response)
}
// CancelTask sends a task cancellation to the worker
func (wc *WorkerConnection) CancelTask(taskID, reason string) error {
cancellation := &worker_pb.TaskCancellation{
TaskId: taskID,
Reason: reason,
Force: false,
}
response := &worker_pb.AdminMessage{
AdminId: wc.adminServer.ID,
Timestamp: time.Now().Unix(),
Message: &worker_pb.AdminMessage_TaskCancellation{
TaskCancellation: cancellation,
},
}
return wc.sendMessage(response)
}
// sendMessage sends a message to the worker
func (wc *WorkerConnection) sendMessage(msg *worker_pb.AdminMessage) error {
wc.mutex.RLock()
defer wc.mutex.RUnlock()
if !wc.active || wc.stream == nil {
return fmt.Errorf("connection to worker %s is not active", wc.workerID)
}
// The stream expects WorkerMessage from client (admin) to server (worker)
// Convert AdminMessage to appropriate WorkerMessage format
workerMsg := &worker_pb.WorkerMessage{
WorkerId: wc.workerID,
Timestamp: msg.Timestamp,
}
// Convert AdminMessage content to WorkerMessage based on message type
switch adminMsg := msg.Message.(type) {
case *worker_pb.AdminMessage_TaskAssignment:
// Task assignments should be sent as notifications to worker
// Since there's no direct equivalent, we'll create a generic message
// In a full implementation, this would need proper message type mapping
_ = adminMsg // Use the variable to avoid unused warning
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
Heartbeat: &worker_pb.WorkerHeartbeat{
WorkerId: wc.workerID,
Status: "task_assigned",
},
}
case *worker_pb.AdminMessage_TaskCancellation:
// Similar conversion for task cancellation
_ = adminMsg // Use the variable to avoid unused warning
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
Heartbeat: &worker_pb.WorkerHeartbeat{
WorkerId: wc.workerID,
Status: "task_cancelled",
},
}
default:
// For other message types, send a generic heartbeat
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
Heartbeat: &worker_pb.WorkerHeartbeat{
WorkerId: wc.workerID,
Status: "admin_message",
},
}
}
return wc.stream.Send(workerMsg)
}
// Helper functions
// convertCapabilities converts string capabilities to TaskType slice
func convertCapabilities(capabilities []string) []TaskType {
var result []TaskType
for _, cap := range capabilities {
result = append(result, TaskType(cap))
}
return result
}
// WorkerStatus represents worker status information
type WorkerStatus struct {
Status string
CurrentLoad int
MaxConcurrent int
CurrentTasks []string
TasksCompleted int
TasksFailed int
UptimeSeconds int64
LastSeen time.Time
}
// TaskProgress represents task progress information
type TaskProgress struct {
Progress float64
Message string
}
// TaskResult represents task completion result
type TaskResult struct {
Success bool
Error string
Message string
}
// convertToWorkerMessage converts AdminMessage to WorkerMessage (stub implementation)
func convertToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage {
// This is a stub - in real implementation would need proper conversion
// For now, return nil to avoid processing
return nil
}

348
weed/admin/task/worker_registry.go

@ -1,348 +0,0 @@
package task
import (
"fmt"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// WorkerRegistry manages worker registration and tracking
type WorkerRegistry struct {
workers map[string]*types.Worker
capabilities map[types.TaskType][]*types.Worker
metrics map[string]*WorkerMetrics
issues map[string][]WorkerIssue
mutex sync.RWMutex
}
// WorkerIssue represents an issue with a worker
type WorkerIssue struct {
Type string
Timestamp time.Time
Details string
}
// NewWorkerRegistry creates a new worker registry
func NewWorkerRegistry() *WorkerRegistry {
return &WorkerRegistry{
workers: make(map[string]*types.Worker),
capabilities: make(map[types.TaskType][]*types.Worker),
metrics: make(map[string]*WorkerMetrics),
issues: make(map[string][]WorkerIssue),
}
}
// RegisterWorker registers a new worker
func (wr *WorkerRegistry) RegisterWorker(worker *types.Worker) error {
wr.mutex.Lock()
defer wr.mutex.Unlock()
if _, exists := wr.workers[worker.ID]; exists {
return fmt.Errorf("worker %s already registered", worker.ID)
}
// Register worker
wr.workers[worker.ID] = worker
// Initialize metrics
wr.metrics[worker.ID] = &WorkerMetrics{
TasksCompleted: 0,
TasksFailed: 0,
AverageTaskTime: 0,
LastTaskTime: time.Time{},
SuccessRate: 1.0,
}
// Update capabilities mapping
wr.updateCapabilitiesMapping()
glog.Infof("Registered worker %s with capabilities: %v", worker.ID, worker.Capabilities)
return nil
}
// UnregisterWorker removes a worker
func (wr *WorkerRegistry) UnregisterWorker(workerID string) error {
wr.mutex.Lock()
defer wr.mutex.Unlock()
if _, exists := wr.workers[workerID]; !exists {
return fmt.Errorf("worker %s not found", workerID)
}
delete(wr.workers, workerID)
delete(wr.metrics, workerID)
delete(wr.issues, workerID)
// Update capabilities mapping
wr.updateCapabilitiesMapping()
glog.Infof("Unregistered worker %s", workerID)
return nil
}
// GetWorker returns a worker by ID
func (wr *WorkerRegistry) GetWorker(workerID string) (*types.Worker, bool) {
wr.mutex.RLock()
defer wr.mutex.RUnlock()
worker, exists := wr.workers[workerID]
return worker, exists
}
// GetAvailableWorkers returns workers that are available for new tasks
func (wr *WorkerRegistry) GetAvailableWorkers() []*types.Worker {
wr.mutex.RLock()
defer wr.mutex.RUnlock()
var available []*types.Worker
for _, worker := range wr.workers {
if worker.Status == "active" && worker.CurrentLoad < worker.MaxConcurrent {
available = append(available, worker)
}
}
return available
}
// GetWorkersByCapability returns workers that support a specific capability
func (wr *WorkerRegistry) GetWorkersByCapability(taskType types.TaskType) []*types.Worker {
wr.mutex.RLock()
defer wr.mutex.RUnlock()
return wr.capabilities[taskType]
}
// UpdateWorkerHeartbeat updates worker heartbeat and status
func (wr *WorkerRegistry) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
wr.mutex.Lock()
defer wr.mutex.Unlock()
worker, exists := wr.workers[workerID]
if !exists {
return fmt.Errorf("worker %s not found", workerID)
}
// Update worker status
worker.LastHeartbeat = time.Now()
worker.Status = status.Status
worker.CurrentLoad = status.CurrentLoad
glog.V(3).Infof("Updated heartbeat for worker %s, status: %s, load: %d/%d",
workerID, status.Status, status.CurrentLoad, worker.MaxConcurrent)
return nil
}
// GetTimedOutWorkers returns workers that haven't sent heartbeat within timeout
func (wr *WorkerRegistry) GetTimedOutWorkers(timeout time.Duration) []string {
wr.mutex.RLock()
defer wr.mutex.RUnlock()
var timedOut []string
cutoff := time.Now().Add(-timeout)
for workerID, worker := range wr.workers {
if worker.LastHeartbeat.Before(cutoff) {
timedOut = append(timedOut, workerID)
}
}
return timedOut
}
// MarkWorkerInactive marks a worker as inactive
func (wr *WorkerRegistry) MarkWorkerInactive(workerID string) {
wr.mutex.Lock()
defer wr.mutex.Unlock()
if worker, exists := wr.workers[workerID]; exists {
worker.Status = "inactive"
worker.CurrentLoad = 0
}
}
// RecordWorkerIssue records an issue with a worker
func (wr *WorkerRegistry) RecordWorkerIssue(workerID string, issueType string) {
wr.mutex.Lock()
defer wr.mutex.Unlock()
issue := WorkerIssue{
Type: issueType,
Timestamp: time.Now(),
Details: fmt.Sprintf("Worker issue: %s", issueType),
}
wr.issues[workerID] = append(wr.issues[workerID], issue)
// Limit issue history to last 10 issues
if len(wr.issues[workerID]) > 10 {
wr.issues[workerID] = wr.issues[workerID][1:]
}
glog.Warningf("Recorded issue for worker %s: %s", workerID, issueType)
}
// GetWorkerMetrics returns metrics for a worker
func (wr *WorkerRegistry) GetWorkerMetrics(workerID string) *WorkerMetrics {
wr.mutex.RLock()
defer wr.mutex.RUnlock()
return wr.metrics[workerID]
}
// UpdateWorkerMetrics updates performance metrics for a worker
func (wr *WorkerRegistry) UpdateWorkerMetrics(workerID string, taskDuration time.Duration, success bool) {
wr.mutex.Lock()
defer wr.mutex.Unlock()
metrics, exists := wr.metrics[workerID]
if !exists {
return
}
if success {
metrics.TasksCompleted++
} else {
metrics.TasksFailed++
}
metrics.LastTaskTime = time.Now()
// Update average task time
totalTasks := metrics.TasksCompleted + metrics.TasksFailed
if totalTasks > 0 {
oldAvg := metrics.AverageTaskTime
metrics.AverageTaskTime = time.Duration(
(float64(oldAvg)*float64(totalTasks-1) + float64(taskDuration)) / float64(totalTasks),
)
}
// Update success rate
if totalTasks > 0 {
metrics.SuccessRate = float64(metrics.TasksCompleted) / float64(totalTasks)
}
}
// GetBestWorkerForTask returns the best worker for a specific task type
func (wr *WorkerRegistry) GetBestWorkerForTask(taskType types.TaskType) *types.Worker {
wr.mutex.RLock()
defer wr.mutex.RUnlock()
candidates := wr.capabilities[taskType]
if len(candidates) == 0 {
return nil
}
var bestWorker *types.Worker
bestScore := -1.0
for _, worker := range candidates {
// Skip if not available
if worker.Status != "active" || worker.CurrentLoad >= worker.MaxConcurrent {
continue
}
// Calculate score based on multiple factors
score := wr.calculateWorkerScore(worker)
if bestWorker == nil || score > bestScore {
bestWorker = worker
bestScore = score
}
}
return bestWorker
}
// calculateWorkerScore calculates a score for worker selection
func (wr *WorkerRegistry) calculateWorkerScore(worker *types.Worker) float64 {
metrics := wr.metrics[worker.ID]
if metrics == nil {
return 0.5 // Default score for new workers
}
// Factors for scoring:
// 1. Available capacity (0.0 to 1.0)
capacityScore := float64(worker.MaxConcurrent-worker.CurrentLoad) / float64(worker.MaxConcurrent)
// 2. Success rate (0.0 to 1.0)
successScore := metrics.SuccessRate
// 3. Recent activity bonus (workers that completed tasks recently get slight bonus)
activityScore := 0.0
if !metrics.LastTaskTime.IsZero() && time.Since(metrics.LastTaskTime) < time.Hour {
activityScore = 0.1
}
// 4. Issue penalty (workers with recent issues get penalty)
issuePenalty := 0.0
if issues, exists := wr.issues[worker.ID]; exists {
recentIssues := 0
cutoff := time.Now().Add(-time.Hour)
for _, issue := range issues {
if issue.Timestamp.After(cutoff) {
recentIssues++
}
}
issuePenalty = float64(recentIssues) * 0.1
}
// Weighted average
score := (capacityScore*0.4 + successScore*0.4 + activityScore) - issuePenalty
if score < 0 {
score = 0
}
if score > 1 {
score = 1
}
return score
}
// updateCapabilitiesMapping rebuilds the capabilities mapping
func (wr *WorkerRegistry) updateCapabilitiesMapping() {
// Clear existing mapping
for taskType := range wr.capabilities {
wr.capabilities[taskType] = nil
}
// Rebuild mapping
for _, worker := range wr.workers {
for _, capability := range worker.Capabilities {
wr.capabilities[capability] = append(wr.capabilities[capability], worker)
}
}
}
// GetRegistryStats returns statistics about the registry
func (wr *WorkerRegistry) GetRegistryStats() map[string]interface{} {
wr.mutex.RLock()
defer wr.mutex.RUnlock()
stats := make(map[string]interface{})
stats["total_workers"] = len(wr.workers)
statusCounts := make(map[string]int)
capabilityCounts := make(map[types.TaskType]int)
totalLoad := 0
maxCapacity := 0
for _, worker := range wr.workers {
statusCounts[worker.Status]++
totalLoad += worker.CurrentLoad
maxCapacity += worker.MaxConcurrent
for _, capability := range worker.Capabilities {
capabilityCounts[capability]++
}
}
stats["by_status"] = statusCounts
stats["by_capability"] = capabilityCounts
stats["total_load"] = totalLoad
stats["max_capacity"] = maxCapacity
stats["utilization"] = float64(totalLoad) / float64(maxCapacity) * 100.0
return stats
}

324
weed/admin/task_minimal/admin_server.go

@ -1,324 +0,0 @@
package task
import (
"fmt"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// AdminConfig contains configuration for the admin server
type AdminConfig struct {
ScanInterval time.Duration
WorkerTimeout time.Duration
TaskTimeout time.Duration
MaxRetries int
ReconcileInterval time.Duration
EnableFailureRecovery bool
MaxConcurrentTasks int
}
// AdminServer manages workers and tasks
type AdminServer struct {
config *AdminConfig
masterClient *wdclient.MasterClient
running bool
mutex sync.RWMutex
// Task management
tasks map[string]*types.Task
taskQueue []*types.Task
activeTasks map[string]*types.Task
// Worker management
workers map[string]*types.Worker
workerStatus map[string]*types.WorkerStatus
// Task history
taskHistory []TaskHistoryEntry
}
// TaskHistoryEntry represents a single task history entry
type TaskHistoryEntry struct {
TaskID string
TaskType types.TaskType
VolumeID uint32
WorkerID string
Status types.TaskStatus
StartedAt time.Time
CompletedAt time.Time
Duration time.Duration
ErrorMessage string
}
// SystemStats represents system statistics
type SystemStats struct {
ActiveTasks int
QueuedTasks int
ActiveWorkers int
TotalTasks int
}
// NewAdminServer creates a new admin server
func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer {
return &AdminServer{
config: config,
masterClient: masterClient,
tasks: make(map[string]*types.Task),
taskQueue: make([]*types.Task, 0),
activeTasks: make(map[string]*types.Task),
workers: make(map[string]*types.Worker),
workerStatus: make(map[string]*types.WorkerStatus),
taskHistory: make([]TaskHistoryEntry, 0),
}
}
// Start starts the admin server
func (as *AdminServer) Start() error {
as.mutex.Lock()
defer as.mutex.Unlock()
if as.running {
return fmt.Errorf("admin server is already running")
}
as.running = true
return nil
}
// Stop stops the admin server
func (as *AdminServer) Stop() error {
as.mutex.Lock()
defer as.mutex.Unlock()
as.running = false
return nil
}
// RegisterWorker registers a new worker
func (as *AdminServer) RegisterWorker(worker *types.Worker) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
as.workers[worker.ID] = worker
as.workerStatus[worker.ID] = &types.WorkerStatus{
Status: "active",
CurrentLoad: 0,
}
return nil
}
// QueueTask adds a new task to the task queue
func (as *AdminServer) QueueTask(task *types.Task) error {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return fmt.Errorf("admin server is not running")
}
if task.ID == "" {
task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano())
}
task.Status = types.TaskStatusPending
task.CreatedAt = time.Now()
as.tasks[task.ID] = task
as.taskQueue = append(as.taskQueue, task)
return nil
}
// RequestTask requests a task for a worker
func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) {
as.mutex.Lock()
defer as.mutex.Unlock()
if !as.running {
return nil, fmt.Errorf("admin server is not running")
}
// Check if worker exists
worker, exists := as.workers[workerID]
if !exists {
return nil, fmt.Errorf("worker %s not found", workerID)
}
// Check if worker has capacity
status := as.workerStatus[workerID]
if status.CurrentLoad >= worker.MaxConcurrent {
return nil, nil // No capacity
}
// Find a suitable task
for i, task := range as.taskQueue {
if task.Status != types.TaskStatusPending {
continue
}
// Check if worker can handle this task type
canHandle := false
for _, capability := range capabilities {
if task.Type == capability {
canHandle = true
break
}
}
if canHandle {
// Assign task to worker
task.Status = types.TaskStatusInProgress
task.WorkerID = workerID
now := time.Now()
task.StartedAt = &now
// Move task from queue to active tasks
as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...)
as.activeTasks[task.ID] = task
// Update worker load
status.CurrentLoad++
return task, nil
}
}
return nil, nil // No suitable task found
}
// UpdateTaskProgress updates task progress
func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error {
as.mutex.Lock()
defer as.mutex.Unlock()
task, exists := as.tasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
task.Progress = progress
return nil
}
// CompleteTask marks a task as completed
func (as *AdminServer) CompleteTask(taskID string, success bool, errorMessage string) error {
as.mutex.Lock()
defer as.mutex.Unlock()
task, exists := as.tasks[taskID]
if !exists {
return fmt.Errorf("task %s not found", taskID)
}
// Update task status
if success {
task.Status = types.TaskStatusCompleted
} else {
task.Status = types.TaskStatusFailed
task.Error = errorMessage
}
now := time.Now()
task.CompletedAt = &now
// Remove from active tasks
delete(as.activeTasks, taskID)
// Update worker load
if task.WorkerID != "" {
if status, exists := as.workerStatus[task.WorkerID]; exists {
status.CurrentLoad--
}
}
// Add to history
var duration time.Duration
if task.StartedAt != nil {
duration = now.Sub(*task.StartedAt)
}
entry := TaskHistoryEntry{
TaskID: task.ID,
TaskType: task.Type,
VolumeID: task.VolumeID,
WorkerID: task.WorkerID,
Status: task.Status,
StartedAt: *task.StartedAt,
CompletedAt: now,
Duration: duration,
ErrorMessage: errorMessage,
}
as.taskHistory = append(as.taskHistory, entry)
return nil
}
// UpdateWorkerHeartbeat updates worker heartbeat
func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error {
as.mutex.Lock()
defer as.mutex.Unlock()
worker, exists := as.workers[workerID]
if !exists {
return fmt.Errorf("worker %s not found", workerID)
}
worker.LastHeartbeat = time.Now()
as.workerStatus[workerID] = status
return nil
}
// GetSystemStats returns system statistics
func (as *AdminServer) GetSystemStats() *SystemStats {
as.mutex.RLock()
defer as.mutex.RUnlock()
activeWorkers := 0
for _, status := range as.workerStatus {
if status.Status == "active" {
activeWorkers++
}
}
return &SystemStats{
ActiveTasks: len(as.activeTasks),
QueuedTasks: len(as.taskQueue),
ActiveWorkers: activeWorkers,
TotalTasks: len(as.tasks),
}
}
// GetQueuedTaskCount returns the number of queued tasks
func (as *AdminServer) GetQueuedTaskCount() int {
as.mutex.RLock()
defer as.mutex.RUnlock()
return len(as.taskQueue)
}
// GetActiveTaskCount returns the number of active tasks
func (as *AdminServer) GetActiveTaskCount() int {
as.mutex.RLock()
defer as.mutex.RUnlock()
return len(as.activeTasks)
}
// GetTaskHistory returns task history
func (as *AdminServer) GetTaskHistory() []TaskHistoryEntry {
as.mutex.RLock()
defer as.mutex.RUnlock()
// Return a copy of the history
history := make([]TaskHistoryEntry, len(as.taskHistory))
copy(history, as.taskHistory)
return history
}

3
weed/admin/task_minimal/go.mod

@ -1,3 +0,0 @@
module task_minimal
go 1.24.1

233
weed/admin/task_minimal/integration_test.go

@ -1,233 +0,0 @@
package task
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies
func TestSimpleIntegration(t *testing.T) {
t.Logf("Starting simple integration test")
// Step 1: Create a minimal admin server configuration
config := &AdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
// Step 2: Create admin server with nil master client (for testing)
adminServer := NewAdminServer(config, nil)
// Step 3: Start admin server
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Step 4: Test worker registration
t.Logf("Testing worker registration")
worker := &types.Worker{
ID: "test-worker-1",
Address: "localhost:9001",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 2,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
t.Logf("Successfully registered worker %s", worker.ID)
// Step 5: Test task queueing
t.Logf("Testing task queueing")
task := &types.Task{
ID: "test-task-1",
Type: types.TaskTypeVacuum,
VolumeID: 1001,
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Fatalf("Failed to queue task: %v", err)
}
t.Logf("Successfully queued task %s", task.ID)
// Step 6: Test task request by worker
t.Logf("Testing task request")
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum})
if err != nil {
t.Fatalf("Failed to request task: %v", err)
}
if assignedTask != nil {
t.Logf("Successfully assigned task %s to worker", assignedTask.ID)
// Step 7: Test task progress updates
t.Logf("Testing task progress updates")
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0)
if err != nil {
t.Errorf("Failed to update task progress: %v", err)
}
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0)
if err != nil {
t.Errorf("Failed to update task progress: %v", err)
}
// Step 8: Test task completion
t.Logf("Testing task completion")
err = adminServer.CompleteTask(assignedTask.ID, true, "")
if err != nil {
t.Errorf("Failed to complete task: %v", err)
}
t.Logf("Successfully completed task %s", assignedTask.ID)
} else {
t.Logf("No task was assigned (queue might be empty)")
}
// Step 9: Test basic metrics
t.Logf("Testing basic metrics")
stats := adminServer.GetSystemStats()
if stats != nil {
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d",
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers)
}
queuedCount := adminServer.GetQueuedTaskCount()
activeCount := adminServer.GetActiveTaskCount()
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount)
// Step 10: Test task history
history := adminServer.GetTaskHistory()
t.Logf("Task history contains %d entries", len(history))
t.Logf("Simple integration test completed successfully")
}
// TestWorkerHeartbeat tests worker heartbeat functionality
func TestWorkerHeartbeat(t *testing.T) {
t.Logf("Testing worker heartbeat")
config := &AdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Register a worker
worker := &types.Worker{
ID: "heartbeat-worker",
Address: "localhost:9002",
Capabilities: []types.TaskType{types.TaskTypeVacuum},
MaxConcurrent: 1,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
err = adminServer.RegisterWorker(worker)
if err != nil {
t.Fatalf("Failed to register worker: %v", err)
}
// Test heartbeat update
status := &types.WorkerStatus{
Status: "active",
CurrentLoad: 0,
}
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status)
if err != nil {
t.Errorf("Failed to update worker heartbeat: %v", err)
}
t.Logf("Worker heartbeat test completed successfully")
}
// TestTaskQueueOperations tests task queue operations
func TestTaskQueueOperations(t *testing.T) {
t.Logf("Testing task queue operations")
config := &AdminConfig{
ScanInterval: 10 * time.Second,
WorkerTimeout: 30 * time.Second,
TaskTimeout: 2 * time.Hour,
MaxRetries: 3,
ReconcileInterval: 5 * time.Minute,
EnableFailureRecovery: true,
MaxConcurrentTasks: 5,
}
adminServer := NewAdminServer(config, nil)
err := adminServer.Start()
if err != nil {
t.Fatalf("Failed to start admin server: %v", err)
}
defer adminServer.Stop()
// Test queuing multiple tasks
for i := 0; i < 3; i++ {
task := &types.Task{
ID: fmt.Sprintf("queue-test-task-%d", i),
Type: types.TaskTypeVacuum,
VolumeID: uint32(2000 + i),
Server: "localhost:8080",
Status: types.TaskStatusPending,
Priority: types.TaskPriorityNormal,
Parameters: map[string]interface{}{
"garbage_threshold": "0.3",
},
CreatedAt: time.Now(),
}
err = adminServer.QueueTask(task)
if err != nil {
t.Errorf("Failed to queue task %d: %v", i, err)
}
}
// Check queue size
queuedCount := adminServer.GetQueuedTaskCount()
if queuedCount != 3 {
t.Errorf("Expected 3 queued tasks, got %d", queuedCount)
}
t.Logf("Task queue operations test completed successfully")
}
Loading…
Cancel
Save