34 changed files with 0 additions and 11477 deletions
-
699weed/admin/task/admin_server.go
-
524weed/admin/task/admin_server_test.go
-
90weed/admin/task/compilation_stubs.go
-
309weed/admin/task/ec_integration_test.go
-
324weed/admin/task/ec_test_standalone/enhanced_ec_integration_test.go
-
3weed/admin/task/ec_test_standalone/go.mod
-
324weed/admin/task/ec_test_standalone/minimal_admin_server.go
-
434weed/admin/task/ec_test_standalone/minimal_integration_test.go
-
488weed/admin/task/ec_worker_test.go
-
346weed/admin/task/example_usage.go
-
123weed/admin/task/failure_handler.go
-
486weed/admin/task/master_sync.go
-
324weed/admin/task/minimal_admin_server.go
-
434weed/admin/task/minimal_integration_test.go
-
197weed/admin/task/operational_integration_test.go
-
233weed/admin/task/simple_integration_test.go
-
604weed/admin/task/simulation.go
-
695weed/admin/task/simulation/comprehensive_simulation.go
-
444weed/admin/task/simulation/comprehensive_simulation_test.go
-
294weed/admin/task/simulation/simulation_runner.go
-
237weed/admin/task/simulation/system_demo_test.go
-
509weed/admin/task/task_assignment_test.go
-
168weed/admin/task/task_detectors.go
-
161weed/admin/task/task_discovery.go
-
257weed/admin/task/task_scheduler.go
-
68weed/admin/task/task_types.go
-
640weed/admin/task/volume_state_manager.go
-
440weed/admin/task/volume_state_manager_test.go
-
226weed/admin/task/volume_state_tracker.go
-
488weed/admin/task/worker_communication.go
-
348weed/admin/task/worker_registry.go
-
324weed/admin/task_minimal/admin_server.go
-
3weed/admin/task_minimal/go.mod
-
233weed/admin/task_minimal/integration_test.go
@ -1,699 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"math/rand" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TaskHistory represents task execution history
|
|
||||
type TaskHistory struct { |
|
||||
entries []TaskHistoryEntry |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// TaskHistoryEntry represents a single task history entry
|
|
||||
type TaskHistoryEntry struct { |
|
||||
TaskID string |
|
||||
TaskType types.TaskType |
|
||||
VolumeID uint32 |
|
||||
WorkerID string |
|
||||
Status types.TaskStatus |
|
||||
StartedAt time.Time |
|
||||
CompletedAt time.Time |
|
||||
Duration time.Duration |
|
||||
ErrorMessage string |
|
||||
} |
|
||||
|
|
||||
// NewTaskHistory creates a new task history
|
|
||||
func NewTaskHistory() *TaskHistory { |
|
||||
return &TaskHistory{ |
|
||||
entries: make([]TaskHistoryEntry, 0), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// AddEntry adds a new task history entry
|
|
||||
func (th *TaskHistory) AddEntry(entry TaskHistoryEntry) { |
|
||||
th.mutex.Lock() |
|
||||
defer th.mutex.Unlock() |
|
||||
|
|
||||
th.entries = append(th.entries, entry) |
|
||||
|
|
||||
// Keep only the last 1000 entries
|
|
||||
if len(th.entries) > 1000 { |
|
||||
th.entries = th.entries[len(th.entries)-1000:] |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetRecentEntries returns the most recent entries
|
|
||||
func (th *TaskHistory) GetRecentEntries(limit int) []*TaskHistoryEntry { |
|
||||
th.mutex.RLock() |
|
||||
defer th.mutex.RUnlock() |
|
||||
|
|
||||
start := len(th.entries) - limit |
|
||||
if start < 0 { |
|
||||
start = 0 |
|
||||
} |
|
||||
|
|
||||
result := make([]*TaskHistoryEntry, len(th.entries)-start) |
|
||||
for i, entry := range th.entries[start:] { |
|
||||
entryCopy := entry |
|
||||
result[i] = &entryCopy |
|
||||
} |
|
||||
|
|
||||
return result |
|
||||
} |
|
||||
|
|
||||
// AdminServer manages task distribution and worker coordination
|
|
||||
type AdminServer struct { |
|
||||
ID string |
|
||||
Config *AdminConfig |
|
||||
masterClient *wdclient.MasterClient |
|
||||
volumeStateManager *VolumeStateManager |
|
||||
workerRegistry *WorkerRegistry |
|
||||
taskQueue *PriorityTaskQueue |
|
||||
taskScheduler *TaskScheduler |
|
||||
taskHistory *TaskHistory |
|
||||
failureHandler *FailureHandler |
|
||||
masterSync *MasterSynchronizer |
|
||||
workerComm *WorkerCommunicationManager |
|
||||
running bool |
|
||||
stopCh chan struct{} |
|
||||
mutex sync.RWMutex |
|
||||
|
|
||||
// Task tracking
|
|
||||
activeTasks map[string]*InProgressTask |
|
||||
tasksMutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// AdminConfig holds configuration for the admin server
|
|
||||
type AdminConfig struct { |
|
||||
ScanInterval time.Duration |
|
||||
WorkerTimeout time.Duration |
|
||||
TaskTimeout time.Duration |
|
||||
MaxRetries int |
|
||||
ReconcileInterval time.Duration |
|
||||
EnableFailureRecovery bool |
|
||||
MaxConcurrentTasks int |
|
||||
} |
|
||||
|
|
||||
// NewAdminServer creates a new admin server instance
|
|
||||
func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer { |
|
||||
adminServer := &AdminServer{ |
|
||||
ID: generateAdminServerID(), |
|
||||
Config: config, |
|
||||
masterClient: masterClient, |
|
||||
volumeStateManager: NewVolumeStateManager(masterClient), |
|
||||
workerRegistry: NewWorkerRegistry(), |
|
||||
taskQueue: NewPriorityTaskQueue(), |
|
||||
taskHistory: NewTaskHistory(), |
|
||||
failureHandler: NewFailureHandler(config), |
|
||||
activeTasks: make(map[string]*InProgressTask), |
|
||||
stopCh: make(chan struct{}), |
|
||||
} |
|
||||
|
|
||||
// Initialize components that depend on admin server
|
|
||||
adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue) |
|
||||
adminServer.masterSync = NewMasterSynchronizer(masterClient, adminServer.volumeStateManager, adminServer) |
|
||||
adminServer.workerComm = NewWorkerCommunicationManager(adminServer) |
|
||||
|
|
||||
glog.Infof("Created admin server %s", adminServer.ID) |
|
||||
return adminServer |
|
||||
} |
|
||||
|
|
||||
// Start starts the admin server
|
|
||||
func (as *AdminServer) Start() error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if as.running { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Starting admin server %s", as.ID) |
|
||||
|
|
||||
// Start components
|
|
||||
as.masterSync.Start() |
|
||||
as.workerComm.Start() |
|
||||
|
|
||||
// Start background loops
|
|
||||
go as.taskAssignmentLoop() |
|
||||
go as.taskMonitoringLoop() |
|
||||
go as.reconciliationLoop() |
|
||||
go as.metricsLoop() |
|
||||
|
|
||||
as.running = true |
|
||||
glog.Infof("Admin server %s started successfully", as.ID) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Stop stops the admin server
|
|
||||
func (as *AdminServer) Stop() { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Stopping admin server %s", as.ID) |
|
||||
|
|
||||
close(as.stopCh) |
|
||||
|
|
||||
// Stop components
|
|
||||
as.masterSync.Stop() |
|
||||
as.workerComm.Stop() |
|
||||
|
|
||||
as.running = false |
|
||||
glog.Infof("Admin server %s stopped", as.ID) |
|
||||
} |
|
||||
|
|
||||
// RegisterWorker registers a new worker
|
|
||||
func (as *AdminServer) RegisterWorker(worker *types.Worker) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
return as.workerRegistry.RegisterWorker(worker) |
|
||||
} |
|
||||
|
|
||||
// UnregisterWorker removes a worker
|
|
||||
func (as *AdminServer) UnregisterWorker(workerID string) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
// Reschedule any tasks assigned to this worker
|
|
||||
for taskID, task := range as.activeTasks { |
|
||||
if task.WorkerID == workerID { |
|
||||
glog.Warningf("Rescheduling task %s due to worker %s unregistration", taskID, workerID) |
|
||||
as.ReassignTask(taskID, "worker unregistration") |
|
||||
delete(as.activeTasks, taskID) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return as.workerRegistry.UnregisterWorker(workerID) |
|
||||
} |
|
||||
|
|
||||
// UpdateWorkerHeartbeat updates worker heartbeat
|
|
||||
func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
return as.workerRegistry.UpdateWorkerHeartbeat(workerID, status) |
|
||||
} |
|
||||
|
|
||||
// RequestTask handles task requests from workers
|
|
||||
func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return nil, fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
worker, exists := as.workerRegistry.GetWorker(workerID) |
|
||||
if !exists { |
|
||||
return nil, fmt.Errorf("worker %s not registered", workerID) |
|
||||
} |
|
||||
|
|
||||
// Check if worker has capacity
|
|
||||
if worker.CurrentLoad >= worker.MaxConcurrent { |
|
||||
return nil, nil // No capacity
|
|
||||
} |
|
||||
|
|
||||
// Get next task for this worker
|
|
||||
task := as.taskScheduler.GetNextTask(workerID, capabilities) |
|
||||
if task == nil { |
|
||||
return nil, nil // No suitable tasks
|
|
||||
} |
|
||||
|
|
||||
// Check if volume can be assigned (using comprehensive state management)
|
|
||||
if !as.canAssignTask(task, workerID) { |
|
||||
return nil, nil // Cannot assign due to capacity or state constraints
|
|
||||
} |
|
||||
|
|
||||
// Assign task to worker
|
|
||||
inProgressTask := &InProgressTask{ |
|
||||
Task: task, |
|
||||
WorkerID: workerID, |
|
||||
StartedAt: time.Now(), |
|
||||
LastUpdate: time.Now(), |
|
||||
Progress: 0.0, |
|
||||
EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)), |
|
||||
} |
|
||||
|
|
||||
as.activeTasks[task.ID] = inProgressTask |
|
||||
worker.CurrentLoad++ |
|
||||
|
|
||||
// Register task impact with state manager
|
|
||||
impact := as.createTaskImpact(task) |
|
||||
as.volumeStateManager.RegisterTaskImpact(task.ID, impact) |
|
||||
inProgressTask.VolumeReserved = true |
|
||||
|
|
||||
glog.V(1).Infof("Assigned task %s to worker %s", task.ID, workerID) |
|
||||
return task, nil |
|
||||
} |
|
||||
|
|
||||
// UpdateTaskProgress updates task progress
|
|
||||
func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error { |
|
||||
as.tasksMutex.Lock() |
|
||||
defer as.tasksMutex.Unlock() |
|
||||
|
|
||||
inProgressTask, exists := as.activeTasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
inProgressTask.Progress = progress |
|
||||
inProgressTask.LastUpdate = time.Now() |
|
||||
|
|
||||
glog.V(2).Infof("Task %s progress: %.1f%%", taskID, progress) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// CompleteTask marks a task as completed
|
|
||||
func (as *AdminServer) CompleteTask(taskID string, success bool, errorMsg string) error { |
|
||||
as.tasksMutex.Lock() |
|
||||
defer as.tasksMutex.Unlock() |
|
||||
|
|
||||
inProgressTask, exists := as.activeTasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
// Remove from active tasks
|
|
||||
delete(as.activeTasks, taskID) |
|
||||
|
|
||||
// Update worker load
|
|
||||
if worker, exists := as.workerRegistry.GetWorker(inProgressTask.WorkerID); exists { |
|
||||
worker.CurrentLoad-- |
|
||||
} |
|
||||
|
|
||||
// Unregister task impact
|
|
||||
as.volumeStateManager.UnregisterTaskImpact(taskID) |
|
||||
|
|
||||
// Record in task history
|
|
||||
status := types.TaskStatusCompleted |
|
||||
if !success { |
|
||||
status = types.TaskStatusFailed |
|
||||
} |
|
||||
|
|
||||
as.taskHistory.AddEntry(TaskHistoryEntry{ |
|
||||
TaskID: taskID, |
|
||||
TaskType: inProgressTask.Task.Type, |
|
||||
VolumeID: inProgressTask.Task.VolumeID, |
|
||||
WorkerID: inProgressTask.WorkerID, |
|
||||
Status: status, |
|
||||
StartedAt: inProgressTask.StartedAt, |
|
||||
CompletedAt: time.Now(), |
|
||||
Duration: time.Since(inProgressTask.StartedAt), |
|
||||
ErrorMessage: errorMsg, |
|
||||
}) |
|
||||
|
|
||||
glog.Infof("Task %s completed: success=%v", taskID, success) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// QueueTask adds a new task to the task queue
|
|
||||
func (as *AdminServer) QueueTask(task *types.Task) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
// Validate the task
|
|
||||
if task == nil { |
|
||||
return fmt.Errorf("task cannot be nil") |
|
||||
} |
|
||||
|
|
||||
if task.ID == "" { |
|
||||
task.ID = generateTaskID() |
|
||||
} |
|
||||
|
|
||||
// Set creation timestamp if not set
|
|
||||
if task.CreatedAt.IsZero() { |
|
||||
task.CreatedAt = time.Now() |
|
||||
} |
|
||||
|
|
||||
// Check if task for this volume is already queued or in progress
|
|
||||
if as.isVolumeAlreadyQueued(task.VolumeID, task.Type) { |
|
||||
glog.V(2).Infof("Task for volume %d already queued or in progress, skipping", task.VolumeID) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Add to task queue
|
|
||||
as.taskQueue.Push(task) |
|
||||
|
|
||||
glog.V(1).Infof("Queued task %s (%s) for volume %d with priority %v", |
|
||||
task.ID, task.Type, task.VolumeID, task.Priority) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Helper methods
|
|
||||
|
|
||||
// canAssignTask checks if a task can be assigned to a worker
|
|
||||
func (as *AdminServer) canAssignTask(task *types.Task, workerID string) bool { |
|
||||
worker, exists := as.workerRegistry.GetWorker(workerID) |
|
||||
if !exists { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Check worker capacity
|
|
||||
if worker.CurrentLoad >= worker.MaxConcurrent { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Check if worker has required capability
|
|
||||
hasCapability := false |
|
||||
for _, cap := range worker.Capabilities { |
|
||||
if cap == task.Type { |
|
||||
hasCapability = true |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
if !hasCapability { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
return true |
|
||||
} |
|
||||
|
|
||||
// createTaskImpact creates a TaskImpact for the given task
|
|
||||
func (as *AdminServer) createTaskImpact(task *types.Task) *TaskImpact { |
|
||||
impact := &TaskImpact{ |
|
||||
TaskID: task.ID, |
|
||||
VolumeID: task.VolumeID, |
|
||||
TaskType: task.Type, |
|
||||
StartedAt: time.Now(), |
|
||||
EstimatedEnd: time.Now().Add(as.estimateTaskDuration(task)), |
|
||||
CapacityDelta: make(map[string]int64), |
|
||||
VolumeChanges: &VolumeChanges{}, |
|
||||
ShardChanges: make(map[int]*ShardChange), |
|
||||
} |
|
||||
|
|
||||
// Set task-specific impacts
|
|
||||
switch task.Type { |
|
||||
case types.TaskTypeErasureCoding: |
|
||||
impact.VolumeChanges.WillBecomeReadOnly = true |
|
||||
impact.EstimatedEnd = time.Now().Add(2 * time.Hour) // EC takes longer
|
|
||||
|
|
||||
// EC encoding requires temporary space
|
|
||||
if server, ok := task.Parameters["server"]; ok { |
|
||||
if serverStr, ok := server.(string); ok { |
|
||||
volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID) |
|
||||
if volumeState != nil && volumeState.CurrentState != nil { |
|
||||
// Estimate 2x volume size needed temporarily
|
|
||||
impact.CapacityDelta[serverStr] = int64(volumeState.CurrentState.Size * 2) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
case types.TaskTypeVacuum: |
|
||||
// Vacuum reduces volume size
|
|
||||
if server, ok := task.Parameters["server"]; ok { |
|
||||
if serverStr, ok := server.(string); ok { |
|
||||
// Estimate 30% space reclamation
|
|
||||
volumeState := as.volumeStateManager.GetVolumeState(task.VolumeID) |
|
||||
if volumeState != nil && volumeState.CurrentState != nil { |
|
||||
impact.CapacityDelta[serverStr] = -int64(float64(volumeState.CurrentState.Size) * 0.3) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return impact |
|
||||
} |
|
||||
|
|
||||
// estimateTaskDuration estimates how long a task will take
|
|
||||
func (as *AdminServer) estimateTaskDuration(task *types.Task) time.Duration { |
|
||||
switch task.Type { |
|
||||
case types.TaskTypeErasureCoding: |
|
||||
return 2 * time.Hour |
|
||||
case types.TaskTypeVacuum: |
|
||||
return 30 * time.Minute |
|
||||
default: |
|
||||
return 1 * time.Hour |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// isVolumeAlreadyQueued checks if a task for the volume is already queued or in progress
|
|
||||
func (as *AdminServer) isVolumeAlreadyQueued(volumeID uint32, taskType types.TaskType) bool { |
|
||||
// Check active tasks
|
|
||||
as.tasksMutex.RLock() |
|
||||
for _, inProgressTask := range as.activeTasks { |
|
||||
if inProgressTask.Task.VolumeID == volumeID && inProgressTask.Task.Type == taskType { |
|
||||
as.tasksMutex.RUnlock() |
|
||||
return true |
|
||||
} |
|
||||
} |
|
||||
as.tasksMutex.RUnlock() |
|
||||
|
|
||||
// Check queued tasks
|
|
||||
return as.taskQueue.HasTask(volumeID, taskType) |
|
||||
} |
|
||||
|
|
||||
// Background loops
|
|
||||
|
|
||||
// taskAssignmentLoop handles automatic task assignment to workers
|
|
||||
func (as *AdminServer) taskAssignmentLoop() { |
|
||||
ticker := time.NewTicker(5 * time.Second) |
|
||||
defer ticker.Stop() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ticker.C: |
|
||||
as.processTaskAssignments() |
|
||||
case <-as.stopCh: |
|
||||
return |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// processTaskAssignments attempts to assign pending tasks to available workers
|
|
||||
func (as *AdminServer) processTaskAssignments() { |
|
||||
// Get available workers
|
|
||||
workers := as.workerRegistry.GetAvailableWorkers() |
|
||||
if len(workers) == 0 { |
|
||||
return // No workers available
|
|
||||
} |
|
||||
|
|
||||
// For each worker with available capacity, try to assign a task
|
|
||||
for _, worker := range workers { |
|
||||
if worker.CurrentLoad < worker.MaxConcurrent { |
|
||||
task := as.taskScheduler.GetNextTask(worker.ID, worker.Capabilities) |
|
||||
if task != nil { |
|
||||
// Try to assign task directly
|
|
||||
_, err := as.RequestTask(worker.ID, worker.Capabilities) |
|
||||
if err != nil { |
|
||||
glog.Errorf("Failed to assign task to worker %s: %v", worker.ID, err) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// taskMonitoringLoop monitors task progress and handles timeouts
|
|
||||
func (as *AdminServer) taskMonitoringLoop() { |
|
||||
ticker := time.NewTicker(1 * time.Minute) |
|
||||
defer ticker.Stop() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ticker.C: |
|
||||
as.checkTaskTimeouts() |
|
||||
case <-as.stopCh: |
|
||||
return |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// checkTaskTimeouts checks for tasks that have timed out
|
|
||||
func (as *AdminServer) checkTaskTimeouts() { |
|
||||
as.tasksMutex.Lock() |
|
||||
defer as.tasksMutex.Unlock() |
|
||||
|
|
||||
now := time.Now() |
|
||||
timeout := 2 * time.Hour // Default task timeout
|
|
||||
|
|
||||
for taskID, inProgressTask := range as.activeTasks { |
|
||||
if now.Sub(inProgressTask.LastUpdate) > timeout { |
|
||||
glog.Warningf("Task %s timed out (last update: %v)", taskID, inProgressTask.LastUpdate) |
|
||||
as.ReassignTask(taskID, "task timeout") |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// ReassignTask reassigns a task due to worker failure
|
|
||||
func (as *AdminServer) ReassignTask(taskID, reason string) { |
|
||||
as.tasksMutex.Lock() |
|
||||
defer as.tasksMutex.Unlock() |
|
||||
|
|
||||
inProgressTask, exists := as.activeTasks[taskID] |
|
||||
if !exists { |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Reassigning task %s due to: %s", taskID, reason) |
|
||||
|
|
||||
// Reset task status
|
|
||||
inProgressTask.Task.Status = types.TaskStatusPending |
|
||||
|
|
||||
// Unregister current task impact
|
|
||||
as.volumeStateManager.UnregisterTaskImpact(taskID) |
|
||||
|
|
||||
// Remove from active tasks
|
|
||||
delete(as.activeTasks, taskID) |
|
||||
|
|
||||
// Put back in queue with higher priority
|
|
||||
inProgressTask.Task.Priority = types.TaskPriorityHigh |
|
||||
as.taskQueue.Push(inProgressTask.Task) |
|
||||
} |
|
||||
|
|
||||
// reconciliationLoop periodically reconciles state with master
|
|
||||
func (as *AdminServer) reconciliationLoop() { |
|
||||
ticker := time.NewTicker(5 * time.Minute) |
|
||||
defer ticker.Stop() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ticker.C: |
|
||||
as.performReconciliation() |
|
||||
case <-as.stopCh: |
|
||||
return |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// performReconciliation reconciles admin state with master
|
|
||||
func (as *AdminServer) performReconciliation() { |
|
||||
glog.V(1).Infof("Starting state reconciliation") |
|
||||
|
|
||||
// Sync with master
|
|
||||
err := as.volumeStateManager.SyncWithMaster() |
|
||||
if err != nil { |
|
||||
glog.Errorf("Failed to sync with master during reconciliation: %v", err) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
glog.V(1).Infof("State reconciliation completed") |
|
||||
} |
|
||||
|
|
||||
// metricsLoop periodically logs metrics and statistics
|
|
||||
func (as *AdminServer) metricsLoop() { |
|
||||
ticker := time.NewTicker(1 * time.Minute) |
|
||||
defer ticker.Stop() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ticker.C: |
|
||||
as.logMetrics() |
|
||||
case <-as.stopCh: |
|
||||
return |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// logMetrics logs current system metrics
|
|
||||
func (as *AdminServer) logMetrics() { |
|
||||
as.tasksMutex.RLock() |
|
||||
activeTasks := len(as.activeTasks) |
|
||||
as.tasksMutex.RUnlock() |
|
||||
|
|
||||
queuedTasks := as.taskQueue.Size() |
|
||||
activeWorkers := len(as.workerRegistry.GetAvailableWorkers()) |
|
||||
|
|
||||
glog.V(1).Infof("Admin server metrics: active_tasks=%d, queued_tasks=%d, active_workers=%d", |
|
||||
activeTasks, queuedTasks, activeWorkers) |
|
||||
} |
|
||||
|
|
||||
// GetAvailableWorkers returns workers capable of handling the specified task type
|
|
||||
func (as *AdminServer) GetAvailableWorkers(taskType string) []*types.Worker { |
|
||||
workers := as.workerRegistry.GetAvailableWorkers() |
|
||||
var available []*types.Worker |
|
||||
|
|
||||
for _, worker := range workers { |
|
||||
if worker.CurrentLoad < worker.MaxConcurrent { |
|
||||
for _, cap := range worker.Capabilities { |
|
||||
if string(cap) == taskType { |
|
||||
available = append(available, worker) |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return available |
|
||||
} |
|
||||
|
|
||||
// GetSystemStats returns current system statistics
|
|
||||
func (as *AdminServer) GetSystemStats() *SystemStats { |
|
||||
as.tasksMutex.RLock() |
|
||||
activeTasks := len(as.activeTasks) |
|
||||
as.tasksMutex.RUnlock() |
|
||||
|
|
||||
queuedTasks := as.taskQueue.Size() |
|
||||
activeWorkers := len(as.workerRegistry.GetAvailableWorkers()) |
|
||||
|
|
||||
return &SystemStats{ |
|
||||
ActiveTasks: activeTasks, |
|
||||
QueuedTasks: queuedTasks, |
|
||||
ActiveWorkers: activeWorkers, |
|
||||
TotalWorkers: len(as.workerRegistry.GetAvailableWorkers()), |
|
||||
Uptime: time.Since(time.Now()), // This should be tracked properly
|
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Getter methods for testing
|
|
||||
func (as *AdminServer) GetQueuedTaskCount() int { |
|
||||
return as.taskQueue.Size() |
|
||||
} |
|
||||
|
|
||||
func (as *AdminServer) GetActiveTaskCount() int { |
|
||||
as.tasksMutex.RLock() |
|
||||
defer as.tasksMutex.RUnlock() |
|
||||
return len(as.activeTasks) |
|
||||
} |
|
||||
|
|
||||
func (as *AdminServer) GetTaskHistory() []*TaskHistoryEntry { |
|
||||
return as.taskHistory.GetRecentEntries(100) |
|
||||
} |
|
||||
|
|
||||
func (as *AdminServer) GetVolumeStateManager() *VolumeStateManager { |
|
||||
return as.volumeStateManager |
|
||||
} |
|
||||
|
|
||||
func (as *AdminServer) GetWorkerRegistry() *WorkerRegistry { |
|
||||
return as.workerRegistry |
|
||||
} |
|
||||
|
|
||||
// generateTaskID generates a unique task ID
|
|
||||
func generateTaskID() string { |
|
||||
return fmt.Sprintf("task_%d_%d", time.Now().UnixNano(), rand.Intn(10000)) |
|
||||
} |
|
||||
|
|
||||
// generateAdminServerID generates a unique admin server ID
|
|
||||
func generateAdminServerID() string { |
|
||||
return fmt.Sprintf("admin-%d", time.Now().Unix()) |
|
||||
} |
|
||||
|
|
||||
// SystemStats represents system statistics
|
|
||||
type SystemStats struct { |
|
||||
ActiveTasks int |
|
||||
QueuedTasks int |
|
||||
ActiveWorkers int |
|
||||
TotalWorkers int |
|
||||
Uptime time.Duration |
|
||||
LastMasterSync time.Time |
|
||||
} |
|
||||
@ -1,524 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
func TestAdminServer_TaskAssignmentWithStateManagement(t *testing.T) { |
|
||||
// Test the core functionality: accurate task assignment based on comprehensive state
|
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
|
|
||||
// Initialize components
|
|
||||
adminServer.workerRegistry = NewWorkerRegistry() |
|
||||
adminServer.taskQueue = NewPriorityTaskQueue() |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
adminServer.taskScheduler = NewTaskScheduler(adminServer.workerRegistry, adminServer.taskQueue) |
|
||||
adminServer.running = true // Mark as running for test
|
|
||||
|
|
||||
// Setup test worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "test_worker_1", |
|
||||
Address: "server1:8080", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
adminServer.workerRegistry.RegisterWorker(worker) |
|
||||
|
|
||||
// Setup volume state
|
|
||||
volumeID := uint32(1) |
|
||||
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: 28 * 1024 * 1024 * 1024, // 28GB - good for EC
|
|
||||
Server: "server1", |
|
||||
}, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
PlannedChanges: []*PlannedOperation{}, |
|
||||
} |
|
||||
|
|
||||
// Setup server capacity
|
|
||||
adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{ |
|
||||
Server: "server1", |
|
||||
TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
|
|
||||
UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used
|
|
||||
PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used
|
|
||||
} |
|
||||
|
|
||||
// Create EC task
|
|
||||
task := &types.Task{ |
|
||||
ID: "ec_task_1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: volumeID, |
|
||||
Server: "server1", |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
} |
|
||||
|
|
||||
// Test task assignment
|
|
||||
adminServer.taskQueue.Push(task) |
|
||||
|
|
||||
assignedTask, err := adminServer.RequestTask("test_worker_1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Task assignment failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask == nil { |
|
||||
t.Fatal("Expected task to be assigned, got nil") |
|
||||
} |
|
||||
|
|
||||
if assignedTask.ID != "ec_task_1" { |
|
||||
t.Errorf("Expected task ec_task_1, got %s", assignedTask.ID) |
|
||||
} |
|
||||
|
|
||||
// Verify state manager was updated
|
|
||||
if len(adminServer.volumeStateManager.inProgressTasks) != 1 { |
|
||||
t.Errorf("Expected 1 in-progress task in state manager, got %d", len(adminServer.volumeStateManager.inProgressTasks)) |
|
||||
} |
|
||||
|
|
||||
// Verify capacity reservation
|
|
||||
capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1") |
|
||||
if capacity.ReservedCapacity <= 0 { |
|
||||
t.Error("Expected capacity to be reserved for EC task") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Task assignment with state management test passed") |
|
||||
} |
|
||||
|
|
||||
func TestAdminServer_CanAssignTask(t *testing.T) { |
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
adminServer.inProgressTasks = make(map[string]*InProgressTask) |
|
||||
|
|
||||
// Setup volume state
|
|
||||
volumeID := uint32(1) |
|
||||
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: 25 * 1024 * 1024 * 1024, // 25GB
|
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Setup server capacity - limited space
|
|
||||
serverID := "server1" |
|
||||
adminServer.volumeStateManager.capacityCache[serverID] = &CapacityInfo{ |
|
||||
Server: serverID, |
|
||||
TotalCapacity: 30 * 1024 * 1024 * 1024, // 30GB total
|
|
||||
UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used
|
|
||||
PredictedUsage: 20 * 1024 * 1024 * 1024, // 10GB available
|
|
||||
} |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Address: serverID, |
|
||||
} |
|
||||
|
|
||||
tests := []struct { |
|
||||
name string |
|
||||
taskType types.TaskType |
|
||||
expected bool |
|
||||
desc string |
|
||||
}{ |
|
||||
{ |
|
||||
name: "EC task fits", |
|
||||
taskType: types.TaskTypeErasureCoding, |
|
||||
expected: false, // 25GB * 1.4 = 35GB needed, but only 10GB available
|
|
||||
desc: "EC task should not fit due to insufficient capacity", |
|
||||
}, |
|
||||
{ |
|
||||
name: "Vacuum task fits", |
|
||||
taskType: types.TaskTypeVacuum, |
|
||||
expected: true, |
|
||||
desc: "Vacuum task should fit (no capacity increase)", |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for _, tt := range tests { |
|
||||
t.Run(tt.name, func(t *testing.T) { |
|
||||
task := &types.Task{ |
|
||||
ID: "test_task", |
|
||||
Type: tt.taskType, |
|
||||
VolumeID: volumeID, |
|
||||
Server: serverID, |
|
||||
} |
|
||||
|
|
||||
result := adminServer.canAssignTask(task, worker) |
|
||||
if result != tt.expected { |
|
||||
t.Errorf("canAssignTask() = %v, want %v. %s", result, tt.expected, tt.desc) |
|
||||
} |
|
||||
}) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func TestAdminServer_CreateTaskImpact(t *testing.T) { |
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Setup volume state for EC task
|
|
||||
volumeID := uint32(1) |
|
||||
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: 25 * 1024 * 1024 * 1024, // 25GB
|
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: "ec_task_1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: volumeID, |
|
||||
Server: "server1", |
|
||||
} |
|
||||
|
|
||||
impact := adminServer.createTaskImpact(task, "worker1") |
|
||||
|
|
||||
// Verify impact structure
|
|
||||
if impact.TaskID != "ec_task_1" { |
|
||||
t.Errorf("Expected task ID ec_task_1, got %s", impact.TaskID) |
|
||||
} |
|
||||
|
|
||||
if impact.TaskType != types.TaskTypeErasureCoding { |
|
||||
t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, impact.TaskType) |
|
||||
} |
|
||||
|
|
||||
// Verify volume changes for EC task
|
|
||||
if !impact.VolumeChanges.WillBecomeReadOnly { |
|
||||
t.Error("Expected volume to become read-only after EC") |
|
||||
} |
|
||||
|
|
||||
// Verify capacity delta (EC should require ~40% more space)
|
|
||||
expectedCapacity := int64(float64(25*1024*1024*1024) * 1.4) // ~35GB
|
|
||||
actualCapacity := impact.CapacityDelta["server1"] |
|
||||
if actualCapacity != expectedCapacity { |
|
||||
t.Errorf("Expected capacity delta %d, got %d", expectedCapacity, actualCapacity) |
|
||||
} |
|
||||
|
|
||||
// Verify shard changes (should plan 14 shards)
|
|
||||
if len(impact.ShardChanges) != 14 { |
|
||||
t.Errorf("Expected 14 shard changes, got %d", len(impact.ShardChanges)) |
|
||||
} |
|
||||
|
|
||||
for i := 0; i < 14; i++ { |
|
||||
shardChange := impact.ShardChanges[i] |
|
||||
if shardChange == nil { |
|
||||
t.Errorf("Missing shard change for shard %d", i) |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
if !shardChange.WillBeCreated { |
|
||||
t.Errorf("Shard %d should be marked for creation", i) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Task impact creation test passed") |
|
||||
} |
|
||||
|
|
||||
func TestAdminServer_TaskCompletionStateCleanup(t *testing.T) { |
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
adminServer.workerRegistry = NewWorkerRegistry() |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
adminServer.inProgressTasks = make(map[string]*InProgressTask) |
|
||||
|
|
||||
// Setup worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
CurrentLoad: 1, // Has 1 task assigned
|
|
||||
} |
|
||||
adminServer.workerRegistry.RegisterWorker(worker) |
|
||||
|
|
||||
// Setup in-progress task
|
|
||||
task := &types.Task{ |
|
||||
ID: "test_task_1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 1, |
|
||||
} |
|
||||
|
|
||||
inProgressTask := &InProgressTask{ |
|
||||
Task: task, |
|
||||
WorkerID: "worker1", |
|
||||
VolumeReserved: true, |
|
||||
} |
|
||||
adminServer.inProgressTasks["test_task_1"] = inProgressTask |
|
||||
|
|
||||
// Register impact in state manager
|
|
||||
impact := &TaskImpact{ |
|
||||
TaskID: "test_task_1", |
|
||||
VolumeID: 1, |
|
||||
CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings
|
|
||||
} |
|
||||
adminServer.volumeStateManager.RegisterTaskImpact("test_task_1", impact) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err := adminServer.CompleteTask("test_task_1", true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Task completion failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Verify cleanup
|
|
||||
if len(adminServer.inProgressTasks) != 0 { |
|
||||
t.Errorf("Expected 0 in-progress tasks after completion, got %d", len(adminServer.inProgressTasks)) |
|
||||
} |
|
||||
|
|
||||
// Verify worker load updated
|
|
||||
updatedWorker, _ := adminServer.workerRegistry.GetWorker("worker1") |
|
||||
if updatedWorker.CurrentLoad != 0 { |
|
||||
t.Errorf("Expected worker load 0 after task completion, got %d", updatedWorker.CurrentLoad) |
|
||||
} |
|
||||
|
|
||||
// Verify state manager cleaned up
|
|
||||
if len(adminServer.volumeStateManager.inProgressTasks) != 0 { |
|
||||
t.Errorf("Expected 0 tasks in state manager after completion, got %d", len(adminServer.volumeStateManager.inProgressTasks)) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Task completion state cleanup test passed") |
|
||||
} |
|
||||
|
|
||||
func TestAdminServer_PreventDuplicateTaskAssignment(t *testing.T) { |
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
adminServer.workerRegistry = NewWorkerRegistry() |
|
||||
adminServer.taskQueue = NewPriorityTaskQueue() |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
adminServer.inProgressTasks = make(map[string]*InProgressTask) |
|
||||
|
|
||||
// Setup worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
adminServer.workerRegistry.RegisterWorker(worker) |
|
||||
|
|
||||
// Setup volume state
|
|
||||
volumeID := uint32(1) |
|
||||
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024}, |
|
||||
} |
|
||||
|
|
||||
// Create first task and assign it
|
|
||||
task1 := &types.Task{ |
|
||||
ID: "vacuum_task_1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: volumeID, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
} |
|
||||
|
|
||||
adminServer.taskQueue.Push(task1) |
|
||||
assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil || assignedTask1 == nil { |
|
||||
t.Fatal("First task assignment failed") |
|
||||
} |
|
||||
|
|
||||
// Try to assign another vacuum task for the same volume
|
|
||||
task2 := &types.Task{ |
|
||||
ID: "vacuum_task_2", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: volumeID, // Same volume!
|
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
} |
|
||||
|
|
||||
adminServer.taskQueue.Push(task2) |
|
||||
assignedTask2, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
|
|
||||
// Should not assign duplicate task
|
|
||||
if assignedTask2 != nil { |
|
||||
t.Error("Should not assign duplicate vacuum task for same volume") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Duplicate task prevention test passed") |
|
||||
} |
|
||||
|
|
||||
func TestAdminServer_SystemStats(t *testing.T) { |
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
adminServer.workerRegistry = NewWorkerRegistry() |
|
||||
adminServer.taskQueue = NewPriorityTaskQueue() |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
adminServer.inProgressTasks = make(map[string]*InProgressTask) |
|
||||
adminServer.running = true |
|
||||
|
|
||||
// Add some test data
|
|
||||
worker := &types.Worker{ID: "worker1", Status: "active"} |
|
||||
adminServer.workerRegistry.RegisterWorker(worker) |
|
||||
|
|
||||
task := &types.Task{ID: "task1", Type: types.TaskTypeErasureCoding} |
|
||||
adminServer.taskQueue.Push(task) |
|
||||
|
|
||||
inProgressTask := &InProgressTask{ |
|
||||
Task: &types.Task{ID: "task2", Type: types.TaskTypeVacuum}, |
|
||||
} |
|
||||
adminServer.inProgressTasks["task2"] = inProgressTask |
|
||||
|
|
||||
// Get system stats
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
|
|
||||
// Verify stats structure
|
|
||||
if !stats["running"].(bool) { |
|
||||
t.Error("Expected running to be true") |
|
||||
} |
|
||||
|
|
||||
if stats["in_progress_tasks"].(int) != 1 { |
|
||||
t.Errorf("Expected 1 in-progress task, got %d", stats["in_progress_tasks"].(int)) |
|
||||
} |
|
||||
|
|
||||
if stats["queued_tasks"].(int) != 1 { |
|
||||
t.Errorf("Expected 1 queued task, got %d", stats["queued_tasks"].(int)) |
|
||||
} |
|
||||
|
|
||||
// Check task breakdown
|
|
||||
tasksByType := stats["tasks_by_type"].(map[types.TaskType]int) |
|
||||
if tasksByType[types.TaskTypeVacuum] != 1 { |
|
||||
t.Errorf("Expected 1 vacuum task, got %d", tasksByType[types.TaskTypeVacuum]) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ System stats test passed") |
|
||||
} |
|
||||
|
|
||||
func TestAdminServer_VolumeStateIntegration(t *testing.T) { |
|
||||
// Integration test: Verify admin server correctly uses volume state for decisions
|
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
adminServer.workerRegistry = NewWorkerRegistry() |
|
||||
adminServer.taskQueue = NewPriorityTaskQueue() |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
adminServer.inProgressTasks = make(map[string]*InProgressTask) |
|
||||
|
|
||||
// Setup worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Address: "server1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
adminServer.workerRegistry.RegisterWorker(worker) |
|
||||
|
|
||||
// Setup volume and capacity that would normally allow EC
|
|
||||
volumeID := uint32(1) |
|
||||
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: 25 * 1024 * 1024 * 1024, // 25GB
|
|
||||
Server: "server1", |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
adminServer.volumeStateManager.capacityCache["server1"] = &CapacityInfo{ |
|
||||
Server: "server1", |
|
||||
TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
|
|
||||
UsedCapacity: 20 * 1024 * 1024 * 1024, // 20GB used
|
|
||||
PredictedUsage: 20 * 1024 * 1024 * 1024, // 80GB available
|
|
||||
} |
|
||||
|
|
||||
// Create EC task
|
|
||||
task := &types.Task{ |
|
||||
ID: "ec_task_1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: volumeID, |
|
||||
Server: "server1", |
|
||||
} |
|
||||
|
|
||||
adminServer.taskQueue.Push(task) |
|
||||
|
|
||||
// First assignment should work
|
|
||||
assignedTask1, err := adminServer.RequestTask("worker1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil || assignedTask1 == nil { |
|
||||
t.Fatal("First EC task assignment should succeed") |
|
||||
} |
|
||||
|
|
||||
// Verify capacity is now reserved
|
|
||||
capacity := adminServer.volumeStateManager.GetAccurateCapacity("server1") |
|
||||
if capacity.ReservedCapacity <= 0 { |
|
||||
t.Error("Expected capacity to be reserved for first EC task") |
|
||||
} |
|
||||
|
|
||||
// Try to assign another large EC task - should fail due to capacity
|
|
||||
volumeID2 := uint32(2) |
|
||||
adminServer.volumeStateManager.volumes[volumeID2] = &VolumeState{ |
|
||||
VolumeID: volumeID2, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID2, |
|
||||
Size: 30 * 1024 * 1024 * 1024, // 30GB - would need 42GB for EC
|
|
||||
Server: "server1", |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
task2 := &types.Task{ |
|
||||
ID: "ec_task_2", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: volumeID2, |
|
||||
Server: "server1", |
|
||||
} |
|
||||
|
|
||||
adminServer.taskQueue.Push(task2) |
|
||||
|
|
||||
// Add another worker to test capacity-based rejection
|
|
||||
worker2 := &types.Worker{ |
|
||||
ID: "worker2", |
|
||||
Address: "server1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
adminServer.workerRegistry.RegisterWorker(worker2) |
|
||||
|
|
||||
assignedTask2, err := adminServer.RequestTask("worker2", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
|
|
||||
// Should not assign due to insufficient capacity
|
|
||||
if assignedTask2 != nil { |
|
||||
t.Error("Should not assign second EC task due to insufficient server capacity") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Volume state integration test passed") |
|
||||
t.Log("✅ Admin server correctly uses comprehensive state for task assignment decisions") |
|
||||
} |
|
||||
|
|
||||
// Benchmark for task assignment performance
|
|
||||
func BenchmarkAdminServer_RequestTask(b *testing.B) { |
|
||||
adminServer := NewAdminServer(DefaultAdminConfig(), nil) |
|
||||
adminServer.workerRegistry = NewWorkerRegistry() |
|
||||
adminServer.taskQueue = NewPriorityTaskQueue() |
|
||||
adminServer.volumeStateManager = NewVolumeStateManager(nil) |
|
||||
adminServer.inProgressTasks = make(map[string]*InProgressTask) |
|
||||
|
|
||||
// Setup worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "bench_worker", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 1000, // High limit for benchmark
|
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
adminServer.workerRegistry.RegisterWorker(worker) |
|
||||
|
|
||||
// Setup many tasks
|
|
||||
for i := 0; i < 1000; i++ { |
|
||||
volumeID := uint32(i + 1) |
|
||||
adminServer.volumeStateManager.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024}, |
|
||||
} |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("task_%d", i), |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: volumeID, |
|
||||
} |
|
||||
adminServer.taskQueue.Push(task) |
|
||||
} |
|
||||
|
|
||||
b.ResetTimer() |
|
||||
|
|
||||
for i := 0; i < b.N; i++ { |
|
||||
adminServer.RequestTask("bench_worker", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
} |
|
||||
} |
|
||||
@ -1,90 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// Compilation stubs for missing types and functions
|
|
||||
|
|
||||
// Task is an alias for types.Task for backward compatibility
|
|
||||
type Task = types.Task |
|
||||
|
|
||||
// TaskType is an alias for types.TaskType for backward compatibility
|
|
||||
type TaskType = types.TaskType |
|
||||
|
|
||||
// TaskStatus is an alias for types.TaskStatus for backward compatibility
|
|
||||
type TaskStatus = types.TaskStatus |
|
||||
|
|
||||
// TaskPriority is an alias for types.TaskPriority for backward compatibility
|
|
||||
type TaskPriority = types.TaskPriority |
|
||||
|
|
||||
// Additional type aliases for compilation
|
|
||||
var ( |
|
||||
TaskStatusCompleted = types.TaskStatusCompleted |
|
||||
TaskStatusFailed = types.TaskStatusFailed |
|
||||
) |
|
||||
|
|
||||
// Worker represents a worker node
|
|
||||
type Worker struct { |
|
||||
ID string |
|
||||
Address string |
|
||||
Capabilities []string |
|
||||
Status string |
|
||||
LastSeen time.Time |
|
||||
} |
|
||||
|
|
||||
// convertAdminToWorkerMessage converts AdminMessage to WorkerMessage for stream compatibility
|
|
||||
func convertAdminToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage { |
|
||||
// This is a workaround for the stream type mismatch
|
|
||||
// In a real implementation, this would need proper message conversion
|
|
||||
return &worker_pb.WorkerMessage{ |
|
||||
WorkerId: msg.AdminId, |
|
||||
Timestamp: msg.Timestamp, |
|
||||
// Add basic message conversion logic here
|
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// WorkerRegistry stub methods
|
|
||||
func (wr *WorkerRegistry) UpdateWorkerStatus(workerID string, status interface{}) { |
|
||||
// Stub implementation
|
|
||||
} |
|
||||
|
|
||||
// AdminServer stub methods
|
|
||||
func (as *AdminServer) AssignTaskToWorker(workerID string) *Task { |
|
||||
// Stub implementation
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// DefaultAdminConfig returns default admin server configuration
|
|
||||
func DefaultAdminConfig() *AdminConfig { |
|
||||
return &AdminConfig{ |
|
||||
ScanInterval: 30 * time.Minute, |
|
||||
WorkerTimeout: 5 * time.Minute, |
|
||||
TaskTimeout: 10 * time.Minute, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 10, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// SyncWithMasterData is a stub for the volume state manager
|
|
||||
func (vsm *VolumeStateManager) SyncWithMasterData(volumes map[uint32]*VolumeInfo, ecShards map[uint32]map[int]*ShardInfo, serverCapacity map[string]*CapacityInfo) error { |
|
||||
// Stub implementation - would normally sync the data
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetAllVolumeStates is a stub for the volume state manager
|
|
||||
func (vsm *VolumeStateManager) GetAllVolumeStates() map[uint32]*VolumeState { |
|
||||
// Stub implementation - return empty map
|
|
||||
return make(map[uint32]*VolumeState) |
|
||||
} |
|
||||
|
|
||||
// DetectInconsistencies is a stub for the volume state manager
|
|
||||
func (vsm *VolumeStateManager) DetectInconsistencies() []StateInconsistency { |
|
||||
// Stub implementation - return empty slice
|
|
||||
return []StateInconsistency{} |
|
||||
} |
|
||||
@ -1,309 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"os" |
|
||||
"path/filepath" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestECIntegration tests the EC implementation with the admin server
|
|
||||
func TestECIntegration(t *testing.T) { |
|
||||
t.Logf("Starting EC integration test") |
|
||||
|
|
||||
// Step 1: Create admin server
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 30 * time.Minute, // EC takes longer
|
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 2, // Limit concurrency for EC tasks
|
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Step 2: Register an EC-capable worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "ec-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register EC worker: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully registered EC worker %s", worker.ID) |
|
||||
|
|
||||
// Step 3: Create an EC task
|
|
||||
ecTask := &types.Task{ |
|
||||
ID: "ec-task-1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 12345, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB
|
|
||||
"master_client": "localhost:9333", |
|
||||
"work_dir": "/tmp/seaweedfs_ec_work", |
|
||||
"collection": "test", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(ecTask) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue EC task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully queued EC task %s for volume %d", ecTask.ID, ecTask.VolumeID) |
|
||||
|
|
||||
// Step 4: Worker requests the task
|
|
||||
assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request EC task: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask != nil { |
|
||||
t.Logf("EC worker got task: %s (%s) for volume %d", |
|
||||
assignedTask.ID, assignedTask.Type, assignedTask.VolumeID) |
|
||||
|
|
||||
// Step 5: Simulate EC task execution phases
|
|
||||
t.Logf("Simulating EC task execution phases") |
|
||||
|
|
||||
// Phase 1: Copying volume data
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (copying): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 1: Volume data copied to local disk") |
|
||||
|
|
||||
// Phase 2: Marking read-only
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (read-only): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 2: Source volume marked as read-only") |
|
||||
|
|
||||
// Phase 3: Local EC encoding
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 60.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (encoding): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)") |
|
||||
|
|
||||
// Phase 4: Calculating optimal placement
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (placement): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 4: Optimal shard placement calculated with affinity") |
|
||||
|
|
||||
// Phase 5: Distributing shards
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (distribution): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 5: Shards distributed across servers with rack diversity") |
|
||||
|
|
||||
// Phase 6: Verification and cleanup
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (completion): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 6: Verification and cleanup completed") |
|
||||
|
|
||||
// Step 6: Complete the task
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete EC task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully completed EC task %s", assignedTask.ID) |
|
||||
} else { |
|
||||
t.Logf("No EC task was assigned (expected in test environment)") |
|
||||
} |
|
||||
|
|
||||
// Step 7: Verify task completion
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d completed tasks", len(history)) |
|
||||
|
|
||||
if len(history) > 0 { |
|
||||
lastEntry := history[len(history)-1] |
|
||||
t.Logf("Last completed task: %s (%s) - Duration: %v", |
|
||||
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration) |
|
||||
|
|
||||
if lastEntry.TaskType == types.TaskTypeErasureCoding { |
|
||||
t.Logf("EC task completed successfully") |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
t.Logf("EC integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestECTaskValidation tests the EC task validation
|
|
||||
func TestECTaskValidation(t *testing.T) { |
|
||||
t.Logf("Testing EC task validation") |
|
||||
|
|
||||
// Create a temporary work directory
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test") |
|
||||
err := os.MkdirAll(workDir, 0755) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create work directory: %v", err) |
|
||||
} |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
// Create EC task
|
|
||||
ecTask := ec_task.NewTaskWithParams( |
|
||||
"localhost:8080", // source server
|
|
||||
12345, // volume ID
|
|
||||
"localhost:9333", // master client
|
|
||||
workDir, // work directory
|
|
||||
) |
|
||||
|
|
||||
// Test validation with valid parameters
|
|
||||
validParams := types.TaskParams{ |
|
||||
VolumeID: 12345, |
|
||||
Server: "localhost:8080", |
|
||||
Collection: "test", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(32 * 1024 * 1024 * 1024), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
err = ecTask.Validate(validParams) |
|
||||
if err != nil { |
|
||||
t.Errorf("Valid parameters should pass validation: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Test validation with invalid parameters
|
|
||||
invalidParams := types.TaskParams{ |
|
||||
VolumeID: 0, // Invalid volume ID
|
|
||||
Server: "", // Empty server
|
|
||||
} |
|
||||
|
|
||||
err = ecTask.Validate(invalidParams) |
|
||||
if err == nil { |
|
||||
t.Errorf("Invalid parameters should fail validation") |
|
||||
} |
|
||||
|
|
||||
// Test time estimation
|
|
||||
estimatedTime := ecTask.EstimateTime(validParams) |
|
||||
t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime) |
|
||||
|
|
||||
if estimatedTime < 20*time.Minute { |
|
||||
t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime) |
|
||||
} |
|
||||
|
|
||||
t.Logf("EC task validation completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestECFeatures tests specific EC features
|
|
||||
func TestECFeatures(t *testing.T) { |
|
||||
t.Logf("Testing EC features") |
|
||||
|
|
||||
// Create temporary work directory
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test") |
|
||||
err := os.MkdirAll(workDir, 0755) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create work directory: %v", err) |
|
||||
} |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
ecTask := ec_task.NewTaskWithParams( |
|
||||
"localhost:8080", |
|
||||
54321, |
|
||||
"localhost:9333", |
|
||||
workDir, |
|
||||
) |
|
||||
|
|
||||
// Test step tracking
|
|
||||
t.Logf("Testing step tracking functionality") |
|
||||
|
|
||||
currentStep := ecTask.GetCurrentStep() |
|
||||
t.Logf("Initial current step: %s", currentStep) |
|
||||
|
|
||||
progress := ecTask.GetProgress() |
|
||||
t.Logf("Initial progress: %.1f%%", progress) |
|
||||
|
|
||||
// Test parameter extraction
|
|
||||
params := types.TaskParams{ |
|
||||
VolumeID: 54321, |
|
||||
Server: "localhost:8080", |
|
||||
Collection: "features_test", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB
|
|
||||
"data_shards": 10, |
|
||||
"parity_shards": 4, |
|
||||
"affinity_zones": []string{"zone-a", "zone-b", "zone-c"}, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
estimatedTime := ecTask.EstimateTime(params) |
|
||||
expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB
|
|
||||
|
|
||||
t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime) |
|
||||
|
|
||||
if estimatedTime < expectedMinTime { |
|
||||
t.Errorf("Time estimate seems too low for 64GB volume") |
|
||||
} |
|
||||
|
|
||||
t.Logf("EC features test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestECTaskComparison tests EC implementation features
|
|
||||
func TestECTaskComparison(t *testing.T) { |
|
||||
t.Logf("Testing EC implementation features") |
|
||||
|
|
||||
// EC task estimation
|
|
||||
params := types.TaskParams{ |
|
||||
VolumeID: 11111, |
|
||||
Server: "localhost:8080", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB
|
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Create task
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison") |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
ecTask := ec_task.NewTaskWithParams( |
|
||||
"localhost:8080", |
|
||||
22222, |
|
||||
"localhost:9333", |
|
||||
workDir, |
|
||||
) |
|
||||
estimatedTime := ecTask.EstimateTime(params) |
|
||||
|
|
||||
t.Logf("EC task estimated time: %v", estimatedTime) |
|
||||
|
|
||||
// Test feature capabilities
|
|
||||
t.Logf("EC implementation features:") |
|
||||
t.Logf(" - Local volume data copying with progress tracking") |
|
||||
t.Logf(" - Local Reed-Solomon encoding (10+4 shards)") |
|
||||
t.Logf(" - Intelligent shard placement with rack awareness") |
|
||||
t.Logf(" - Load balancing across available servers") |
|
||||
t.Logf(" - Backup server selection for redundancy") |
|
||||
t.Logf(" - Detailed step-by-step progress tracking") |
|
||||
t.Logf(" - Comprehensive error handling and recovery") |
|
||||
|
|
||||
t.Logf("EC implementation test completed successfully") |
|
||||
} |
|
||||
@ -1,324 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"os" |
|
||||
"path/filepath" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
ec_task "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestEnhancedECIntegration tests the enhanced EC implementation with the admin server
|
|
||||
func TestEnhancedECIntegration(t *testing.T) { |
|
||||
t.Logf("Starting enhanced EC integration test") |
|
||||
|
|
||||
// Step 1: Create admin server
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 30 * time.Minute, // EC takes longer
|
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 2, // Limit concurrency for EC tasks
|
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Step 2: Register an EC-capable worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "ec-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register EC worker: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully registered EC worker %s", worker.ID) |
|
||||
|
|
||||
// Step 3: Create an EC task
|
|
||||
ecTask := &types.Task{ |
|
||||
ID: "enhanced-ec-task-1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 12345, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(32 * 1024 * 1024 * 1024), // 32GB
|
|
||||
"master_client": "localhost:9333", |
|
||||
"work_dir": "/tmp/seaweedfs_ec_work", |
|
||||
"collection": "test", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(ecTask) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue EC task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully queued enhanced EC task %s for volume %d", ecTask.ID, ecTask.VolumeID) |
|
||||
|
|
||||
// Step 4: Worker requests the task
|
|
||||
assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request EC task: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask != nil { |
|
||||
t.Logf("EC worker got task: %s (%s) for volume %d", |
|
||||
assignedTask.ID, assignedTask.Type, assignedTask.VolumeID) |
|
||||
|
|
||||
// Step 5: Simulate enhanced EC task execution progress
|
|
||||
t.Logf("Simulating enhanced EC task execution phases") |
|
||||
|
|
||||
// Phase 1: Copying volume data
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 15.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (copying): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 1: Volume data copied to local disk") |
|
||||
|
|
||||
// Phase 2: Marking read-only
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (read-only): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 2: Source volume marked as read-only") |
|
||||
|
|
||||
// Phase 3: Local EC encoding
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 60.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (encoding): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 3: Local Reed-Solomon encoding completed (10+4 shards)") |
|
||||
|
|
||||
// Phase 4: Calculating optimal placement
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 70.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (placement): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 4: Optimal shard placement calculated with affinity") |
|
||||
|
|
||||
// Phase 5: Distributing shards
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 90.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (distribution): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 5: Shards distributed across servers with rack diversity") |
|
||||
|
|
||||
// Phase 6: Verification and cleanup
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress (completion): %v", err) |
|
||||
} |
|
||||
t.Logf("Phase 6: Verification and cleanup completed") |
|
||||
|
|
||||
// Step 6: Complete the task
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete EC task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully completed enhanced EC task %s", assignedTask.ID) |
|
||||
} else { |
|
||||
t.Logf("No EC task was assigned (expected in test environment)") |
|
||||
} |
|
||||
|
|
||||
// Step 7: Verify task completion
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d completed tasks", len(history)) |
|
||||
|
|
||||
if len(history) > 0 { |
|
||||
lastEntry := history[len(history)-1] |
|
||||
t.Logf("Last completed task: %s (%s) - Duration: %v", |
|
||||
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration) |
|
||||
|
|
||||
if lastEntry.TaskType == types.TaskTypeErasureCoding { |
|
||||
t.Logf("Enhanced EC task completed successfully") |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
t.Logf("Enhanced EC integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestEnhancedECTaskValidation tests the enhanced EC task validation
|
|
||||
func TestEnhancedECTaskValidation(t *testing.T) { |
|
||||
t.Logf("Testing enhanced EC task validation") |
|
||||
|
|
||||
// Create a temporary work directory
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test") |
|
||||
err := os.MkdirAll(workDir, 0755) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create work directory: %v", err) |
|
||||
} |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
// Create enhanced EC task
|
|
||||
enhancedTask := ec_task.NewEnhancedECTask( |
|
||||
"localhost:8080", // source server
|
|
||||
12345, // volume ID
|
|
||||
"localhost:9333", // master client
|
|
||||
workDir, // work directory
|
|
||||
) |
|
||||
|
|
||||
// Test validation with valid parameters
|
|
||||
validParams := types.TaskParams{ |
|
||||
VolumeID: 12345, |
|
||||
Server: "localhost:8080", |
|
||||
Collection: "test", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(32 * 1024 * 1024 * 1024), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
err = enhancedTask.Validate(validParams) |
|
||||
if err != nil { |
|
||||
t.Errorf("Valid parameters should pass validation: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Test validation with invalid parameters
|
|
||||
invalidParams := types.TaskParams{ |
|
||||
VolumeID: 0, // Invalid volume ID
|
|
||||
Server: "", // Empty server
|
|
||||
} |
|
||||
|
|
||||
err = enhancedTask.Validate(invalidParams) |
|
||||
if err == nil { |
|
||||
t.Errorf("Invalid parameters should fail validation") |
|
||||
} |
|
||||
|
|
||||
// Test time estimation
|
|
||||
estimatedTime := enhancedTask.EstimateTime(validParams) |
|
||||
t.Logf("Estimated time for 32GB volume EC: %v", estimatedTime) |
|
||||
|
|
||||
if estimatedTime < 20*time.Minute { |
|
||||
t.Errorf("Expected at least 20 minutes for large volume EC, got %v", estimatedTime) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Enhanced EC task validation completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestEnhancedECFeatures tests specific enhanced EC features
|
|
||||
func TestEnhancedECFeatures(t *testing.T) { |
|
||||
t.Logf("Testing enhanced EC features") |
|
||||
|
|
||||
// Create temporary work directory
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test") |
|
||||
err := os.MkdirAll(workDir, 0755) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create work directory: %v", err) |
|
||||
} |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
enhancedTask := ec_task.NewEnhancedECTask( |
|
||||
"localhost:8080", |
|
||||
54321, |
|
||||
"localhost:9333", |
|
||||
workDir, |
|
||||
) |
|
||||
|
|
||||
// Test step tracking
|
|
||||
t.Logf("Testing step tracking functionality") |
|
||||
|
|
||||
currentStep := enhancedTask.GetCurrentStep() |
|
||||
t.Logf("Initial current step: %s", currentStep) |
|
||||
|
|
||||
progress := enhancedTask.GetProgress() |
|
||||
t.Logf("Initial progress: %.1f%%", progress) |
|
||||
|
|
||||
// Test parameter extraction
|
|
||||
params := types.TaskParams{ |
|
||||
VolumeID: 54321, |
|
||||
Server: "localhost:8080", |
|
||||
Collection: "enhanced_test", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB
|
|
||||
"data_shards": 10, |
|
||||
"parity_shards": 4, |
|
||||
"affinity_zones": []string{"zone-a", "zone-b", "zone-c"}, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
estimatedTime := enhancedTask.EstimateTime(params) |
|
||||
expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB
|
|
||||
|
|
||||
t.Logf("64GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime) |
|
||||
|
|
||||
if estimatedTime < expectedMinTime { |
|
||||
t.Errorf("Time estimate seems too low for 64GB volume") |
|
||||
} |
|
||||
|
|
||||
t.Logf("Enhanced EC features test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestECTaskComparison compares basic vs enhanced EC implementations
|
|
||||
func TestECTaskComparison(t *testing.T) { |
|
||||
t.Logf("Comparing basic vs enhanced EC implementations") |
|
||||
|
|
||||
// Basic EC task estimation
|
|
||||
basicParams := types.TaskParams{ |
|
||||
VolumeID: 11111, |
|
||||
Server: "localhost:8080", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(30 * 1024 * 1024 * 1024), // 30GB
|
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Create basic task (existing implementation)
|
|
||||
basicTask := ec_task.NewTask("localhost:8080", 11111) |
|
||||
basicTime := basicTask.EstimateTime(basicParams) |
|
||||
|
|
||||
// Create enhanced task
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_comparison") |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
enhancedTask := ec_task.NewEnhancedECTask( |
|
||||
"localhost:8080", |
|
||||
22222, |
|
||||
"localhost:9333", |
|
||||
workDir, |
|
||||
) |
|
||||
enhancedTime := enhancedTask.EstimateTime(basicParams) |
|
||||
|
|
||||
t.Logf("Basic EC task estimated time: %v", basicTime) |
|
||||
t.Logf("Enhanced EC task estimated time: %v", enhancedTime) |
|
||||
|
|
||||
// Enhanced should take longer due to additional processing
|
|
||||
if enhancedTime <= basicTime { |
|
||||
t.Logf("Note: Enhanced EC might take longer due to local processing and smart distribution") |
|
||||
} |
|
||||
|
|
||||
// Test feature differences
|
|
||||
t.Logf("Basic EC features:") |
|
||||
t.Logf(" - Direct volume server EC generation") |
|
||||
t.Logf(" - Simple shard mounting") |
|
||||
t.Logf(" - No custom placement logic") |
|
||||
|
|
||||
t.Logf("Enhanced EC features:") |
|
||||
t.Logf(" - Local volume data copying") |
|
||||
t.Logf(" - Local Reed-Solomon encoding") |
|
||||
t.Logf(" - Intelligent shard placement with affinity") |
|
||||
t.Logf(" - Rack diversity for data shards") |
|
||||
t.Logf(" - Load balancing across servers") |
|
||||
t.Logf(" - Backup server selection") |
|
||||
t.Logf(" - Detailed progress tracking") |
|
||||
|
|
||||
t.Logf("EC task comparison completed successfully") |
|
||||
} |
|
||||
@ -1,3 +0,0 @@ |
|||||
module ec_test |
|
||||
|
|
||||
go 1.24.1 |
|
||||
@ -1,324 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// MinimalAdminConfig contains configuration for the minimal admin server
|
|
||||
type MinimalAdminConfig struct { |
|
||||
ScanInterval time.Duration |
|
||||
WorkerTimeout time.Duration |
|
||||
TaskTimeout time.Duration |
|
||||
MaxRetries int |
|
||||
ReconcileInterval time.Duration |
|
||||
EnableFailureRecovery bool |
|
||||
MaxConcurrentTasks int |
|
||||
} |
|
||||
|
|
||||
// MinimalAdminServer manages workers and tasks with a simple implementation
|
|
||||
type MinimalAdminServer struct { |
|
||||
config *MinimalAdminConfig |
|
||||
masterClient *wdclient.MasterClient |
|
||||
running bool |
|
||||
mutex sync.RWMutex |
|
||||
|
|
||||
// Task management
|
|
||||
tasks map[string]*types.Task |
|
||||
taskQueue []*types.Task |
|
||||
activeTasks map[string]*types.Task |
|
||||
|
|
||||
// Worker management
|
|
||||
workers map[string]*types.Worker |
|
||||
workerStatus map[string]*types.WorkerStatus |
|
||||
|
|
||||
// Task history
|
|
||||
taskHistory []MinimalTaskHistoryEntry |
|
||||
} |
|
||||
|
|
||||
// MinimalTaskHistoryEntry represents a single task history entry
|
|
||||
type MinimalTaskHistoryEntry struct { |
|
||||
TaskID string |
|
||||
TaskType types.TaskType |
|
||||
VolumeID uint32 |
|
||||
WorkerID string |
|
||||
Status types.TaskStatus |
|
||||
StartedAt time.Time |
|
||||
CompletedAt time.Time |
|
||||
Duration time.Duration |
|
||||
ErrorMessage string |
|
||||
} |
|
||||
|
|
||||
// MinimalSystemStats represents system statistics
|
|
||||
type MinimalSystemStats struct { |
|
||||
ActiveTasks int |
|
||||
QueuedTasks int |
|
||||
ActiveWorkers int |
|
||||
TotalTasks int |
|
||||
} |
|
||||
|
|
||||
// NewMinimalAdminServer creates a new minimal admin server
|
|
||||
func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer { |
|
||||
return &MinimalAdminServer{ |
|
||||
config: config, |
|
||||
masterClient: masterClient, |
|
||||
tasks: make(map[string]*types.Task), |
|
||||
taskQueue: make([]*types.Task, 0), |
|
||||
activeTasks: make(map[string]*types.Task), |
|
||||
workers: make(map[string]*types.Worker), |
|
||||
workerStatus: make(map[string]*types.WorkerStatus), |
|
||||
taskHistory: make([]MinimalTaskHistoryEntry, 0), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Start starts the minimal admin server
|
|
||||
func (as *MinimalAdminServer) Start() error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if as.running { |
|
||||
return fmt.Errorf("admin server is already running") |
|
||||
} |
|
||||
|
|
||||
as.running = true |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Stop stops the minimal admin server
|
|
||||
func (as *MinimalAdminServer) Stop() error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
as.running = false |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// RegisterWorker registers a new worker
|
|
||||
func (as *MinimalAdminServer) RegisterWorker(worker *types.Worker) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
as.workers[worker.ID] = worker |
|
||||
as.workerStatus[worker.ID] = &types.WorkerStatus{ |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// QueueTask adds a new task to the task queue
|
|
||||
func (as *MinimalAdminServer) QueueTask(task *types.Task) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
if task.ID == "" { |
|
||||
task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano()) |
|
||||
} |
|
||||
|
|
||||
task.Status = types.TaskStatusPending |
|
||||
task.CreatedAt = time.Now() |
|
||||
|
|
||||
as.tasks[task.ID] = task |
|
||||
as.taskQueue = append(as.taskQueue, task) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// RequestTask requests a task for a worker
|
|
||||
func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return nil, fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
// Check if worker exists
|
|
||||
worker, exists := as.workers[workerID] |
|
||||
if !exists { |
|
||||
return nil, fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
// Check if worker has capacity
|
|
||||
status := as.workerStatus[workerID] |
|
||||
if status.CurrentLoad >= worker.MaxConcurrent { |
|
||||
return nil, nil // No capacity
|
|
||||
} |
|
||||
|
|
||||
// Find a suitable task
|
|
||||
for i, task := range as.taskQueue { |
|
||||
if task.Status != types.TaskStatusPending { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Check if worker can handle this task type
|
|
||||
canHandle := false |
|
||||
for _, capability := range capabilities { |
|
||||
if task.Type == capability { |
|
||||
canHandle = true |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if canHandle { |
|
||||
// Assign task to worker
|
|
||||
task.Status = types.TaskStatusInProgress |
|
||||
task.WorkerID = workerID |
|
||||
now := time.Now() |
|
||||
task.StartedAt = &now |
|
||||
|
|
||||
// Move task from queue to active tasks
|
|
||||
as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...) |
|
||||
as.activeTasks[task.ID] = task |
|
||||
|
|
||||
// Update worker load
|
|
||||
status.CurrentLoad++ |
|
||||
|
|
||||
return task, nil |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil, nil // No suitable task found
|
|
||||
} |
|
||||
|
|
||||
// UpdateTaskProgress updates task progress
|
|
||||
func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
task, exists := as.tasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
task.Progress = progress |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// CompleteTask marks a task as completed
|
|
||||
func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
task, exists := as.tasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
// Update task status
|
|
||||
if success { |
|
||||
task.Status = types.TaskStatusCompleted |
|
||||
} else { |
|
||||
task.Status = types.TaskStatusFailed |
|
||||
task.Error = errorMessage |
|
||||
} |
|
||||
|
|
||||
now := time.Now() |
|
||||
task.CompletedAt = &now |
|
||||
|
|
||||
// Remove from active tasks
|
|
||||
delete(as.activeTasks, taskID) |
|
||||
|
|
||||
// Update worker load
|
|
||||
if task.WorkerID != "" { |
|
||||
if status, exists := as.workerStatus[task.WorkerID]; exists { |
|
||||
status.CurrentLoad-- |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Add to history
|
|
||||
var duration time.Duration |
|
||||
if task.StartedAt != nil { |
|
||||
duration = now.Sub(*task.StartedAt) |
|
||||
} |
|
||||
|
|
||||
entry := MinimalTaskHistoryEntry{ |
|
||||
TaskID: task.ID, |
|
||||
TaskType: task.Type, |
|
||||
VolumeID: task.VolumeID, |
|
||||
WorkerID: task.WorkerID, |
|
||||
Status: task.Status, |
|
||||
StartedAt: *task.StartedAt, |
|
||||
CompletedAt: now, |
|
||||
Duration: duration, |
|
||||
ErrorMessage: errorMessage, |
|
||||
} |
|
||||
as.taskHistory = append(as.taskHistory, entry) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// UpdateWorkerHeartbeat updates worker heartbeat
|
|
||||
func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
worker, exists := as.workers[workerID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
worker.LastHeartbeat = time.Now() |
|
||||
as.workerStatus[workerID] = status |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetSystemStats returns system statistics
|
|
||||
func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
|
|
||||
activeWorkers := 0 |
|
||||
for _, status := range as.workerStatus { |
|
||||
if status.Status == "active" { |
|
||||
activeWorkers++ |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return &MinimalSystemStats{ |
|
||||
ActiveTasks: len(as.activeTasks), |
|
||||
QueuedTasks: len(as.taskQueue), |
|
||||
ActiveWorkers: activeWorkers, |
|
||||
TotalTasks: len(as.tasks), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetQueuedTaskCount returns the number of queued tasks
|
|
||||
func (as *MinimalAdminServer) GetQueuedTaskCount() int { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
return len(as.taskQueue) |
|
||||
} |
|
||||
|
|
||||
// GetActiveTaskCount returns the number of active tasks
|
|
||||
func (as *MinimalAdminServer) GetActiveTaskCount() int { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
return len(as.activeTasks) |
|
||||
} |
|
||||
|
|
||||
// GetTaskHistory returns task history
|
|
||||
func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
|
|
||||
// Return a copy of the history
|
|
||||
history := make([]MinimalTaskHistoryEntry, len(as.taskHistory)) |
|
||||
copy(history, as.taskHistory) |
|
||||
return history |
|
||||
} |
|
||||
@ -1,434 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation
|
|
||||
func TestMinimalIntegration(t *testing.T) { |
|
||||
t.Logf("Starting minimal integration test") |
|
||||
|
|
||||
// Step 1: Create a minimal admin server configuration
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
// Step 2: Create minimal admin server with nil master client (for testing)
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
|
|
||||
// Step 3: Start admin server
|
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Step 4: Test worker registration
|
|
||||
t.Logf("Testing worker registration") |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "test-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully registered worker %s", worker.ID) |
|
||||
|
|
||||
// Step 5: Test task queueing
|
|
||||
t.Logf("Testing task queueing") |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: "test-task-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 1001, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully queued task %s", task.ID) |
|
||||
|
|
||||
// Step 6: Test task request by worker
|
|
||||
t.Logf("Testing task request") |
|
||||
|
|
||||
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request task: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask != nil { |
|
||||
t.Logf("Successfully assigned task %s to worker", assignedTask.ID) |
|
||||
|
|
||||
// Step 7: Test task progress updates
|
|
||||
t.Logf("Testing task progress updates") |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 25%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 50%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 75%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 100%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Step 8: Test task completion
|
|
||||
t.Logf("Testing task completion") |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully completed task %s", assignedTask.ID) |
|
||||
} else { |
|
||||
t.Logf("No task was assigned (queue might be empty)") |
|
||||
} |
|
||||
|
|
||||
// Step 9: Test basic metrics
|
|
||||
t.Logf("Testing basic metrics") |
|
||||
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
if stats != nil { |
|
||||
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
} |
|
||||
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
activeCount := adminServer.GetActiveTaskCount() |
|
||||
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) |
|
||||
|
|
||||
// Step 10: Test task history
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d entries", len(history)) |
|
||||
|
|
||||
if len(history) > 0 { |
|
||||
lastEntry := history[len(history)-1] |
|
||||
t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v", |
|
||||
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestMinimalWorkerHeartbeat tests worker heartbeat functionality
|
|
||||
func TestMinimalWorkerHeartbeat(t *testing.T) { |
|
||||
t.Logf("Testing minimal worker heartbeat") |
|
||||
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Register a worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "heartbeat-worker", |
|
||||
Address: "localhost:9002", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Test heartbeat update
|
|
||||
status := &types.WorkerStatus{ |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update worker heartbeat: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal worker heartbeat test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestMinimalTaskQueueOperations tests task queue operations
|
|
||||
func TestMinimalTaskQueueOperations(t *testing.T) { |
|
||||
t.Logf("Testing minimal task queue operations") |
|
||||
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Test queuing multiple tasks
|
|
||||
taskCount := 3 |
|
||||
for i := 0; i < taskCount; i++ { |
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("queue-test-task-%d", i), |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: uint32(2000 + i), |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to queue task %d: %v", i, err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check queue size
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
if queuedCount != taskCount { |
|
||||
t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal task queue operations test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestMinimalFullWorkflow tests the complete workflow from task creation to completion
|
|
||||
func TestMinimalFullWorkflow(t *testing.T) { |
|
||||
t.Logf("Testing minimal full workflow") |
|
||||
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Register multiple workers with different capabilities
|
|
||||
workers := []*types.Worker{ |
|
||||
{ |
|
||||
ID: "vacuum-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "ec-worker-1", |
|
||||
Address: "localhost:9002", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "multi-worker-1", |
|
||||
Address: "localhost:9003", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 3, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for _, worker := range workers { |
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker %s: %v", worker.ID, err) |
|
||||
} |
|
||||
t.Logf("Registered worker %s with capabilities %v", worker.ID, worker.Capabilities) |
|
||||
} |
|
||||
|
|
||||
// Create multiple tasks of different types
|
|
||||
tasks := []*types.Task{ |
|
||||
{ |
|
||||
ID: "vacuum-task-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 3001, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.4", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "ec-task-1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 3002, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"shard_count": "14", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "vacuum-task-2", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 3003, |
|
||||
Server: "localhost:8081", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityLow, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.5", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for _, task := range tasks { |
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue task %s: %v", task.ID, err) |
|
||||
} |
|
||||
t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID) |
|
||||
} |
|
||||
|
|
||||
// Test task assignment to different workers
|
|
||||
t.Logf("Testing task assignments") |
|
||||
|
|
||||
// Vacuum worker should get vacuum tasks
|
|
||||
assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for vacuum worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// EC worker should get EC tasks
|
|
||||
assignedTask, err = adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for EC worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Multi-capability worker should be able to get any remaining task
|
|
||||
assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for multi worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check final statistics
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d completed tasks", len(history)) |
|
||||
|
|
||||
for _, entry := range history { |
|
||||
t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v", |
|
||||
entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal full workflow test completed successfully") |
|
||||
} |
|
||||
@ -1,488 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"os" |
|
||||
"path/filepath" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestECWorkerIntegration tests the complete EC worker functionality
|
|
||||
func TestECWorkerIntegration(t *testing.T) { |
|
||||
t.Logf("Starting EC worker integration test") |
|
||||
|
|
||||
// Step 1: Create admin server with EC configuration
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 5 * time.Second, |
|
||||
WorkerTimeout: 60 * time.Second, |
|
||||
TaskTimeout: 45 * time.Minute, // EC takes longer
|
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 1, // One at a time for EC
|
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
t.Logf("✓ Admin server started successfully") |
|
||||
|
|
||||
// Step 2: Register EC-capable worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "ec-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register EC worker: %v", err) |
|
||||
} |
|
||||
t.Logf("✓ EC worker registered: %s", worker.ID) |
|
||||
|
|
||||
// Step 3: Create work directory for EC processing
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_test") |
|
||||
err = os.MkdirAll(workDir, 0755) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create work directory: %v", err) |
|
||||
} |
|
||||
defer os.RemoveAll(workDir) |
|
||||
t.Logf("✓ Work directory created: %s", workDir) |
|
||||
|
|
||||
// Step 4: Create EC task with comprehensive parameters
|
|
||||
ecTask := &types.Task{ |
|
||||
ID: "ec-test-task-1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 54321, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(64 * 1024 * 1024 * 1024), // 64GB volume
|
|
||||
"master_client": "localhost:9333", |
|
||||
"work_dir": workDir, |
|
||||
"collection": "test", |
|
||||
"data_shards": 10, |
|
||||
"parity_shards": 4, |
|
||||
"rack_aware": true, |
|
||||
"load_balance": true, |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(ecTask) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue EC task: %v", err) |
|
||||
} |
|
||||
t.Logf("✓ EC task queued: %s for volume %d", ecTask.ID, ecTask.VolumeID) |
|
||||
|
|
||||
// Step 5: Worker requests and receives the EC task
|
|
||||
assignedTask, err := adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request EC task: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask == nil { |
|
||||
t.Fatalf("No EC task was assigned") |
|
||||
} |
|
||||
|
|
||||
t.Logf("✓ EC task assigned: %s (%s) for volume %d", |
|
||||
assignedTask.ID, assignedTask.Type, assignedTask.VolumeID) |
|
||||
|
|
||||
// Step 6: Test EC task creation and validation
|
|
||||
t.Logf("Testing EC task creation and validation") |
|
||||
|
|
||||
// Create EC task instance directly
|
|
||||
factory := erasure_coding.NewFactory() |
|
||||
taskParams := types.TaskParams{ |
|
||||
VolumeID: assignedTask.VolumeID, |
|
||||
Server: assignedTask.Server, |
|
||||
Collection: "test", |
|
||||
Parameters: assignedTask.Parameters, |
|
||||
} |
|
||||
|
|
||||
taskInstance, err := factory.Create(taskParams) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create EC task instance: %v", err) |
|
||||
} |
|
||||
t.Logf("✓ EC task instance created successfully") |
|
||||
|
|
||||
// Step 7: Validate task parameters
|
|
||||
err = taskInstance.Validate(taskParams) |
|
||||
if err != nil { |
|
||||
t.Errorf("EC task validation failed: %v", err) |
|
||||
} else { |
|
||||
t.Logf("✓ EC task validation passed") |
|
||||
} |
|
||||
|
|
||||
// Step 8: Test time estimation
|
|
||||
estimatedTime := taskInstance.EstimateTime(taskParams) |
|
||||
expectedMinTime := time.Duration(64*2) * time.Minute // 2 minutes per GB for 64GB
|
|
||||
|
|
||||
t.Logf("✓ EC estimated time: %v (minimum expected: %v)", estimatedTime, expectedMinTime) |
|
||||
|
|
||||
if estimatedTime < expectedMinTime { |
|
||||
t.Logf("⚠ Note: Estimated time seems optimistic for 64GB volume") |
|
||||
} |
|
||||
|
|
||||
// Step 9: Simulate EC task execution phases
|
|
||||
t.Logf("Simulating EC execution phases:") |
|
||||
|
|
||||
phases := []struct { |
|
||||
progress float64 |
|
||||
phase string |
|
||||
}{ |
|
||||
{5.0, "Initializing EC processing"}, |
|
||||
{15.0, "Volume data copied to local disk with progress tracking"}, |
|
||||
{25.0, "Source volume marked as read-only"}, |
|
||||
{45.0, "Local Reed-Solomon encoding (10+4 shards) completed"}, |
|
||||
{60.0, "Created 14 EC shards with verification"}, |
|
||||
{70.0, "Optimal shard placement calculated with rack awareness"}, |
|
||||
{85.0, "Intelligent shard distribution with load balancing"}, |
|
||||
{95.0, "Shard placement verified across multiple racks"}, |
|
||||
{100.0, "EC processing completed with cleanup"}, |
|
||||
} |
|
||||
|
|
||||
for _, phase := range phases { |
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, phase.progress) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to %.1f%%: %v", phase.progress, err) |
|
||||
} else { |
|
||||
t.Logf(" %.1f%% - %s", phase.progress, phase.phase) |
|
||||
} |
|
||||
time.Sleep(50 * time.Millisecond) // Simulate processing time
|
|
||||
} |
|
||||
|
|
||||
// Step 10: Complete the EC task
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete EC task: %v", err) |
|
||||
} else { |
|
||||
t.Logf("✓ EC task completed successfully") |
|
||||
} |
|
||||
|
|
||||
// Step 11: Verify EC task completion and metrics
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
t.Logf("✓ Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("✓ Task history contains %d completed tasks", len(history)) |
|
||||
|
|
||||
if len(history) > 0 { |
|
||||
lastEntry := history[len(history)-1] |
|
||||
t.Logf("✓ Last completed task: %s (%s) - Duration: %v", |
|
||||
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Duration) |
|
||||
|
|
||||
if lastEntry.TaskType == types.TaskTypeErasureCoding { |
|
||||
t.Logf("✅ EC task execution verified!") |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
t.Logf("✅ EC worker integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestECFeatureValidation tests specific EC features
|
|
||||
func TestECFeatureValidation(t *testing.T) { |
|
||||
t.Logf("Testing EC feature validation") |
|
||||
|
|
||||
// Create work directory
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_ec_features_test") |
|
||||
err := os.MkdirAll(workDir, 0755) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create work directory: %v", err) |
|
||||
} |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
// Test EC task features
|
|
||||
ecTask := erasure_coding.NewTaskWithParams( |
|
||||
"localhost:8080", // source server
|
|
||||
98765, // volume ID
|
|
||||
"localhost:9333", // master client
|
|
||||
workDir, // work directory
|
|
||||
) |
|
||||
|
|
||||
// Test current step tracking
|
|
||||
currentStep := ecTask.GetCurrentStep() |
|
||||
t.Logf("✓ Initial current step: '%s'", currentStep) |
|
||||
|
|
||||
initialProgress := ecTask.GetProgress() |
|
||||
t.Logf("✓ Initial progress: %.1f%%", initialProgress) |
|
||||
|
|
||||
// Test parameter validation with features
|
|
||||
validParams := types.TaskParams{ |
|
||||
VolumeID: 98765, |
|
||||
Server: "localhost:8080", |
|
||||
Collection: "features_test", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(128 * 1024 * 1024 * 1024), // 128GB
|
|
||||
"master_client": "localhost:9333", |
|
||||
"work_dir": workDir, |
|
||||
"data_shards": 10, |
|
||||
"parity_shards": 4, |
|
||||
"rack_awareness": true, |
|
||||
"load_balancing": true, |
|
||||
"backup_servers": 2, |
|
||||
"affinity_zones": []string{"zone-a", "zone-b", "zone-c"}, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
err = ecTask.Validate(validParams) |
|
||||
if err != nil { |
|
||||
t.Errorf("Valid parameters should pass validation: %v", err) |
|
||||
} else { |
|
||||
t.Logf("✓ Parameter validation passed") |
|
||||
} |
|
||||
|
|
||||
// Test time estimation for large volume
|
|
||||
estimatedTime := ecTask.EstimateTime(validParams) |
|
||||
expectedMinTime := time.Duration(128*2) * time.Minute // 2 minutes per GB
|
|
||||
|
|
||||
t.Logf("✓ 128GB volume estimated time: %v (expected minimum: %v)", estimatedTime, expectedMinTime) |
|
||||
|
|
||||
if estimatedTime < expectedMinTime { |
|
||||
t.Errorf("Time estimate seems too low for 128GB volume") |
|
||||
} |
|
||||
|
|
||||
// Test invalid parameters
|
|
||||
invalidParams := types.TaskParams{ |
|
||||
VolumeID: 0, // Invalid
|
|
||||
Server: "", // Invalid
|
|
||||
} |
|
||||
|
|
||||
err = ecTask.Validate(invalidParams) |
|
||||
if err == nil { |
|
||||
t.Errorf("Invalid parameters should fail validation") |
|
||||
} else { |
|
||||
t.Logf("✓ Invalid parameter validation correctly failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Logf("✅ EC feature validation completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestECWorkflow tests the complete EC workflow
|
|
||||
func TestECWorkflow(t *testing.T) { |
|
||||
t.Logf("Testing complete EC workflow") |
|
||||
|
|
||||
// Create admin server
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 60 * time.Minute, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 1, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Register multiple workers with different capabilities
|
|
||||
workers := []*types.Worker{ |
|
||||
{ |
|
||||
ID: "ec-specialist-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "vacuum-worker-1", |
|
||||
Address: "localhost:9002", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "multi-capability-worker-1", |
|
||||
Address: "localhost:9003", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for _, worker := range workers { |
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker %s: %v", worker.ID, err) |
|
||||
} |
|
||||
t.Logf("✓ Registered worker %s with capabilities %v", worker.ID, worker.Capabilities) |
|
||||
} |
|
||||
|
|
||||
// Create test work directory
|
|
||||
workDir := filepath.Join(os.TempDir(), "seaweedfs_workflow_test") |
|
||||
err = os.MkdirAll(workDir, 0755) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to create work directory: %v", err) |
|
||||
} |
|
||||
defer os.RemoveAll(workDir) |
|
||||
|
|
||||
// Create multiple tasks of different types
|
|
||||
tasks := []*types.Task{ |
|
||||
{ |
|
||||
ID: "ec-workflow-1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 11111, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(50 * 1024 * 1024 * 1024), |
|
||||
"master_client": "localhost:9333", |
|
||||
"work_dir": workDir, |
|
||||
"collection": "workflow_test", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "vacuum-workflow-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 22222, |
|
||||
Server: "localhost:8081", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.4", |
|
||||
"volume_size": int64(20 * 1024 * 1024 * 1024), |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "ec-workflow-2", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 33333, |
|
||||
Server: "localhost:8082", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(80 * 1024 * 1024 * 1024), |
|
||||
"master_client": "localhost:9333", |
|
||||
"work_dir": workDir, |
|
||||
"collection": "workflow_test", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Queue all tasks
|
|
||||
for _, task := range tasks { |
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue task %s: %v", task.ID, err) |
|
||||
} |
|
||||
t.Logf("✓ Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID) |
|
||||
} |
|
||||
|
|
||||
// Test task assignment to appropriate workers
|
|
||||
t.Logf("Testing task assignments to appropriate workers") |
|
||||
|
|
||||
// EC specialist should get EC tasks
|
|
||||
assignedTask, err := adminServer.RequestTask("ec-specialist-1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for EC specialist: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("✓ EC specialist got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
t.Logf("✓ EC task completed by specialist") |
|
||||
} |
|
||||
|
|
||||
// Vacuum worker should get vacuum tasks
|
|
||||
assignedTask, err = adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for vacuum worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("✓ Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
t.Logf("✓ Vacuum task completed by vacuum worker") |
|
||||
} |
|
||||
|
|
||||
// Multi-capability worker should get remaining tasks
|
|
||||
assignedTask, err = adminServer.RequestTask("multi-capability-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for multi-capability worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("✓ Multi-capability worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
t.Logf("✓ Task completed by multi-capability worker") |
|
||||
} |
|
||||
|
|
||||
// Check final workflow statistics
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
t.Logf("✓ Final workflow stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("✓ Workflow history contains %d completed tasks", len(history)) |
|
||||
|
|
||||
// Analyze task completion by type
|
|
||||
ecTasks := 0 |
|
||||
vacuumTasks := 0 |
|
||||
|
|
||||
for _, entry := range history { |
|
||||
switch entry.TaskType { |
|
||||
case types.TaskTypeErasureCoding: |
|
||||
ecTasks++ |
|
||||
t.Logf(" EC: %s - Worker: %s, Duration: %v", |
|
||||
entry.TaskID, entry.WorkerID, entry.Duration) |
|
||||
case types.TaskTypeVacuum: |
|
||||
vacuumTasks++ |
|
||||
t.Logf(" Vacuum: %s - Worker: %s, Duration: %v", |
|
||||
entry.TaskID, entry.WorkerID, entry.Duration) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
t.Logf("✓ Completed tasks: %d EC, %d Vacuum", ecTasks, vacuumTasks) |
|
||||
t.Logf("✅ EC workflow test completed successfully") |
|
||||
} |
|
||||
@ -1,346 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// ExampleUsage demonstrates how to use the task distribution system
|
|
||||
func ExampleUsage() { |
|
||||
glog.Infof("=== SeaweedFS Task Distribution System Example ===") |
|
||||
|
|
||||
// Example 1: Setting up the Admin Server
|
|
||||
setupAdminServerExample() |
|
||||
|
|
||||
// Example 2: Simulating Workers
|
|
||||
simulateWorkersExample() |
|
||||
|
|
||||
// Example 3: Running Simulations
|
|
||||
runSimulationsExample() |
|
||||
|
|
||||
// Example 4: Demonstrating Features
|
|
||||
demonstrateFeaturesExample() |
|
||||
} |
|
||||
|
|
||||
// setupAdminServerExample shows how to set up the admin server
|
|
||||
func setupAdminServerExample() { |
|
||||
glog.Infof("\n--- Example 1: Setting up Admin Server ---") |
|
||||
|
|
||||
// Create master client (in real usage, this would connect to actual master)
|
|
||||
masterClient := &wdclient.MasterClient{} // Simplified for example
|
|
||||
|
|
||||
// Create admin server configuration
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 30 * time.Minute, |
|
||||
WorkerTimeout: 5 * time.Minute, |
|
||||
TaskTimeout: 10 * time.Minute, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 10, |
|
||||
} |
|
||||
|
|
||||
// Create admin server
|
|
||||
adminServer := NewAdminServer(config, masterClient) |
|
||||
|
|
||||
// Start the admin server
|
|
||||
if err := adminServer.Start(); err != nil { |
|
||||
glog.Errorf("Failed to start admin server: %v", err) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
glog.Infof("✓ Admin server started with configuration:") |
|
||||
glog.Infof(" - Scan Interval: %v", config.ScanInterval) |
|
||||
glog.Infof(" - Worker Timeout: %v", config.WorkerTimeout) |
|
||||
glog.Infof(" - Max Concurrent Tasks: %d", config.MaxConcurrentTasks) |
|
||||
|
|
||||
// Simulate some operations
|
|
||||
time.Sleep(2 * time.Second) |
|
||||
|
|
||||
// Stop the admin server
|
|
||||
adminServer.Stop() |
|
||||
glog.Infof("✓ Admin server stopped gracefully") |
|
||||
} |
|
||||
|
|
||||
// simulateWorkersExample shows how workers would register and operate
|
|
||||
func simulateWorkersExample() { |
|
||||
glog.Infof("\n--- Example 2: Worker Registration and Operation ---") |
|
||||
|
|
||||
// Create mock workers
|
|
||||
workers := []*types.Worker{ |
|
||||
{ |
|
||||
ID: "worker-ec-01", |
|
||||
Address: "192.168.1.100:8080", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
}, |
|
||||
{ |
|
||||
ID: "worker-vacuum-01", |
|
||||
Address: "192.168.1.101:8080", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 3, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
}, |
|
||||
{ |
|
||||
ID: "worker-multi-01", |
|
||||
Address: "192.168.1.102:8080", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Create worker registry
|
|
||||
registry := NewWorkerRegistry() |
|
||||
|
|
||||
// Register workers
|
|
||||
for _, worker := range workers { |
|
||||
if err := registry.RegisterWorker(worker); err != nil { |
|
||||
glog.Errorf("Failed to register worker %s: %v", worker.ID, err) |
|
||||
} else { |
|
||||
glog.Infof("✓ Registered worker %s with capabilities: %v", worker.ID, worker.Capabilities) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Demonstrate worker selection
|
|
||||
bestECWorker := registry.GetBestWorkerForTask(types.TaskTypeErasureCoding) |
|
||||
if bestECWorker != nil { |
|
||||
glog.Infof("✓ Best worker for EC tasks: %s", bestECWorker.ID) |
|
||||
} |
|
||||
|
|
||||
bestVacuumWorker := registry.GetBestWorkerForTask(types.TaskTypeVacuum) |
|
||||
if bestVacuumWorker != nil { |
|
||||
glog.Infof("✓ Best worker for vacuum tasks: %s", bestVacuumWorker.ID) |
|
||||
} |
|
||||
|
|
||||
// Show registry statistics
|
|
||||
stats := registry.GetRegistryStats() |
|
||||
glog.Infof("✓ Registry statistics: %+v", stats) |
|
||||
} |
|
||||
|
|
||||
// runSimulationsExample shows how to run simulation scenarios
|
|
||||
func runSimulationsExample() { |
|
||||
glog.Infof("\n--- Example 3: Running Simulation Scenarios ---") |
|
||||
|
|
||||
// Note: Simulation framework moved to simulation package
|
|
||||
// To use: simulationRunner := simulation.NewComprehensiveSimulationRunner()
|
|
||||
// simulationRunner.RunAllComprehensiveTests()
|
|
||||
|
|
||||
glog.Infof("✅ Simulation framework available in separate package") |
|
||||
glog.Infof("Use simulation.NewComprehensiveSimulationRunner() to access comprehensive testing") |
|
||||
} |
|
||||
|
|
||||
// demonstrateFeaturesExample shows key system features
|
|
||||
func demonstrateFeaturesExample() { |
|
||||
glog.Infof("\n--- Example 4: Key System Features ---") |
|
||||
|
|
||||
// Feature 1: Task Discovery
|
|
||||
demonstrateTaskDiscovery() |
|
||||
|
|
||||
// Feature 2: Volume State Tracking
|
|
||||
demonstrateVolumeStateTracking() |
|
||||
|
|
||||
// Feature 3: Failure Handling
|
|
||||
demonstrateFailureHandling() |
|
||||
|
|
||||
// Feature 4: Task Scheduling
|
|
||||
demonstrateTaskScheduling() |
|
||||
} |
|
||||
|
|
||||
// demonstrateTaskDiscovery shows how task discovery works
|
|
||||
func demonstrateTaskDiscovery() { |
|
||||
glog.Infof("\n Feature 1: Task Discovery") |
|
||||
|
|
||||
// Create mock volumes
|
|
||||
volumes := []*VolumeInfo{ |
|
||||
{ |
|
||||
ID: 1, |
|
||||
Size: 28 * 1024 * 1024 * 1024, // 28GB (93% of 30GB)
|
|
||||
Collection: "photos", |
|
||||
DeletedByteCount: 0, |
|
||||
ReadOnly: false, |
|
||||
ModifiedAtSecond: time.Now().Add(-2 * time.Hour).Unix(), // 2 hours old
|
|
||||
}, |
|
||||
{ |
|
||||
ID: 2, |
|
||||
Size: 20 * 1024 * 1024 * 1024, // 20GB
|
|
||||
Collection: "documents", |
|
||||
DeletedByteCount: 8 * 1024 * 1024 * 1024, // 8GB garbage (40%)
|
|
||||
ReadOnly: false, |
|
||||
ModifiedAtSecond: time.Now().Add(-1 * time.Hour).Unix(), // 1 hour old
|
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Create detectors
|
|
||||
ecDetector := NewECDetector() |
|
||||
vacuumDetector := NewVacuumDetector() |
|
||||
|
|
||||
// Test EC detection
|
|
||||
ecCandidates, _ := ecDetector.DetectECCandidates(volumes) |
|
||||
glog.Infof(" ✓ EC detector found %d candidates", len(ecCandidates)) |
|
||||
for _, candidate := range ecCandidates { |
|
||||
glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority) |
|
||||
} |
|
||||
|
|
||||
// Test vacuum detection
|
|
||||
vacuumCandidates, _ := vacuumDetector.DetectVacuumCandidates(volumes) |
|
||||
glog.Infof(" ✓ Vacuum detector found %d candidates", len(vacuumCandidates)) |
|
||||
for _, candidate := range vacuumCandidates { |
|
||||
glog.Infof(" - Volume %d: %s (priority: %d)", candidate.VolumeID, candidate.Reason, candidate.Priority) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// demonstrateVolumeStateTracking shows volume state management
|
|
||||
func demonstrateVolumeStateTracking() { |
|
||||
glog.Infof("\n Feature 2: Volume State Tracking") |
|
||||
|
|
||||
// Create volume state tracker
|
|
||||
tracker := NewVolumeStateTracker(nil, 5*time.Minute) |
|
||||
|
|
||||
// Reserve volumes for tasks
|
|
||||
tracker.ReserveVolume(1, "task-ec-001") |
|
||||
tracker.ReserveVolume(2, "task-vacuum-001") |
|
||||
|
|
||||
glog.Infof(" ✓ Reserved volumes for tasks") |
|
||||
|
|
||||
// Check reservations
|
|
||||
if tracker.IsVolumeReserved(1) { |
|
||||
glog.Infof(" ✓ Volume 1 is correctly reserved") |
|
||||
} |
|
||||
|
|
||||
// Record volume changes
|
|
||||
tracker.RecordVolumeChange(1, types.TaskTypeErasureCoding, "task-ec-001") |
|
||||
glog.Infof(" ✓ Recorded volume change for EC completion") |
|
||||
|
|
||||
// Get pending changes
|
|
||||
if change := tracker.GetPendingChange(1); change != nil { |
|
||||
glog.Infof(" ✓ Pending change found: %s for volume %d", change.ChangeType, change.VolumeID) |
|
||||
} |
|
||||
|
|
||||
// Release reservation
|
|
||||
tracker.ReleaseVolume(2, "task-vacuum-001") |
|
||||
glog.Infof(" ✓ Released volume reservation") |
|
||||
|
|
||||
// Show statistics
|
|
||||
stats := tracker.GetStats() |
|
||||
glog.Infof(" ✓ Tracker statistics: %+v", stats) |
|
||||
} |
|
||||
|
|
||||
// demonstrateFailureHandling shows failure recovery mechanisms
|
|
||||
func demonstrateFailureHandling() { |
|
||||
glog.Infof("\n Feature 3: Failure Handling") |
|
||||
|
|
||||
// Create failure handler
|
|
||||
config := DefaultAdminConfig() |
|
||||
handler := NewFailureHandler(config) |
|
||||
|
|
||||
// Create mock task
|
|
||||
task := &InProgressTask{ |
|
||||
Task: &types.Task{ |
|
||||
ID: "test-task-001", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 1, |
|
||||
RetryCount: 0, |
|
||||
}, |
|
||||
WorkerID: "worker-01", |
|
||||
StartedAt: time.Now(), |
|
||||
LastUpdate: time.Now().Add(-30 * time.Minute), // 30 minutes ago
|
|
||||
Progress: 45.0, |
|
||||
} |
|
||||
|
|
||||
// Demonstrate different failure scenarios
|
|
||||
glog.Infof(" ✓ Simulating worker timeout scenario") |
|
||||
handler.HandleWorkerTimeout("worker-01", []*InProgressTask{task}) |
|
||||
|
|
||||
glog.Infof(" ✓ Simulating stuck task scenario") |
|
||||
handler.HandleTaskStuck(task) |
|
||||
|
|
||||
glog.Infof(" ✓ Simulating duplicate task detection") |
|
||||
handler.HandleDuplicateTask("existing-task", "duplicate-task", 1) |
|
||||
|
|
||||
// Show failure statistics
|
|
||||
stats := handler.GetFailureStats() |
|
||||
glog.Infof(" ✓ Failure handler statistics: %+v", stats) |
|
||||
} |
|
||||
|
|
||||
// demonstrateTaskScheduling shows task scheduling logic
|
|
||||
func demonstrateTaskScheduling() { |
|
||||
glog.Infof("\n Feature 4: Task Scheduling") |
|
||||
|
|
||||
// Create worker registry and task queue
|
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
// Add mock worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "scheduler-worker-01", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Create mock tasks with different priorities
|
|
||||
highPriorityTask := &types.Task{ |
|
||||
ID: "high-priority-task", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
VolumeID: 1, |
|
||||
} |
|
||||
|
|
||||
normalPriorityTask := &types.Task{ |
|
||||
ID: "normal-priority-task", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
VolumeID: 2, |
|
||||
} |
|
||||
|
|
||||
// Add tasks to queue
|
|
||||
queue.Push(normalPriorityTask) |
|
||||
queue.Push(highPriorityTask) // Should be prioritized
|
|
||||
|
|
||||
glog.Infof(" ✓ Added tasks to priority queue (size: %d)", queue.Size()) |
|
||||
|
|
||||
// Test worker selection
|
|
||||
selectedWorker := scheduler.SelectWorker(highPriorityTask, []*types.Worker{worker}) |
|
||||
if selectedWorker != nil { |
|
||||
glog.Infof(" ✓ Selected worker %s for high-priority task", selectedWorker.ID) |
|
||||
} |
|
||||
|
|
||||
// Test task retrieval
|
|
||||
nextTask := scheduler.GetNextTask("scheduler-worker-01", []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}) |
|
||||
if nextTask != nil { |
|
||||
glog.Infof(" ✓ Next task for worker: %s (priority: %d)", nextTask.ID, nextTask.Priority) |
|
||||
} |
|
||||
|
|
||||
glog.Infof(" ✓ Task scheduling demonstration complete") |
|
||||
} |
|
||||
|
|
||||
// RunComprehensiveDemo runs a full demonstration of the system
|
|
||||
func RunComprehensiveDemo() { |
|
||||
glog.Infof("Starting comprehensive task distribution system demonstration...") |
|
||||
|
|
||||
// Run comprehensive example
|
|
||||
ExampleUsage() |
|
||||
|
|
||||
// Note: To run the comprehensive simulation framework, use:
|
|
||||
// simulationRunner := simulation.NewComprehensiveSimulationRunner()
|
|
||||
// simulationRunner.RunAllComprehensiveTests()
|
|
||||
|
|
||||
glog.Infof("=== Comprehensive demonstration complete ===") |
|
||||
glog.Infof("💡 To run comprehensive simulations, use the simulation package separately") |
|
||||
glog.Infof("Step 9: Comprehensive Simulation Testing") |
|
||||
glog.Infof("Note: Simulation framework moved to separate 'simulation' package") |
|
||||
glog.Infof("To run simulations: simulation.NewComprehensiveSimulationRunner().RunAllComprehensiveTests()") |
|
||||
glog.Infof("✅ Simulation framework available in separate package") |
|
||||
glog.Infof("") |
|
||||
} |
|
||||
@ -1,123 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
) |
|
||||
|
|
||||
// FailureHandler handles various failure scenarios in the task system
|
|
||||
type FailureHandler struct { |
|
||||
config *AdminConfig |
|
||||
} |
|
||||
|
|
||||
// NewFailureHandler creates a new failure handler
|
|
||||
func NewFailureHandler(config *AdminConfig) *FailureHandler { |
|
||||
return &FailureHandler{ |
|
||||
config: config, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// HandleWorkerTimeout handles worker timeout scenarios
|
|
||||
func (fh *FailureHandler) HandleWorkerTimeout(workerID string, affectedTasks []*InProgressTask) { |
|
||||
glog.Warningf("Handling worker timeout for worker %s with %d affected tasks", workerID, len(affectedTasks)) |
|
||||
|
|
||||
for _, task := range affectedTasks { |
|
||||
fh.handleTaskFailure(task, "worker_timeout", "Worker became unresponsive") |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// HandleTaskStuck handles stuck task scenarios
|
|
||||
func (fh *FailureHandler) HandleTaskStuck(task *InProgressTask) { |
|
||||
glog.Warningf("Handling stuck task %s (no progress for %v)", task.Task.ID, time.Since(task.LastUpdate)) |
|
||||
|
|
||||
fh.handleTaskFailure(task, "task_stuck", "Task made no progress within timeout period") |
|
||||
} |
|
||||
|
|
||||
// HandleTaskFailure handles general task failure scenarios
|
|
||||
func (fh *FailureHandler) HandleTaskFailure(task *InProgressTask, reason string, details string) { |
|
||||
glog.Errorf("Handling task failure for task %s: %s - %s", task.Task.ID, reason, details) |
|
||||
|
|
||||
fh.handleTaskFailure(task, reason, details) |
|
||||
} |
|
||||
|
|
||||
// handleTaskFailure is the internal handler for task failures
|
|
||||
func (fh *FailureHandler) handleTaskFailure(task *InProgressTask, reason string, details string) { |
|
||||
// Record failure reason
|
|
||||
task.Task.Error = details |
|
||||
|
|
||||
// Determine if task should be retried
|
|
||||
if task.Task.RetryCount < fh.config.MaxRetries { |
|
||||
fh.scheduleRetry(task, reason) |
|
||||
} else { |
|
||||
fh.markTaskFailed(task, reason) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// scheduleRetry schedules a task for retry
|
|
||||
func (fh *FailureHandler) scheduleRetry(task *InProgressTask, reason string) { |
|
||||
task.Task.RetryCount++ |
|
||||
|
|
||||
// Calculate retry delay with exponential backoff
|
|
||||
retryDelay := time.Duration(task.Task.RetryCount) * 5 * time.Minute |
|
||||
task.Task.ScheduledAt = time.Now().Add(retryDelay) |
|
||||
|
|
||||
glog.Infof("Scheduling retry %d/%d for task %s (reason: %s, delay: %v)", |
|
||||
task.Task.RetryCount, fh.config.MaxRetries, task.Task.ID, reason, retryDelay) |
|
||||
} |
|
||||
|
|
||||
// markTaskFailed permanently marks a task as failed
|
|
||||
func (fh *FailureHandler) markTaskFailed(task *InProgressTask, reason string) { |
|
||||
glog.Errorf("Task %s permanently failed after %d retries (reason: %s)", |
|
||||
task.Task.ID, task.Task.RetryCount, reason) |
|
||||
|
|
||||
// Could trigger alerts or notifications here
|
|
||||
fh.sendFailureAlert(task, reason) |
|
||||
} |
|
||||
|
|
||||
// sendFailureAlert sends alerts for permanently failed tasks
|
|
||||
func (fh *FailureHandler) sendFailureAlert(task *InProgressTask, reason string) { |
|
||||
// In a real implementation, this would:
|
|
||||
// 1. Send notifications to administrators
|
|
||||
// 2. Update monitoring dashboards
|
|
||||
// 3. Log to audit trails
|
|
||||
// 4. Possibly trigger automatic remediation
|
|
||||
|
|
||||
glog.Errorf("ALERT: Task permanently failed - ID: %s, Type: %s, Volume: %d, Reason: %s", |
|
||||
task.Task.ID, task.Task.Type, task.Task.VolumeID, reason) |
|
||||
} |
|
||||
|
|
||||
// HandleDuplicateTask handles duplicate task detection
|
|
||||
func (fh *FailureHandler) HandleDuplicateTask(existingTaskID string, duplicateTaskID string, volumeID uint32) { |
|
||||
glog.Warningf("Detected duplicate task for volume %d: existing=%s, duplicate=%s", |
|
||||
volumeID, existingTaskID, duplicateTaskID) |
|
||||
|
|
||||
// Cancel the duplicate task
|
|
||||
// In a real implementation, this would send a cancellation signal
|
|
||||
} |
|
||||
|
|
||||
// HandleResourceExhaustion handles resource exhaustion scenarios
|
|
||||
func (fh *FailureHandler) HandleResourceExhaustion(workerID string, taskType string) { |
|
||||
glog.Warningf("Worker %s reported resource exhaustion for task type %s", workerID, taskType) |
|
||||
|
|
||||
// Could implement:
|
|
||||
// 1. Temporary worker blacklisting
|
|
||||
// 2. Task redistribution
|
|
||||
// 3. Resource monitoring alerts
|
|
||||
} |
|
||||
|
|
||||
// GetFailureStats returns failure statistics
|
|
||||
func (fh *FailureHandler) GetFailureStats() map[string]interface{} { |
|
||||
// In a real implementation, this would track:
|
|
||||
// - Failure rates by type
|
|
||||
// - Worker reliability scores
|
|
||||
// - Task retry statistics
|
|
||||
// - System health metrics
|
|
||||
|
|
||||
return map[string]interface{}{ |
|
||||
"enabled": true, |
|
||||
"max_retries": fh.config.MaxRetries, |
|
||||
"task_timeout": fh.config.TaskTimeout.String(), |
|
||||
"worker_timeout": fh.config.WorkerTimeout.String(), |
|
||||
} |
|
||||
} |
|
||||
@ -1,486 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"context" |
|
||||
"fmt" |
|
||||
"strconv" |
|
||||
"strings" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// MasterSynchronizer handles periodic synchronization with the master server
|
|
||||
type MasterSynchronizer struct { |
|
||||
masterClient *wdclient.MasterClient |
|
||||
volumeStateManager *VolumeStateManager |
|
||||
adminServer *AdminServer |
|
||||
syncInterval time.Duration |
|
||||
stopCh chan struct{} |
|
||||
volumeSizeLimitMB uint64 // Volume size limit from master in MB
|
|
||||
} |
|
||||
|
|
||||
// NewMasterSynchronizer creates a new master synchronizer
|
|
||||
func NewMasterSynchronizer(masterClient *wdclient.MasterClient, vsm *VolumeStateManager, admin *AdminServer) *MasterSynchronizer { |
|
||||
return &MasterSynchronizer{ |
|
||||
masterClient: masterClient, |
|
||||
volumeStateManager: vsm, |
|
||||
adminServer: admin, |
|
||||
syncInterval: 30 * time.Second, // Default 30 second sync interval
|
|
||||
stopCh: make(chan struct{}), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Start begins the periodic master synchronization
|
|
||||
func (ms *MasterSynchronizer) Start() { |
|
||||
glog.Infof("Starting master synchronization with interval %v", ms.syncInterval) |
|
||||
|
|
||||
go func() { |
|
||||
// Immediate sync on startup
|
|
||||
ms.performSync() |
|
||||
|
|
||||
ticker := time.NewTicker(ms.syncInterval) |
|
||||
defer ticker.Stop() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ticker.C: |
|
||||
ms.performSync() |
|
||||
case <-ms.stopCh: |
|
||||
glog.Infof("Master synchronization stopped") |
|
||||
return |
|
||||
} |
|
||||
} |
|
||||
}() |
|
||||
} |
|
||||
|
|
||||
// Stop stops the master synchronization
|
|
||||
func (ms *MasterSynchronizer) Stop() { |
|
||||
close(ms.stopCh) |
|
||||
} |
|
||||
|
|
||||
// performSync executes a single synchronization cycle
|
|
||||
func (ms *MasterSynchronizer) performSync() { |
|
||||
glog.V(1).Infof("Starting master sync cycle") |
|
||||
startTime := time.Now() |
|
||||
|
|
||||
// Get volume list from master
|
|
||||
volumeData, err := ms.getVolumeListFromMaster() |
|
||||
if err != nil { |
|
||||
glog.Errorf("Failed to get volume list from master: %v", err) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
// Update volume size limit from master
|
|
||||
if volumeData.VolumeSizeLimitMb > 0 { |
|
||||
ms.volumeSizeLimitMB = volumeData.VolumeSizeLimitMb |
|
||||
glog.V(2).Infof("Updated volume size limit to %d MB from master", ms.volumeSizeLimitMB) |
|
||||
} |
|
||||
|
|
||||
// Merge data into volume state manager
|
|
||||
err = ms.mergeVolumeData(volumeData) |
|
||||
if err != nil { |
|
||||
glog.Errorf("Failed to merge volume data: %v", err) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
// Detect volumes needing work
|
|
||||
candidates := ms.detectMaintenanceCandidates(volumeData) |
|
||||
|
|
||||
// Process candidates for task assignment
|
|
||||
ms.processCandidates(candidates) |
|
||||
|
|
||||
duration := time.Since(startTime) |
|
||||
glog.V(1).Infof("Master sync completed in %v, found %d maintenance candidates", |
|
||||
duration, len(candidates)) |
|
||||
} |
|
||||
|
|
||||
// getVolumeListFromMaster retrieves the current volume topology from master
|
|
||||
func (ms *MasterSynchronizer) getVolumeListFromMaster() (*master_pb.VolumeListResponse, error) { |
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) |
|
||||
defer cancel() |
|
||||
|
|
||||
err := ms.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { |
|
||||
req := &master_pb.VolumeListRequest{} |
|
||||
response, err := client.VolumeList(ctx, req) |
|
||||
if err != nil { |
|
||||
return fmt.Errorf("VolumeList RPC failed: %v", err) |
|
||||
} |
|
||||
volumeData = response |
|
||||
return nil |
|
||||
}) |
|
||||
|
|
||||
if err != nil { |
|
||||
return nil, err |
|
||||
} |
|
||||
|
|
||||
return volumeData, nil |
|
||||
} |
|
||||
|
|
||||
// VolumeMaintenanceCandidate represents a volume that needs maintenance
|
|
||||
type VolumeMaintenanceCandidate struct { |
|
||||
VolumeID uint32 |
|
||||
Server string |
|
||||
TaskType string |
|
||||
Priority TaskPriority |
|
||||
Reason string |
|
||||
VolumeInfo *VolumeInfo |
|
||||
ECShardInfo map[int]*ShardInfo |
|
||||
} |
|
||||
|
|
||||
// mergeVolumeData merges master volume data into the volume state manager
|
|
||||
func (ms *MasterSynchronizer) mergeVolumeData(data *master_pb.VolumeListResponse) error { |
|
||||
if data.TopologyInfo == nil { |
|
||||
return fmt.Errorf("empty topology info from master") |
|
||||
} |
|
||||
|
|
||||
volumes := make(map[uint32]*VolumeInfo) |
|
||||
ecShards := make(map[uint32]map[int]*ShardInfo) |
|
||||
serverCapacity := make(map[string]*CapacityInfo) |
|
||||
|
|
||||
// Extract volume information from topology
|
|
||||
ms.extractVolumesFromTopology(data.TopologyInfo, volumes, ecShards, serverCapacity) |
|
||||
|
|
||||
// Update volume state manager
|
|
||||
err := ms.volumeStateManager.SyncWithMasterData(volumes, ecShards, serverCapacity) |
|
||||
if err != nil { |
|
||||
return fmt.Errorf("failed to sync with volume state manager: %v", err) |
|
||||
} |
|
||||
|
|
||||
glog.V(2).Infof("Synced %d volumes, %d EC volume groups, %d servers", |
|
||||
len(volumes), len(ecShards), len(serverCapacity)) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// extractVolumesFromTopology extracts volume and capacity data from master topology
|
|
||||
func (ms *MasterSynchronizer) extractVolumesFromTopology( |
|
||||
topology *master_pb.TopologyInfo, |
|
||||
volumes map[uint32]*VolumeInfo, |
|
||||
ecShards map[uint32]map[int]*ShardInfo, |
|
||||
serverCapacity map[string]*CapacityInfo) { |
|
||||
|
|
||||
for _, dcInfo := range topology.DataCenterInfos { |
|
||||
for _, rackInfo := range dcInfo.RackInfos { |
|
||||
for _, nodeInfo := range rackInfo.DataNodeInfos { |
|
||||
serverID := fmt.Sprintf("%s:%d", nodeInfo.Id, nodeInfo.GrpcPort) |
|
||||
|
|
||||
// Initialize server capacity info
|
|
||||
if serverCapacity[serverID] == nil { |
|
||||
serverCapacity[serverID] = &CapacityInfo{ |
|
||||
Server: serverID, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Process disk information
|
|
||||
for diskType, diskInfo := range nodeInfo.DiskInfos { |
|
||||
ms.processDiskInfo(diskInfo, diskType, serverID, volumes, ecShards, serverCapacity) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// processDiskInfo processes disk information for a specific server
|
|
||||
func (ms *MasterSynchronizer) processDiskInfo( |
|
||||
diskInfo *master_pb.DiskInfo, |
|
||||
diskType string, |
|
||||
serverID string, |
|
||||
volumes map[uint32]*VolumeInfo, |
|
||||
ecShards map[uint32]map[int]*ShardInfo, |
|
||||
serverCapacity map[string]*CapacityInfo) { |
|
||||
|
|
||||
// Update capacity information
|
|
||||
capacity := serverCapacity[serverID] |
|
||||
volumeSizeBytes := int64(ms.volumeSizeLimitMB) * 1024 * 1024 // Convert MB to bytes
|
|
||||
capacity.TotalCapacity += int64(diskInfo.MaxVolumeCount) * volumeSizeBytes |
|
||||
capacity.UsedCapacity += int64(diskInfo.ActiveVolumeCount) * volumeSizeBytes |
|
||||
|
|
||||
// Process regular volumes
|
|
||||
for _, volInfo := range diskInfo.VolumeInfos { |
|
||||
volumes[volInfo.Id] = &VolumeInfo{ |
|
||||
ID: volInfo.Id, |
|
||||
Size: volInfo.Size, |
|
||||
Collection: volInfo.Collection, |
|
||||
FileCount: volInfo.FileCount, |
|
||||
DeleteCount: volInfo.DeleteCount, |
|
||||
DeletedByteCount: volInfo.DeletedByteCount, |
|
||||
ReadOnly: volInfo.ReadOnly, |
|
||||
Server: serverID, |
|
||||
DiskType: diskType, |
|
||||
ModifiedAtSecond: volInfo.ModifiedAtSecond, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Process EC shards
|
|
||||
for _, shardInfo := range diskInfo.EcShardInfos { |
|
||||
volumeID := shardInfo.Id |
|
||||
if ecShards[volumeID] == nil { |
|
||||
ecShards[volumeID] = make(map[int]*ShardInfo) |
|
||||
} |
|
||||
|
|
||||
// Extract shard IDs from ec_index_bits
|
|
||||
for shardID := 0; shardID < 14; shardID++ { |
|
||||
if (shardInfo.EcIndexBits & (1 << uint(shardID))) != 0 { |
|
||||
ecShards[volumeID][shardID] = &ShardInfo{ |
|
||||
ShardID: shardID, |
|
||||
Server: serverID, |
|
||||
Status: ShardStatusExists, |
|
||||
Size: 0, // Size not available in shard info
|
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// detectMaintenanceCandidates identifies volumes that need maintenance
|
|
||||
func (ms *MasterSynchronizer) detectMaintenanceCandidates(data *master_pb.VolumeListResponse) []*VolumeMaintenanceCandidate { |
|
||||
var candidates []*VolumeMaintenanceCandidate |
|
||||
|
|
||||
// Get current volume states
|
|
||||
currentVolumes := ms.volumeStateManager.GetAllVolumeStates() |
|
||||
|
|
||||
for volumeID, volumeState := range currentVolumes { |
|
||||
// Skip volumes with in-progress tasks
|
|
||||
if len(volumeState.InProgressTasks) > 0 { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Check for EC encoding candidates
|
|
||||
if candidate := ms.checkECEncodingCandidate(volumeID, volumeState); candidate != nil { |
|
||||
candidates = append(candidates, candidate) |
|
||||
} |
|
||||
|
|
||||
// Check for vacuum candidates
|
|
||||
if candidate := ms.checkVacuumCandidate(volumeID, volumeState); candidate != nil { |
|
||||
candidates = append(candidates, candidate) |
|
||||
} |
|
||||
|
|
||||
// Check for EC rebuild candidates
|
|
||||
if candidate := ms.checkECRebuildCandidate(volumeID, volumeState); candidate != nil { |
|
||||
candidates = append(candidates, candidate) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return candidates |
|
||||
} |
|
||||
|
|
||||
// EC encoding criteria - using configuration from EC detector
|
|
||||
func (ms *MasterSynchronizer) checkECEncodingCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate { |
|
||||
volume := state.CurrentState |
|
||||
if volume == nil { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Get the current configuration from the EC detector
|
|
||||
ecDetector, _ := erasure_coding.GetSharedInstances() |
|
||||
if ecDetector == nil || !ecDetector.IsEnabled() { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Get configuration values from the detector
|
|
||||
fullnessThreshold := ecDetector.GetFullnessRatio() |
|
||||
quietForSeconds := ecDetector.GetQuietForSeconds() |
|
||||
collectionFilter := ecDetector.GetCollectionFilter() |
|
||||
|
|
||||
// EC encoding criteria:
|
|
||||
// 1. Volume meets fullness ratio threshold
|
|
||||
// 2. Volume has been quiet for required duration
|
|
||||
// 3. Collection filter matches (if specified)
|
|
||||
// 4. Not already EC encoded
|
|
||||
|
|
||||
// Check fullness ratio (if we have size info)
|
|
||||
if volume.Size == 0 { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Calculate fullness ratio (assuming total capacity is close to actual size for near-full volumes)
|
|
||||
// For a more accurate calculation, we'd need the volume's max capacity
|
|
||||
fullnessRatio := float64(volume.Size-volume.DeletedByteCount) / float64(volume.Size) |
|
||||
if fullnessRatio < fullnessThreshold { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Check collection filter if specified
|
|
||||
if collectionFilter != "" { |
|
||||
// Parse comma-separated collections
|
|
||||
allowedCollections := make(map[string]bool) |
|
||||
for _, collection := range strings.Split(collectionFilter, ",") { |
|
||||
allowedCollections[strings.TrimSpace(collection)] = true |
|
||||
} |
|
||||
// Skip if volume's collection is not in the allowed list
|
|
||||
if !allowedCollections[volume.Collection] { |
|
||||
return nil |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check quiet duration using volume's last modification time
|
|
||||
now := time.Now() |
|
||||
lastModified := time.Unix(volume.ModifiedAtSecond, 0) |
|
||||
timeSinceModification := now.Sub(lastModified) |
|
||||
|
|
||||
if timeSinceModification < time.Duration(quietForSeconds)*time.Second { |
|
||||
return nil // Volume hasn't been quiet long enough
|
|
||||
} |
|
||||
|
|
||||
return &VolumeMaintenanceCandidate{ |
|
||||
VolumeID: volumeID, |
|
||||
Server: volume.Server, |
|
||||
TaskType: "ec_encode", |
|
||||
Priority: types.TaskPriorityLow, // EC is typically low priority
|
|
||||
Reason: fmt.Sprintf("Volume meets EC criteria: fullness=%.1f%% (>%.1f%%), quiet for %s (>%ds), collection='%s'", fullnessRatio*100, fullnessThreshold*100, timeSinceModification.Truncate(time.Second), quietForSeconds, volume.Collection), |
|
||||
VolumeInfo: volume, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// checkVacuumCandidate checks if a volume is a candidate for vacuum
|
|
||||
func (ms *MasterSynchronizer) checkVacuumCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate { |
|
||||
volume := state.CurrentState |
|
||||
if volume == nil || volume.ReadOnly { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Get the current configuration from the vacuum detector
|
|
||||
vacuumDetector, _ := vacuum.GetSharedInstances() |
|
||||
if vacuumDetector == nil || !vacuumDetector.IsEnabled() { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Get configuration values from the detector
|
|
||||
garbageThreshold := vacuumDetector.GetGarbageThreshold() |
|
||||
minVolumeAge := vacuumDetector.GetMinVolumeAge() |
|
||||
|
|
||||
// Vacuum criteria:
|
|
||||
// 1. Volume meets garbage threshold
|
|
||||
// 2. Volume is old enough (respects minimum age)
|
|
||||
// 3. Volume has sufficient size
|
|
||||
|
|
||||
// Check minimum volume size (avoid vacuum on tiny volumes)
|
|
||||
if volume.Size == 0 { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Check garbage ratio
|
|
||||
deletedRatio := float64(volume.DeletedByteCount) / float64(volume.Size) |
|
||||
if deletedRatio < garbageThreshold { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Check minimum volume age using volume's last modification time
|
|
||||
now := time.Now() |
|
||||
lastModified := time.Unix(volume.ModifiedAtSecond, 0) |
|
||||
volumeAge := now.Sub(lastModified) |
|
||||
|
|
||||
if volumeAge < minVolumeAge { |
|
||||
return nil // Volume is too new for vacuum
|
|
||||
} |
|
||||
|
|
||||
// Determine priority based on garbage ratio
|
|
||||
priority := types.TaskPriorityNormal |
|
||||
if deletedRatio > 0.6 { // High garbage ratio gets higher priority
|
|
||||
priority = types.TaskPriorityHigh |
|
||||
} |
|
||||
|
|
||||
return &VolumeMaintenanceCandidate{ |
|
||||
VolumeID: volumeID, |
|
||||
Server: volume.Server, |
|
||||
TaskType: "vacuum", |
|
||||
Priority: priority, |
|
||||
Reason: fmt.Sprintf("Volume meets vacuum criteria: garbage=%.1f%% (>%.1f%%), age=%s (>%s)", |
|
||||
deletedRatio*100, garbageThreshold*100, volumeAge.Truncate(time.Second), minVolumeAge.Truncate(time.Second)), |
|
||||
VolumeInfo: volume, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// checkECRebuildCandidate checks if an EC volume needs shard rebuilding
|
|
||||
func (ms *MasterSynchronizer) checkECRebuildCandidate(volumeID uint32, state *VolumeState) *VolumeMaintenanceCandidate { |
|
||||
// For now, skip EC rebuild detection as it requires more complex shard state tracking
|
|
||||
// This would be implemented when the volume state manager provides proper EC shard access
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// processCandidates attempts to assign tasks for maintenance candidates
|
|
||||
func (ms *MasterSynchronizer) processCandidates(candidates []*VolumeMaintenanceCandidate) { |
|
||||
for _, candidate := range candidates { |
|
||||
// Check if we can assign this task
|
|
||||
if !ms.canAssignCandidate(candidate) { |
|
||||
glog.V(2).Infof("Cannot assign task for volume %d: insufficient capacity or no workers", |
|
||||
candidate.VolumeID) |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Create and queue the task
|
|
||||
task := ms.createTaskFromCandidate(candidate) |
|
||||
if task != nil { |
|
||||
ms.adminServer.QueueTask(task) |
|
||||
glog.V(1).Infof("Queued %s task for volume %d on server %s: %s", |
|
||||
candidate.TaskType, candidate.VolumeID, candidate.Server, candidate.Reason) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// canAssignCandidate checks if a candidate can be assigned (capacity, workers available)
|
|
||||
func (ms *MasterSynchronizer) canAssignCandidate(candidate *VolumeMaintenanceCandidate) bool { |
|
||||
// Check if server has capacity for the task
|
|
||||
if candidate.TaskType == "ec_encode" { |
|
||||
// EC encoding requires significant temporary space
|
|
||||
requiredSpace := int64(candidate.VolumeInfo.Size * 2) // Estimate 2x volume size needed
|
|
||||
if !ms.volumeStateManager.CanAssignVolumeToServer(requiredSpace, candidate.Server) { |
|
||||
return false |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check if we have workers capable of this task type
|
|
||||
availableWorkers := ms.adminServer.GetAvailableWorkers(candidate.TaskType) |
|
||||
if len(availableWorkers) == 0 { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
return true |
|
||||
} |
|
||||
|
|
||||
// createTaskFromCandidate creates a task from a maintenance candidate
|
|
||||
func (ms *MasterSynchronizer) createTaskFromCandidate(candidate *VolumeMaintenanceCandidate) *Task { |
|
||||
now := time.Now() |
|
||||
|
|
||||
task := &Task{ |
|
||||
ID: generateTaskID(), |
|
||||
Type: TaskType(candidate.TaskType), |
|
||||
VolumeID: candidate.VolumeID, |
|
||||
Priority: candidate.Priority, |
|
||||
Status: types.TaskStatusPending, |
|
||||
CreatedAt: now, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_id": fmt.Sprintf("%d", candidate.VolumeID), |
|
||||
"server": candidate.Server, |
|
||||
"reason": candidate.Reason, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Add task-specific parameters
|
|
||||
switch candidate.TaskType { |
|
||||
case "ec_encode": |
|
||||
task.Parameters["replication"] = "001" // Default replication for EC
|
|
||||
task.Parameters["collection"] = candidate.VolumeInfo.Collection |
|
||||
case "vacuum": |
|
||||
// Get the current garbage threshold from the vacuum detector
|
|
||||
vacuumDetector, _ := vacuum.GetSharedInstances() |
|
||||
var garbageThreshold float64 = 0.3 // Default fallback
|
|
||||
if vacuumDetector != nil { |
|
||||
garbageThreshold = vacuumDetector.GetGarbageThreshold() |
|
||||
} |
|
||||
task.Parameters["garbage_threshold"] = strconv.FormatFloat(garbageThreshold, 'f', -1, 64) |
|
||||
case "ec_rebuild": |
|
||||
// Add info about which shards need rebuilding
|
|
||||
} |
|
||||
|
|
||||
return task |
|
||||
} |
|
||||
|
|
||||
// Global variable to hold the master volume data
|
|
||||
var volumeData *master_pb.VolumeListResponse |
|
||||
@ -1,324 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// MinimalAdminConfig contains configuration for the minimal admin server
|
|
||||
type MinimalAdminConfig struct { |
|
||||
ScanInterval time.Duration |
|
||||
WorkerTimeout time.Duration |
|
||||
TaskTimeout time.Duration |
|
||||
MaxRetries int |
|
||||
ReconcileInterval time.Duration |
|
||||
EnableFailureRecovery bool |
|
||||
MaxConcurrentTasks int |
|
||||
} |
|
||||
|
|
||||
// MinimalAdminServer manages workers and tasks with a simple implementation
|
|
||||
type MinimalAdminServer struct { |
|
||||
config *MinimalAdminConfig |
|
||||
masterClient *wdclient.MasterClient |
|
||||
running bool |
|
||||
mutex sync.RWMutex |
|
||||
|
|
||||
// Task management
|
|
||||
tasks map[string]*types.Task |
|
||||
taskQueue []*types.Task |
|
||||
activeTasks map[string]*types.Task |
|
||||
|
|
||||
// Worker management
|
|
||||
workers map[string]*types.Worker |
|
||||
workerStatus map[string]*types.WorkerStatus |
|
||||
|
|
||||
// Task history
|
|
||||
taskHistory []MinimalTaskHistoryEntry |
|
||||
} |
|
||||
|
|
||||
// MinimalTaskHistoryEntry represents a single task history entry
|
|
||||
type MinimalTaskHistoryEntry struct { |
|
||||
TaskID string |
|
||||
TaskType types.TaskType |
|
||||
VolumeID uint32 |
|
||||
WorkerID string |
|
||||
Status types.TaskStatus |
|
||||
StartedAt time.Time |
|
||||
CompletedAt time.Time |
|
||||
Duration time.Duration |
|
||||
ErrorMessage string |
|
||||
} |
|
||||
|
|
||||
// MinimalSystemStats represents system statistics
|
|
||||
type MinimalSystemStats struct { |
|
||||
ActiveTasks int |
|
||||
QueuedTasks int |
|
||||
ActiveWorkers int |
|
||||
TotalTasks int |
|
||||
} |
|
||||
|
|
||||
// NewMinimalAdminServer creates a new minimal admin server
|
|
||||
func NewMinimalAdminServer(config *MinimalAdminConfig, masterClient *wdclient.MasterClient) *MinimalAdminServer { |
|
||||
return &MinimalAdminServer{ |
|
||||
config: config, |
|
||||
masterClient: masterClient, |
|
||||
tasks: make(map[string]*types.Task), |
|
||||
taskQueue: make([]*types.Task, 0), |
|
||||
activeTasks: make(map[string]*types.Task), |
|
||||
workers: make(map[string]*types.Worker), |
|
||||
workerStatus: make(map[string]*types.WorkerStatus), |
|
||||
taskHistory: make([]MinimalTaskHistoryEntry, 0), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Start starts the minimal admin server
|
|
||||
func (as *MinimalAdminServer) Start() error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if as.running { |
|
||||
return fmt.Errorf("admin server is already running") |
|
||||
} |
|
||||
|
|
||||
as.running = true |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Stop stops the minimal admin server
|
|
||||
func (as *MinimalAdminServer) Stop() error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
as.running = false |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// RegisterWorker registers a new worker
|
|
||||
func (as *MinimalAdminServer) RegisterWorker(worker *types.Worker) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
as.workers[worker.ID] = worker |
|
||||
as.workerStatus[worker.ID] = &types.WorkerStatus{ |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// QueueTask adds a new task to the task queue
|
|
||||
func (as *MinimalAdminServer) QueueTask(task *types.Task) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
if task.ID == "" { |
|
||||
task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano()) |
|
||||
} |
|
||||
|
|
||||
task.Status = types.TaskStatusPending |
|
||||
task.CreatedAt = time.Now() |
|
||||
|
|
||||
as.tasks[task.ID] = task |
|
||||
as.taskQueue = append(as.taskQueue, task) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// RequestTask requests a task for a worker
|
|
||||
func (as *MinimalAdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return nil, fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
// Check if worker exists
|
|
||||
worker, exists := as.workers[workerID] |
|
||||
if !exists { |
|
||||
return nil, fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
// Check if worker has capacity
|
|
||||
status := as.workerStatus[workerID] |
|
||||
if status.CurrentLoad >= worker.MaxConcurrent { |
|
||||
return nil, nil // No capacity
|
|
||||
} |
|
||||
|
|
||||
// Find a suitable task
|
|
||||
for i, task := range as.taskQueue { |
|
||||
if task.Status != types.TaskStatusPending { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Check if worker can handle this task type
|
|
||||
canHandle := false |
|
||||
for _, capability := range capabilities { |
|
||||
if task.Type == capability { |
|
||||
canHandle = true |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if canHandle { |
|
||||
// Assign task to worker
|
|
||||
task.Status = types.TaskStatusInProgress |
|
||||
task.WorkerID = workerID |
|
||||
now := time.Now() |
|
||||
task.StartedAt = &now |
|
||||
|
|
||||
// Move task from queue to active tasks
|
|
||||
as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...) |
|
||||
as.activeTasks[task.ID] = task |
|
||||
|
|
||||
// Update worker load
|
|
||||
status.CurrentLoad++ |
|
||||
|
|
||||
return task, nil |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil, nil // No suitable task found
|
|
||||
} |
|
||||
|
|
||||
// UpdateTaskProgress updates task progress
|
|
||||
func (as *MinimalAdminServer) UpdateTaskProgress(taskID string, progress float64) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
task, exists := as.tasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
task.Progress = progress |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// CompleteTask marks a task as completed
|
|
||||
func (as *MinimalAdminServer) CompleteTask(taskID string, success bool, errorMessage string) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
task, exists := as.tasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
// Update task status
|
|
||||
if success { |
|
||||
task.Status = types.TaskStatusCompleted |
|
||||
} else { |
|
||||
task.Status = types.TaskStatusFailed |
|
||||
task.Error = errorMessage |
|
||||
} |
|
||||
|
|
||||
now := time.Now() |
|
||||
task.CompletedAt = &now |
|
||||
|
|
||||
// Remove from active tasks
|
|
||||
delete(as.activeTasks, taskID) |
|
||||
|
|
||||
// Update worker load
|
|
||||
if task.WorkerID != "" { |
|
||||
if status, exists := as.workerStatus[task.WorkerID]; exists { |
|
||||
status.CurrentLoad-- |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Add to history
|
|
||||
var duration time.Duration |
|
||||
if task.StartedAt != nil { |
|
||||
duration = now.Sub(*task.StartedAt) |
|
||||
} |
|
||||
|
|
||||
entry := MinimalTaskHistoryEntry{ |
|
||||
TaskID: task.ID, |
|
||||
TaskType: task.Type, |
|
||||
VolumeID: task.VolumeID, |
|
||||
WorkerID: task.WorkerID, |
|
||||
Status: task.Status, |
|
||||
StartedAt: *task.StartedAt, |
|
||||
CompletedAt: now, |
|
||||
Duration: duration, |
|
||||
ErrorMessage: errorMessage, |
|
||||
} |
|
||||
as.taskHistory = append(as.taskHistory, entry) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// UpdateWorkerHeartbeat updates worker heartbeat
|
|
||||
func (as *MinimalAdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
worker, exists := as.workers[workerID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
worker.LastHeartbeat = time.Now() |
|
||||
as.workerStatus[workerID] = status |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetSystemStats returns system statistics
|
|
||||
func (as *MinimalAdminServer) GetSystemStats() *MinimalSystemStats { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
|
|
||||
activeWorkers := 0 |
|
||||
for _, status := range as.workerStatus { |
|
||||
if status.Status == "active" { |
|
||||
activeWorkers++ |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return &MinimalSystemStats{ |
|
||||
ActiveTasks: len(as.activeTasks), |
|
||||
QueuedTasks: len(as.taskQueue), |
|
||||
ActiveWorkers: activeWorkers, |
|
||||
TotalTasks: len(as.tasks), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetQueuedTaskCount returns the number of queued tasks
|
|
||||
func (as *MinimalAdminServer) GetQueuedTaskCount() int { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
return len(as.taskQueue) |
|
||||
} |
|
||||
|
|
||||
// GetActiveTaskCount returns the number of active tasks
|
|
||||
func (as *MinimalAdminServer) GetActiveTaskCount() int { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
return len(as.activeTasks) |
|
||||
} |
|
||||
|
|
||||
// GetTaskHistory returns task history
|
|
||||
func (as *MinimalAdminServer) GetTaskHistory() []MinimalTaskHistoryEntry { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
|
|
||||
// Return a copy of the history
|
|
||||
history := make([]MinimalTaskHistoryEntry, len(as.taskHistory)) |
|
||||
copy(history, as.taskHistory) |
|
||||
return history |
|
||||
} |
|
||||
@ -1,434 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestMinimalIntegration tests basic admin-worker operational flow using the minimal implementation
|
|
||||
func TestMinimalIntegration(t *testing.T) { |
|
||||
t.Logf("Starting minimal integration test") |
|
||||
|
|
||||
// Step 1: Create a minimal admin server configuration
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
// Step 2: Create minimal admin server with nil master client (for testing)
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
|
|
||||
// Step 3: Start admin server
|
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Step 4: Test worker registration
|
|
||||
t.Logf("Testing worker registration") |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "test-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully registered worker %s", worker.ID) |
|
||||
|
|
||||
// Step 5: Test task queueing
|
|
||||
t.Logf("Testing task queueing") |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: "test-task-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 1001, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully queued task %s", task.ID) |
|
||||
|
|
||||
// Step 6: Test task request by worker
|
|
||||
t.Logf("Testing task request") |
|
||||
|
|
||||
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request task: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask != nil { |
|
||||
t.Logf("Successfully assigned task %s to worker", assignedTask.ID) |
|
||||
|
|
||||
// Step 7: Test task progress updates
|
|
||||
t.Logf("Testing task progress updates") |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 25.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 25%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 50%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 75.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 75%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress to 100%%: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Step 8: Test task completion
|
|
||||
t.Logf("Testing task completion") |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully completed task %s", assignedTask.ID) |
|
||||
} else { |
|
||||
t.Logf("No task was assigned (queue might be empty)") |
|
||||
} |
|
||||
|
|
||||
// Step 9: Test basic metrics
|
|
||||
t.Logf("Testing basic metrics") |
|
||||
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
if stats != nil { |
|
||||
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
} |
|
||||
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
activeCount := adminServer.GetActiveTaskCount() |
|
||||
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) |
|
||||
|
|
||||
// Step 10: Test task history
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d entries", len(history)) |
|
||||
|
|
||||
if len(history) > 0 { |
|
||||
lastEntry := history[len(history)-1] |
|
||||
t.Logf("Last task in history: %s (%s) - Status: %s, Duration: %v", |
|
||||
lastEntry.TaskID, lastEntry.TaskType, lastEntry.Status, lastEntry.Duration) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestMinimalWorkerHeartbeat tests worker heartbeat functionality
|
|
||||
func TestMinimalWorkerHeartbeat(t *testing.T) { |
|
||||
t.Logf("Testing minimal worker heartbeat") |
|
||||
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Register a worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "heartbeat-worker", |
|
||||
Address: "localhost:9002", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Test heartbeat update
|
|
||||
status := &types.WorkerStatus{ |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update worker heartbeat: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal worker heartbeat test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestMinimalTaskQueueOperations tests task queue operations
|
|
||||
func TestMinimalTaskQueueOperations(t *testing.T) { |
|
||||
t.Logf("Testing minimal task queue operations") |
|
||||
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Test queuing multiple tasks
|
|
||||
taskCount := 3 |
|
||||
for i := 0; i < taskCount; i++ { |
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("queue-test-task-%d", i), |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: uint32(2000 + i), |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to queue task %d: %v", i, err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check queue size
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
if queuedCount != taskCount { |
|
||||
t.Errorf("Expected %d queued tasks, got %d", taskCount, queuedCount) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal task queue operations test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestMinimalFullWorkflow tests the complete workflow from task creation to completion
|
|
||||
func TestMinimalFullWorkflow(t *testing.T) { |
|
||||
t.Logf("Testing minimal full workflow") |
|
||||
|
|
||||
config := &MinimalAdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewMinimalAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Register multiple workers with different capabilities
|
|
||||
workers := []*types.Worker{ |
|
||||
{ |
|
||||
ID: "vacuum-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "ec-worker-1", |
|
||||
Address: "localhost:9002", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "multi-worker-1", |
|
||||
Address: "localhost:9003", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}, |
|
||||
MaxConcurrent: 3, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for _, worker := range workers { |
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker %s: %v", worker.ID, err) |
|
||||
} |
|
||||
t.Logf("Registered worker %s with capabilities %v", worker.ID, worker.Capabilities) |
|
||||
} |
|
||||
|
|
||||
// Create multiple tasks of different types
|
|
||||
tasks := []*types.Task{ |
|
||||
{ |
|
||||
ID: "vacuum-task-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 3001, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.4", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "ec-task-1", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
VolumeID: 3002, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"shard_count": "14", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
{ |
|
||||
ID: "vacuum-task-2", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 3003, |
|
||||
Server: "localhost:8081", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityLow, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.5", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for _, task := range tasks { |
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue task %s: %v", task.ID, err) |
|
||||
} |
|
||||
t.Logf("Queued task %s (%s) for volume %d", task.ID, task.Type, task.VolumeID) |
|
||||
} |
|
||||
|
|
||||
// Test task assignment to different workers
|
|
||||
t.Logf("Testing task assignments") |
|
||||
|
|
||||
// Vacuum worker should get vacuum tasks
|
|
||||
assignedTask, err := adminServer.RequestTask("vacuum-worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for vacuum worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("Vacuum worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// EC worker should get EC tasks
|
|
||||
assignedTask, err = adminServer.RequestTask("ec-worker-1", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for EC worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("EC worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Multi-capability worker should be able to get any remaining task
|
|
||||
assignedTask, err = adminServer.RequestTask("multi-worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to request task for multi worker: %v", err) |
|
||||
} else if assignedTask != nil { |
|
||||
t.Logf("Multi worker got task: %s (%s)", assignedTask.ID, assignedTask.Type) |
|
||||
|
|
||||
// Complete the task
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check final statistics
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
t.Logf("Final stats: Active tasks=%d, Queued tasks=%d, Active workers=%d, Total tasks=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers, stats.TotalTasks) |
|
||||
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d completed tasks", len(history)) |
|
||||
|
|
||||
for _, entry := range history { |
|
||||
t.Logf("Completed: %s (%s) - Worker: %s, Duration: %v", |
|
||||
entry.TaskID, entry.TaskType, entry.WorkerID, entry.Duration) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Minimal full workflow test completed successfully") |
|
||||
} |
|
||||
@ -1,197 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestOperationalIntegration tests the basic admin-worker operational flow
|
|
||||
func TestOperationalIntegration(t *testing.T) { |
|
||||
t.Logf("Starting operational integration test") |
|
||||
|
|
||||
// Step 1: Create admin server with operational configuration
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
// Create a nil master client for testing (simplified)
|
|
||||
var masterClient *wdclient.MasterClient |
|
||||
|
|
||||
adminServer := NewAdminServer(config, masterClient) |
|
||||
|
|
||||
// Step 2: Start admin server
|
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Step 3: Create and register test workers
|
|
||||
worker1 := createTestWorker("worker-1", []types.TaskType{types.TaskTypeVacuum, types.TaskTypeErasureCoding}) |
|
||||
worker2 := createTestWorker("worker-2", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker1) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker1: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker2) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker2: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Step 4: Test basic task queueing
|
|
||||
t.Logf("Testing task queueing") |
|
||||
|
|
||||
// Create a simple test task
|
|
||||
testTask := &types.Task{ |
|
||||
ID: "test-vacuum-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 1001, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
"server": "localhost:8080", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(testTask) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue test task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully queued test vacuum task for volume %d", testTask.VolumeID) |
|
||||
|
|
||||
// Step 5: Test worker task request and assignment
|
|
||||
t.Logf("Testing worker task requests and assignment") |
|
||||
|
|
||||
// Worker requests task
|
|
||||
task, err := adminServer.RequestTask("worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request task from worker: %v", err) |
|
||||
} |
|
||||
|
|
||||
if task == nil { |
|
||||
t.Logf("No tasks available for assignment (this is expected in test environment)") |
|
||||
} else { |
|
||||
t.Logf("Successfully assigned task %s (%s) to worker-1", task.ID, task.Type) |
|
||||
|
|
||||
// Step 6: Simulate task progress updates
|
|
||||
t.Logf("Testing task progress updates") |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(task.ID, 25.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(task.ID, 50.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(task.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Step 7: Test task completion
|
|
||||
t.Logf("Testing task completion") |
|
||||
|
|
||||
err = adminServer.CompleteTask(task.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Successfully completed task %s", task.ID) |
|
||||
} |
|
||||
|
|
||||
// Step 8: Test metrics and statistics
|
|
||||
t.Logf("Testing system metrics") |
|
||||
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers) |
|
||||
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
activeCount := adminServer.GetActiveTaskCount() |
|
||||
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) |
|
||||
|
|
||||
// Step 9: Test task history
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d entries", len(history)) |
|
||||
|
|
||||
t.Logf("Operational integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
func createTestWorker(id string, capabilities []types.TaskType) *types.Worker { |
|
||||
return &types.Worker{ |
|
||||
ID: id, |
|
||||
Address: fmt.Sprintf("localhost:900%s", id[len(id)-1:]), |
|
||||
Capabilities: capabilities, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// TestECTaskExecution tests the EC task validation (without actual execution)
|
|
||||
func TestECTaskExecution(t *testing.T) { |
|
||||
t.Logf("Testing EC task validation") |
|
||||
|
|
||||
params := types.TaskParams{ |
|
||||
VolumeID: 1002, |
|
||||
Server: "localhost:8080", |
|
||||
Collection: "test", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"volume_size": int64(32 * 1024 * 1024 * 1024), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Test that basic validation would work
|
|
||||
if params.VolumeID == 0 { |
|
||||
t.Errorf("VolumeID should not be zero") |
|
||||
} |
|
||||
if params.Server == "" { |
|
||||
t.Errorf("Server should not be empty") |
|
||||
} |
|
||||
|
|
||||
t.Logf("EC task validation passed") |
|
||||
} |
|
||||
|
|
||||
// TestVacuumTaskExecution tests the vacuum task validation (without actual execution)
|
|
||||
func TestVacuumTaskExecution(t *testing.T) { |
|
||||
t.Logf("Testing vacuum task validation") |
|
||||
|
|
||||
params := types.TaskParams{ |
|
||||
VolumeID: 1001, |
|
||||
Server: "localhost:8080", |
|
||||
Collection: "test", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
"volume_size": int64(25 * 1024 * 1024 * 1024), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Test that basic validation would work
|
|
||||
if params.VolumeID == 0 { |
|
||||
t.Errorf("VolumeID should not be zero") |
|
||||
} |
|
||||
if params.Server == "" { |
|
||||
t.Errorf("Server should not be empty") |
|
||||
} |
|
||||
|
|
||||
t.Logf("Vacuum task validation passed") |
|
||||
} |
|
||||
@ -1,233 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies
|
|
||||
func TestSimpleIntegration(t *testing.T) { |
|
||||
t.Logf("Starting simple integration test") |
|
||||
|
|
||||
// Step 1: Create a minimal admin server configuration
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
// Step 2: Create admin server with nil master client (for testing)
|
|
||||
adminServer := NewAdminServer(config, nil) |
|
||||
|
|
||||
// Step 3: Start admin server
|
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Step 4: Test worker registration
|
|
||||
t.Logf("Testing worker registration") |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "test-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully registered worker %s", worker.ID) |
|
||||
|
|
||||
// Step 5: Test task queueing
|
|
||||
t.Logf("Testing task queueing") |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: "test-task-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 1001, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully queued task %s", task.ID) |
|
||||
|
|
||||
// Step 6: Test task request by worker
|
|
||||
t.Logf("Testing task request") |
|
||||
|
|
||||
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request task: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask != nil { |
|
||||
t.Logf("Successfully assigned task %s to worker", assignedTask.ID) |
|
||||
|
|
||||
// Step 7: Test task progress updates
|
|
||||
t.Logf("Testing task progress updates") |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Step 8: Test task completion
|
|
||||
t.Logf("Testing task completion") |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully completed task %s", assignedTask.ID) |
|
||||
} else { |
|
||||
t.Logf("No task was assigned (queue might be empty)") |
|
||||
} |
|
||||
|
|
||||
// Step 9: Test basic metrics
|
|
||||
t.Logf("Testing basic metrics") |
|
||||
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
if stats != nil { |
|
||||
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers) |
|
||||
} |
|
||||
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
activeCount := adminServer.GetActiveTaskCount() |
|
||||
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) |
|
||||
|
|
||||
// Step 10: Test task history
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d entries", len(history)) |
|
||||
|
|
||||
t.Logf("Simple integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestWorkerHeartbeat tests worker heartbeat functionality
|
|
||||
func TestWorkerHeartbeat(t *testing.T) { |
|
||||
t.Logf("Testing worker heartbeat") |
|
||||
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Register a worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "heartbeat-worker", |
|
||||
Address: "localhost:9002", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Test heartbeat update
|
|
||||
status := &types.WorkerStatus{ |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update worker heartbeat: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Worker heartbeat test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestTaskQueueOperations tests task queue operations
|
|
||||
func TestTaskQueueOperations(t *testing.T) { |
|
||||
t.Logf("Testing task queue operations") |
|
||||
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Test queuing multiple tasks
|
|
||||
for i := 0; i < 3; i++ { |
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("queue-test-task-%d", i), |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: uint32(2000 + i), |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to queue task %d: %v", i, err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check queue size
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
if queuedCount != 3 { |
|
||||
t.Errorf("Expected 3 queued tasks, got %d", queuedCount) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Task queue operations test completed successfully") |
|
||||
} |
|
||||
@ -1,604 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"context" |
|
||||
"fmt" |
|
||||
"math/rand" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TaskSimulator provides a comprehensive simulation framework for testing the task distribution system
|
|
||||
type TaskSimulator struct { |
|
||||
adminServer *AdminServer |
|
||||
mockWorkers []*MockWorker |
|
||||
mockMaster *MockMasterClient |
|
||||
scenarios map[string]*SimulationScenario |
|
||||
results map[string]*SimulationResult |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// SimulationScenario defines a test scenario
|
|
||||
type SimulationScenario struct { |
|
||||
Name string |
|
||||
Description string |
|
||||
WorkerCount int |
|
||||
VolumeCount int |
|
||||
Duration time.Duration |
|
||||
FailurePatterns []*FailurePattern |
|
||||
TestCases []*TestCase |
|
||||
} |
|
||||
|
|
||||
// FailurePattern defines how failures occur during simulation
|
|
||||
type FailurePattern struct { |
|
||||
Type FailureType |
|
||||
Probability float64 // 0.0 to 1.0
|
|
||||
Timing *TimingSpec // When during task execution
|
|
||||
Duration time.Duration |
|
||||
Details string |
|
||||
} |
|
||||
|
|
||||
// TestCase defines specific test scenarios
|
|
||||
type TestCase struct { |
|
||||
Name string |
|
||||
VolumeID uint32 |
|
||||
TaskType types.TaskType |
|
||||
ExpectedOutcome string |
|
||||
FailureToInject *FailurePattern |
|
||||
} |
|
||||
|
|
||||
// FailureType represents different types of failures
|
|
||||
type FailureType string |
|
||||
|
|
||||
const ( |
|
||||
FailureWorkerTimeout FailureType = "worker_timeout" |
|
||||
FailureTaskStuck FailureType = "task_stuck" |
|
||||
FailureTaskCrash FailureType = "task_crash" |
|
||||
FailureDuplicate FailureType = "duplicate_task" |
|
||||
FailureResourceExhaust FailureType = "resource_exhaustion" |
|
||||
FailureNetworkPartition FailureType = "network_partition" |
|
||||
) |
|
||||
|
|
||||
// TimingSpec defines when a failure occurs
|
|
||||
type TimingSpec struct { |
|
||||
MinProgress float64 // Minimum progress before failure can occur
|
|
||||
MaxProgress float64 // Maximum progress before failure must occur
|
|
||||
Delay time.Duration // Fixed delay before failure
|
|
||||
} |
|
||||
|
|
||||
// SimulationResult tracks the results of a simulation
|
|
||||
type SimulationResult struct { |
|
||||
ScenarioName string |
|
||||
StartTime time.Time |
|
||||
EndTime time.Time |
|
||||
Duration time.Duration |
|
||||
TasksCreated int |
|
||||
TasksCompleted int |
|
||||
TasksFailed int |
|
||||
TasksStuck int |
|
||||
WorkerTimeouts int |
|
||||
DuplicatesFound int |
|
||||
StateInconsistencies int |
|
||||
Errors []string |
|
||||
Warnings []string |
|
||||
Success bool |
|
||||
} |
|
||||
|
|
||||
// MockWorker simulates a worker with controllable behavior
|
|
||||
type MockWorker struct { |
|
||||
ID string |
|
||||
Capabilities []types.TaskType |
|
||||
MaxConcurrent int |
|
||||
CurrentTasks map[string]*MockTask |
|
||||
Status string |
|
||||
FailureMode *FailurePattern |
|
||||
mutex sync.Mutex |
|
||||
} |
|
||||
|
|
||||
// MockTask represents a simulated task execution
|
|
||||
type MockTask struct { |
|
||||
Task *types.Task |
|
||||
StartTime time.Time |
|
||||
Progress float64 |
|
||||
Stuck bool |
|
||||
Failed bool |
|
||||
Completed bool |
|
||||
} |
|
||||
|
|
||||
// MockMasterClient simulates master server interactions
|
|
||||
type MockMasterClient struct { |
|
||||
volumes map[uint32]*VolumeInfo |
|
||||
inconsistency bool |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// NewTaskSimulator creates a new task simulator
|
|
||||
func NewTaskSimulator() *TaskSimulator { |
|
||||
return &TaskSimulator{ |
|
||||
scenarios: make(map[string]*SimulationScenario), |
|
||||
results: make(map[string]*SimulationResult), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// RegisterScenario registers a simulation scenario
|
|
||||
func (ts *TaskSimulator) RegisterScenario(scenario *SimulationScenario) { |
|
||||
ts.mutex.Lock() |
|
||||
defer ts.mutex.Unlock() |
|
||||
|
|
||||
ts.scenarios[scenario.Name] = scenario |
|
||||
glog.Infof("Registered simulation scenario: %s", scenario.Name) |
|
||||
} |
|
||||
|
|
||||
// RunScenario executes a simulation scenario
|
|
||||
func (ts *TaskSimulator) RunScenario(scenarioName string) (*SimulationResult, error) { |
|
||||
ts.mutex.RLock() |
|
||||
scenario, exists := ts.scenarios[scenarioName] |
|
||||
ts.mutex.RUnlock() |
|
||||
|
|
||||
if !exists { |
|
||||
return nil, fmt.Errorf("scenario %s not found", scenarioName) |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Starting simulation scenario: %s", scenarioName) |
|
||||
|
|
||||
result := &SimulationResult{ |
|
||||
ScenarioName: scenarioName, |
|
||||
StartTime: time.Now(), |
|
||||
Errors: make([]string, 0), |
|
||||
Warnings: make([]string, 0), |
|
||||
} |
|
||||
|
|
||||
// Setup simulation environment
|
|
||||
if err := ts.setupEnvironment(scenario); err != nil { |
|
||||
return nil, fmt.Errorf("failed to setup environment: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Execute test cases
|
|
||||
ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration) |
|
||||
defer cancel() |
|
||||
|
|
||||
ts.executeScenario(ctx, scenario, result) |
|
||||
|
|
||||
// Cleanup
|
|
||||
ts.cleanup() |
|
||||
|
|
||||
result.EndTime = time.Now() |
|
||||
result.Duration = result.EndTime.Sub(result.StartTime) |
|
||||
result.Success = len(result.Errors) == 0 |
|
||||
|
|
||||
ts.mutex.Lock() |
|
||||
ts.results[scenarioName] = result |
|
||||
ts.mutex.Unlock() |
|
||||
|
|
||||
glog.Infof("Completed simulation scenario: %s (success: %v)", scenarioName, result.Success) |
|
||||
return result, nil |
|
||||
} |
|
||||
|
|
||||
// setupEnvironment prepares the simulation environment
|
|
||||
func (ts *TaskSimulator) setupEnvironment(scenario *SimulationScenario) error { |
|
||||
// Create mock master client
|
|
||||
ts.mockMaster = &MockMasterClient{ |
|
||||
volumes: make(map[uint32]*VolumeInfo), |
|
||||
} |
|
||||
|
|
||||
// Generate mock volumes
|
|
||||
for i := uint32(1); i <= uint32(scenario.VolumeCount); i++ { |
|
||||
volume := &VolumeInfo{ |
|
||||
ID: i, |
|
||||
Size: uint64(rand.Intn(30 * 1024 * 1024 * 1024)), // Random size up to 30GB
|
|
||||
Collection: fmt.Sprintf("collection_%d", (i%3)+1), |
|
||||
DeletedByteCount: uint64(rand.Intn(1024 * 1024 * 1024)), // Random garbage
|
|
||||
ReadOnly: false, |
|
||||
Server: fmt.Sprintf("server_%d", (i%6)+1), |
|
||||
ModifiedAtSecond: time.Now().Add(-time.Duration(rand.Intn(86400)) * time.Second).Unix(), |
|
||||
} |
|
||||
ts.mockMaster.volumes[i] = volume |
|
||||
} |
|
||||
|
|
||||
// Create mock workers
|
|
||||
ts.mockWorkers = make([]*MockWorker, scenario.WorkerCount) |
|
||||
for i := 0; i < scenario.WorkerCount; i++ { |
|
||||
worker := &MockWorker{ |
|
||||
ID: fmt.Sprintf("worker_%d", i+1), |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
CurrentTasks: make(map[string]*MockTask), |
|
||||
Status: "active", |
|
||||
} |
|
||||
|
|
||||
// Apply failure patterns
|
|
||||
if i < len(scenario.FailurePatterns) { |
|
||||
worker.FailureMode = scenario.FailurePatterns[i] |
|
||||
} |
|
||||
|
|
||||
ts.mockWorkers[i] = worker |
|
||||
} |
|
||||
|
|
||||
// Initialize admin server (simplified for simulation)
|
|
||||
config := DefaultAdminConfig() |
|
||||
config.ScanInterval = 10 * time.Second |
|
||||
config.TaskTimeout = 30 * time.Second |
|
||||
|
|
||||
// Note: In a real implementation, this would use the actual master client
|
|
||||
// For simulation, we'd need to inject our mock
|
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// executeScenario runs the actual simulation scenario
|
|
||||
func (ts *TaskSimulator) executeScenario(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) { |
|
||||
// Execute each test case
|
|
||||
for _, testCase := range scenario.TestCases { |
|
||||
ts.executeTestCase(ctx, testCase, result) |
|
||||
} |
|
||||
|
|
||||
// Run continuous simulation for remaining duration
|
|
||||
ts.runContinuousSimulation(ctx, scenario, result) |
|
||||
} |
|
||||
|
|
||||
// executeTestCase runs a specific test case
|
|
||||
func (ts *TaskSimulator) executeTestCase(ctx context.Context, testCase *TestCase, result *SimulationResult) { |
|
||||
glog.V(1).Infof("Executing test case: %s", testCase.Name) |
|
||||
|
|
||||
// Create task for the test case
|
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("test_%s_%d", testCase.Name, time.Now().UnixNano()), |
|
||||
Type: testCase.TaskType, |
|
||||
VolumeID: testCase.VolumeID, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
result.TasksCreated++ |
|
||||
|
|
||||
// Assign to worker
|
|
||||
worker := ts.selectWorkerForTask(task) |
|
||||
if worker == nil { |
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("No available worker for test case %s", testCase.Name)) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
// Execute task with potential failure injection
|
|
||||
ts.executeTaskOnWorker(ctx, task, worker, testCase.FailureToInject, result) |
|
||||
} |
|
||||
|
|
||||
// runContinuousSimulation runs ongoing simulation
|
|
||||
func (ts *TaskSimulator) runContinuousSimulation(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) { |
|
||||
ticker := time.NewTicker(5 * time.Second) |
|
||||
defer ticker.Stop() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ctx.Done(): |
|
||||
return |
|
||||
case <-ticker.C: |
|
||||
ts.simulateOngoingTasks(result) |
|
||||
ts.checkForInconsistencies(result) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// executeTaskOnWorker simulates task execution on a worker
|
|
||||
func (ts *TaskSimulator) executeTaskOnWorker(ctx context.Context, task *types.Task, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) { |
|
||||
worker.mutex.Lock() |
|
||||
defer worker.mutex.Unlock() |
|
||||
|
|
||||
mockTask := &MockTask{ |
|
||||
Task: task, |
|
||||
StartTime: time.Now(), |
|
||||
Progress: 0.0, |
|
||||
} |
|
||||
|
|
||||
worker.CurrentTasks[task.ID] = mockTask |
|
||||
|
|
||||
// Simulate task execution
|
|
||||
go ts.simulateTaskExecution(ctx, mockTask, worker, failurePattern, result) |
|
||||
} |
|
||||
|
|
||||
// simulateTaskExecution simulates the execution of a single task
|
|
||||
func (ts *TaskSimulator) simulateTaskExecution(ctx context.Context, mockTask *MockTask, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) { |
|
||||
defer func() { |
|
||||
worker.mutex.Lock() |
|
||||
delete(worker.CurrentTasks, mockTask.Task.ID) |
|
||||
worker.mutex.Unlock() |
|
||||
}() |
|
||||
|
|
||||
duration := 20 * time.Second // Base task duration
|
|
||||
progressTicker := time.NewTicker(time.Second) |
|
||||
defer progressTicker.Stop() |
|
||||
|
|
||||
startTime := time.Now() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ctx.Done(): |
|
||||
return |
|
||||
case <-progressTicker.C: |
|
||||
elapsed := time.Since(startTime) |
|
||||
progress := float64(elapsed) / float64(duration) * 100.0 |
|
||||
|
|
||||
if progress >= 100.0 { |
|
||||
mockTask.Completed = true |
|
||||
result.TasksCompleted++ |
|
||||
glog.V(2).Infof("Task %s completed successfully", mockTask.Task.ID) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
mockTask.Progress = progress |
|
||||
|
|
||||
// Check for failure injection
|
|
||||
if failurePattern != nil && ts.shouldInjectFailure(failurePattern, progress, elapsed) { |
|
||||
ts.injectFailure(mockTask, worker, failurePattern, result) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
// Check for worker failure mode
|
|
||||
if worker.FailureMode != nil && ts.shouldInjectFailure(worker.FailureMode, progress, elapsed) { |
|
||||
ts.injectFailure(mockTask, worker, worker.FailureMode, result) |
|
||||
return |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// shouldInjectFailure determines if a failure should be injected
|
|
||||
func (ts *TaskSimulator) shouldInjectFailure(pattern *FailurePattern, progress float64, elapsed time.Duration) bool { |
|
||||
if pattern.Timing != nil { |
|
||||
if progress < pattern.Timing.MinProgress || progress > pattern.Timing.MaxProgress { |
|
||||
return false |
|
||||
} |
|
||||
if elapsed < pattern.Timing.Delay { |
|
||||
return false |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return rand.Float64() < pattern.Probability |
|
||||
} |
|
||||
|
|
||||
// injectFailure simulates a failure
|
|
||||
func (ts *TaskSimulator) injectFailure(mockTask *MockTask, worker *MockWorker, pattern *FailurePattern, result *SimulationResult) { |
|
||||
glog.Warningf("Injecting failure: %s for task %s", pattern.Type, mockTask.Task.ID) |
|
||||
|
|
||||
switch pattern.Type { |
|
||||
case FailureWorkerTimeout: |
|
||||
worker.Status = "timeout" |
|
||||
result.WorkerTimeouts++ |
|
||||
|
|
||||
case FailureTaskStuck: |
|
||||
mockTask.Stuck = true |
|
||||
result.TasksStuck++ |
|
||||
|
|
||||
case FailureTaskCrash: |
|
||||
mockTask.Failed = true |
|
||||
result.TasksFailed++ |
|
||||
|
|
||||
case FailureDuplicate: |
|
||||
result.DuplicatesFound++ |
|
||||
|
|
||||
case FailureResourceExhaust: |
|
||||
worker.Status = "resource_exhausted" |
|
||||
result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s resource exhausted", worker.ID)) |
|
||||
|
|
||||
case FailureNetworkPartition: |
|
||||
worker.Status = "partitioned" |
|
||||
result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s network partitioned", worker.ID)) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// selectWorkerForTask selects an available worker for a task
|
|
||||
func (ts *TaskSimulator) selectWorkerForTask(task *types.Task) *MockWorker { |
|
||||
for _, worker := range ts.mockWorkers { |
|
||||
if worker.Status == "active" && len(worker.CurrentTasks) < worker.MaxConcurrent { |
|
||||
// Check capabilities
|
|
||||
for _, capability := range worker.Capabilities { |
|
||||
if capability == task.Type { |
|
||||
return worker |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// simulateOngoingTasks handles ongoing task simulation
|
|
||||
func (ts *TaskSimulator) simulateOngoingTasks(result *SimulationResult) { |
|
||||
// Create random new tasks
|
|
||||
if rand.Float64() < 0.3 { // 30% chance to create new task every tick
|
|
||||
taskType := types.TaskTypeVacuum |
|
||||
if rand.Float64() < 0.5 { |
|
||||
taskType = types.TaskTypeErasureCoding |
|
||||
} |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("auto_%d", time.Now().UnixNano()), |
|
||||
Type: taskType, |
|
||||
VolumeID: uint32(rand.Intn(len(ts.mockMaster.volumes)) + 1), |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
result.TasksCreated++ |
|
||||
|
|
||||
worker := ts.selectWorkerForTask(task) |
|
||||
if worker != nil { |
|
||||
ts.executeTaskOnWorker(context.Background(), task, worker, nil, result) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// checkForInconsistencies checks for state inconsistencies
|
|
||||
func (ts *TaskSimulator) checkForInconsistencies(result *SimulationResult) { |
|
||||
// Check for volume reservation inconsistencies
|
|
||||
// Check for duplicate tasks
|
|
||||
// Check for orphaned tasks
|
|
||||
// This would be more comprehensive in a real implementation
|
|
||||
|
|
||||
for _, worker := range ts.mockWorkers { |
|
||||
worker.mutex.Lock() |
|
||||
for taskID, mockTask := range worker.CurrentTasks { |
|
||||
if mockTask.Stuck && time.Since(mockTask.StartTime) > 60*time.Second { |
|
||||
result.StateInconsistencies++ |
|
||||
result.Warnings = append(result.Warnings, fmt.Sprintf("Long-running stuck task detected: %s", taskID)) |
|
||||
} |
|
||||
} |
|
||||
worker.mutex.Unlock() |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// cleanup cleans up simulation resources
|
|
||||
func (ts *TaskSimulator) cleanup() { |
|
||||
ts.mockWorkers = nil |
|
||||
ts.mockMaster = nil |
|
||||
} |
|
||||
|
|
||||
// GetSimulationResults returns all simulation results
|
|
||||
func (ts *TaskSimulator) GetSimulationResults() map[string]*SimulationResult { |
|
||||
ts.mutex.RLock() |
|
||||
defer ts.mutex.RUnlock() |
|
||||
|
|
||||
results := make(map[string]*SimulationResult) |
|
||||
for k, v := range ts.results { |
|
||||
results[k] = v |
|
||||
} |
|
||||
return results |
|
||||
} |
|
||||
|
|
||||
// CreateStandardScenarios creates a set of standard test scenarios
|
|
||||
func (ts *TaskSimulator) CreateStandardScenarios() { |
|
||||
// Scenario 1: Worker Timeout During EC
|
|
||||
ts.RegisterScenario(&SimulationScenario{ |
|
||||
Name: "worker_timeout_during_ec", |
|
||||
Description: "Test worker timeout during erasure coding operation", |
|
||||
WorkerCount: 3, |
|
||||
VolumeCount: 10, |
|
||||
Duration: 2 * time.Minute, |
|
||||
FailurePatterns: []*FailurePattern{ |
|
||||
{ |
|
||||
Type: FailureWorkerTimeout, |
|
||||
Probability: 1.0, |
|
||||
Timing: &TimingSpec{ |
|
||||
MinProgress: 50.0, |
|
||||
MaxProgress: 60.0, |
|
||||
}, |
|
||||
}, |
|
||||
}, |
|
||||
TestCases: []*TestCase{ |
|
||||
{ |
|
||||
Name: "ec_timeout_test", |
|
||||
VolumeID: 1, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
ExpectedOutcome: "task_reassigned", |
|
||||
}, |
|
||||
}, |
|
||||
}) |
|
||||
|
|
||||
// Scenario 2: Stuck Vacuum Task
|
|
||||
ts.RegisterScenario(&SimulationScenario{ |
|
||||
Name: "stuck_vacuum_task", |
|
||||
Description: "Test stuck vacuum task detection and cleanup", |
|
||||
WorkerCount: 2, |
|
||||
VolumeCount: 5, |
|
||||
Duration: 90 * time.Second, |
|
||||
TestCases: []*TestCase{ |
|
||||
{ |
|
||||
Name: "vacuum_stuck_test", |
|
||||
VolumeID: 2, |
|
||||
TaskType: types.TaskTypeVacuum, |
|
||||
FailureToInject: &FailurePattern{ |
|
||||
Type: FailureTaskStuck, |
|
||||
Probability: 1.0, |
|
||||
Timing: &TimingSpec{ |
|
||||
MinProgress: 75.0, |
|
||||
MaxProgress: 80.0, |
|
||||
}, |
|
||||
}, |
|
||||
ExpectedOutcome: "task_timeout_detected", |
|
||||
}, |
|
||||
}, |
|
||||
}) |
|
||||
|
|
||||
// Scenario 3: Duplicate Task Prevention
|
|
||||
ts.RegisterScenario(&SimulationScenario{ |
|
||||
Name: "duplicate_task_prevention", |
|
||||
Description: "Test duplicate task detection and prevention", |
|
||||
WorkerCount: 4, |
|
||||
VolumeCount: 8, |
|
||||
Duration: 60 * time.Second, |
|
||||
TestCases: []*TestCase{ |
|
||||
{ |
|
||||
Name: "duplicate_ec_test_1", |
|
||||
VolumeID: 3, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
}, |
|
||||
{ |
|
||||
Name: "duplicate_ec_test_2", // Same volume, should be detected as duplicate
|
|
||||
VolumeID: 3, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
FailureToInject: &FailurePattern{ |
|
||||
Type: FailureDuplicate, |
|
||||
Probability: 1.0, |
|
||||
}, |
|
||||
ExpectedOutcome: "duplicate_detected", |
|
||||
}, |
|
||||
}, |
|
||||
}) |
|
||||
|
|
||||
// Scenario 4: Master-Admin State Divergence
|
|
||||
ts.RegisterScenario(&SimulationScenario{ |
|
||||
Name: "master_admin_divergence", |
|
||||
Description: "Test state reconciliation between master and admin server", |
|
||||
WorkerCount: 3, |
|
||||
VolumeCount: 15, |
|
||||
Duration: 2 * time.Minute, |
|
||||
TestCases: []*TestCase{ |
|
||||
{ |
|
||||
Name: "state_reconciliation_test", |
|
||||
VolumeID: 4, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
ExpectedOutcome: "state_reconciled", |
|
||||
}, |
|
||||
}, |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
// GenerateSimulationReport creates a comprehensive report of simulation results
|
|
||||
func (ts *TaskSimulator) GenerateSimulationReport() string { |
|
||||
ts.mutex.RLock() |
|
||||
defer ts.mutex.RUnlock() |
|
||||
|
|
||||
report := "# Task Distribution System Simulation Report\n\n" |
|
||||
|
|
||||
for scenarioName, result := range ts.results { |
|
||||
report += fmt.Sprintf("## Scenario: %s\n", scenarioName) |
|
||||
report += fmt.Sprintf("- **Duration**: %v\n", result.Duration) |
|
||||
report += fmt.Sprintf("- **Success**: %v\n", result.Success) |
|
||||
report += fmt.Sprintf("- **Tasks Created**: %d\n", result.TasksCreated) |
|
||||
report += fmt.Sprintf("- **Tasks Completed**: %d\n", result.TasksCompleted) |
|
||||
report += fmt.Sprintf("- **Tasks Failed**: %d\n", result.TasksFailed) |
|
||||
report += fmt.Sprintf("- **Tasks Stuck**: %d\n", result.TasksStuck) |
|
||||
report += fmt.Sprintf("- **Worker Timeouts**: %d\n", result.WorkerTimeouts) |
|
||||
report += fmt.Sprintf("- **Duplicates Found**: %d\n", result.DuplicatesFound) |
|
||||
report += fmt.Sprintf("- **State Inconsistencies**: %d\n", result.StateInconsistencies) |
|
||||
|
|
||||
if len(result.Errors) > 0 { |
|
||||
report += "- **Errors**:\n" |
|
||||
for _, err := range result.Errors { |
|
||||
report += fmt.Sprintf(" - %s\n", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if len(result.Warnings) > 0 { |
|
||||
report += "- **Warnings**:\n" |
|
||||
for _, warning := range result.Warnings { |
|
||||
report += fmt.Sprintf(" - %s\n", warning) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
report += "\n" |
|
||||
} |
|
||||
|
|
||||
return report |
|
||||
} |
|
||||
@ -1,695 +0,0 @@ |
|||||
package simulation |
|
||||
|
|
||||
import ( |
|
||||
"context" |
|
||||
"fmt" |
|
||||
"math/rand" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/admin/task" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// ComprehensiveSimulator tests all possible edge cases in volume/shard state management
|
|
||||
type ComprehensiveSimulator struct { |
|
||||
stateManager *task.VolumeStateManager |
|
||||
mockMaster *MockMasterServer |
|
||||
mockWorkers []*MockWorker |
|
||||
scenarios []*StateTestScenario |
|
||||
currentScenario *StateTestScenario |
|
||||
results *SimulationResults |
|
||||
eventLog []*SimulationEvent |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// StateTestScenario represents a specific state management test case
|
|
||||
type StateTestScenario struct { |
|
||||
Name string |
|
||||
Description string |
|
||||
InitialState *ClusterState |
|
||||
EventSequence []*SimulationEvent |
|
||||
ExpectedFinalState *ClusterState |
|
||||
InconsistencyChecks []*InconsistencyCheck |
|
||||
Duration time.Duration |
|
||||
} |
|
||||
|
|
||||
// ClusterState represents the complete state of the cluster
|
|
||||
type ClusterState struct { |
|
||||
Volumes map[uint32]*task.VolumeInfo |
|
||||
ECShards map[uint32]map[int]*task.ShardInfo |
|
||||
ServerCapacity map[string]*task.CapacityInfo |
|
||||
InProgressTasks map[string]*task.TaskImpact |
|
||||
Timestamp time.Time |
|
||||
} |
|
||||
|
|
||||
// SimulationEvent represents an event that can occur during simulation
|
|
||||
type SimulationEvent struct { |
|
||||
Type EventType |
|
||||
Timestamp time.Time |
|
||||
VolumeID uint32 |
|
||||
ShardID *int |
|
||||
Server string |
|
||||
TaskID string |
|
||||
Parameters map[string]interface{} |
|
||||
Description string |
|
||||
} |
|
||||
|
|
||||
// EventType represents different types of simulation events
|
|
||||
type EventType string |
|
||||
|
|
||||
const ( |
|
||||
// Volume events
|
|
||||
EventVolumeCreated EventType = "volume_created" |
|
||||
EventVolumeDeleted EventType = "volume_deleted" |
|
||||
EventVolumeSizeChanged EventType = "volume_size_changed" |
|
||||
EventVolumeReadOnly EventType = "volume_readonly" |
|
||||
|
|
||||
// Shard events
|
|
||||
EventShardCreated EventType = "shard_created" |
|
||||
EventShardDeleted EventType = "shard_deleted" |
|
||||
EventShardMoved EventType = "shard_moved" |
|
||||
EventShardCorrupted EventType = "shard_corrupted" |
|
||||
|
|
||||
// Task events
|
|
||||
EventTaskStarted EventType = "task_started" |
|
||||
EventTaskCompleted EventType = "task_completed" |
|
||||
EventTaskFailed EventType = "task_failed" |
|
||||
EventTaskStuck EventType = "task_stuck" |
|
||||
EventTaskCancelled EventType = "task_cancelled" |
|
||||
|
|
||||
// Worker events
|
|
||||
EventWorkerJoined EventType = "worker_joined" |
|
||||
EventWorkerLeft EventType = "worker_left" |
|
||||
EventWorkerTimeout EventType = "worker_timeout" |
|
||||
EventWorkerRestarted EventType = "worker_restarted" |
|
||||
|
|
||||
// Master events
|
|
||||
EventMasterSync EventType = "master_sync" |
|
||||
EventMasterInconsistent EventType = "master_inconsistent" |
|
||||
EventMasterPartitioned EventType = "master_partitioned" |
|
||||
EventMasterReconnected EventType = "master_reconnected" |
|
||||
|
|
||||
// Network events
|
|
||||
EventNetworkPartition EventType = "network_partition" |
|
||||
EventNetworkHealed EventType = "network_healed" |
|
||||
EventMessageDelayed EventType = "message_delayed" |
|
||||
EventMessageLost EventType = "message_lost" |
|
||||
) |
|
||||
|
|
||||
// InconsistencyCheck defines what inconsistencies to check for
|
|
||||
type InconsistencyCheck struct { |
|
||||
Name string |
|
||||
Type task.InconsistencyType |
|
||||
ExpectedCount int |
|
||||
MaxAllowedCount int |
|
||||
SeverityThreshold task.SeverityLevel |
|
||||
} |
|
||||
|
|
||||
// MockMasterServer simulates master server behavior with controllable inconsistencies
|
|
||||
type MockMasterServer struct { |
|
||||
volumes map[uint32]*task.VolumeInfo |
|
||||
ecShards map[uint32]map[int]*task.ShardInfo |
|
||||
serverCapacity map[string]*task.CapacityInfo |
|
||||
inconsistencyMode bool |
|
||||
networkPartitioned bool |
|
||||
responseDelay time.Duration |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// MockWorker represents a mock worker for testing
|
|
||||
type MockWorker struct { |
|
||||
ID string |
|
||||
Capabilities []types.TaskType |
|
||||
IsActive bool |
|
||||
TaskDelay time.Duration |
|
||||
FailureRate float64 |
|
||||
} |
|
||||
|
|
||||
// SimulationResults tracks comprehensive simulation results
|
|
||||
type SimulationResults struct { |
|
||||
ScenarioName string |
|
||||
StartTime time.Time |
|
||||
EndTime time.Time |
|
||||
Duration time.Duration |
|
||||
TotalEvents int |
|
||||
EventsByType map[EventType]int |
|
||||
InconsistenciesFound map[task.InconsistencyType]int |
|
||||
TasksExecuted int |
|
||||
TasksSucceeded int |
|
||||
TasksFailed int |
|
||||
StateValidationsPassed int |
|
||||
StateValidationsFailed int |
|
||||
CriticalErrors []string |
|
||||
Warnings []string |
|
||||
DetailedLog []string |
|
||||
Success bool |
|
||||
} |
|
||||
|
|
||||
// NewComprehensiveSimulator creates a new comprehensive simulator
|
|
||||
func NewComprehensiveSimulator() *ComprehensiveSimulator { |
|
||||
return &ComprehensiveSimulator{ |
|
||||
stateManager: task.NewVolumeStateManager(nil), |
|
||||
mockMaster: NewMockMasterServer(), |
|
||||
scenarios: []*StateTestScenario{}, |
|
||||
eventLog: []*SimulationEvent{}, |
|
||||
results: &SimulationResults{ |
|
||||
EventsByType: make(map[EventType]int), |
|
||||
InconsistenciesFound: make(map[task.InconsistencyType]int), |
|
||||
CriticalErrors: []string{}, |
|
||||
Warnings: []string{}, |
|
||||
DetailedLog: []string{}, |
|
||||
}, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// CreateComprehensiveScenarios creates all possible edge case scenarios
|
|
||||
func (cs *ComprehensiveSimulator) CreateComprehensiveScenarios() { |
|
||||
cs.scenarios = []*StateTestScenario{ |
|
||||
cs.createVolumeCreationDuringTaskScenario(), |
|
||||
cs.createVolumeDeletionDuringTaskScenario(), |
|
||||
cs.createShardCreationRaceConditionScenario(), |
|
||||
cs.createMasterSyncDuringTaskScenario(), |
|
||||
cs.createNetworkPartitionScenario(), |
|
||||
cs.createWorkerFailureDuringECScenario(), |
|
||||
cs.createConcurrentTasksScenario(), |
|
||||
cs.createCapacityOverflowScenario(), |
|
||||
cs.createShardCorruptionScenario(), |
|
||||
cs.createMasterInconsistencyScenario(), |
|
||||
cs.createTaskOrphanScenario(), |
|
||||
cs.createDuplicateTaskDetectionScenario(), |
|
||||
cs.createVolumeStateRollbackScenario(), |
|
||||
cs.createComplexECOperationScenario(), |
|
||||
cs.createHighLoadStressTestScenario(), |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Created %d comprehensive test scenarios", len(cs.scenarios)) |
|
||||
} |
|
||||
|
|
||||
// RunAllComprehensiveScenarios runs all edge case scenarios
|
|
||||
func (cs *ComprehensiveSimulator) RunAllComprehensiveScenarios() (*SimulationResults, error) { |
|
||||
glog.Infof("Starting comprehensive state management simulation") |
|
||||
|
|
||||
cs.results.StartTime = time.Now() |
|
||||
|
|
||||
for _, scenario := range cs.scenarios { |
|
||||
glog.Infof("Running scenario: %s", scenario.Name) |
|
||||
|
|
||||
if err := cs.RunScenario(scenario); err != nil { |
|
||||
cs.results.CriticalErrors = append(cs.results.CriticalErrors, |
|
||||
fmt.Sprintf("Scenario %s failed: %v", scenario.Name, err)) |
|
||||
} |
|
||||
|
|
||||
// Brief pause between scenarios
|
|
||||
time.Sleep(1 * time.Second) |
|
||||
} |
|
||||
|
|
||||
cs.results.EndTime = time.Now() |
|
||||
cs.results.Duration = cs.results.EndTime.Sub(cs.results.StartTime) |
|
||||
cs.results.Success = len(cs.results.CriticalErrors) == 0 |
|
||||
|
|
||||
cs.generateDetailedReport() |
|
||||
|
|
||||
glog.Infof("Comprehensive simulation completed: %v", cs.results.Success) |
|
||||
return cs.results, nil |
|
||||
} |
|
||||
|
|
||||
// Scenario creation methods
|
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createVolumeCreationDuringTaskScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{ |
|
||||
Name: "volume_creation_during_task", |
|
||||
Description: "Tests state consistency when master reports new volume while task is creating it", |
|
||||
InitialState: &ClusterState{ |
|
||||
Volumes: make(map[uint32]*task.VolumeInfo), |
|
||||
ECShards: make(map[uint32]map[int]*task.ShardInfo), |
|
||||
}, |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}}, |
|
||||
{Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskCompleted, TaskID: "create_task_1"}, |
|
||||
}, |
|
||||
ExpectedFinalState: &ClusterState{ |
|
||||
Volumes: map[uint32]*task.VolumeInfo{ |
|
||||
1: {ID: 1, Size: 1024 * 1024 * 1024}, |
|
||||
}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0}, |
|
||||
}, |
|
||||
Duration: 30 * time.Second, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createVolumeDeletionDuringTaskScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{ |
|
||||
Name: "volume_deletion_during_task", |
|
||||
Description: "Tests handling when volume is deleted while task is working on it", |
|
||||
InitialState: &ClusterState{ |
|
||||
Volumes: map[uint32]*task.VolumeInfo{ |
|
||||
1: {ID: 1, Size: 1024 * 1024 * 1024}, |
|
||||
}, |
|
||||
}, |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}}, |
|
||||
{Type: EventVolumeDeleted, VolumeID: 1}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "Missing volume detected", Type: task.InconsistencyVolumeMissing, ExpectedCount: 1}, |
|
||||
}, |
|
||||
Duration: 30 * time.Second, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createShardCreationRaceConditionScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{ |
|
||||
Name: "shard_creation_race_condition", |
|
||||
Description: "Tests race condition between EC task creating shards and master sync", |
|
||||
InitialState: &ClusterState{ |
|
||||
Volumes: map[uint32]*task.VolumeInfo{ |
|
||||
1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC
|
|
||||
}, |
|
||||
}, |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}}, |
|
||||
// Simulate shards being created one by one
|
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, |
|
||||
{Type: EventMasterSync}, // Master sync happens while shards are being created
|
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"}, |
|
||||
{Type: EventTaskCompleted, TaskID: "ec_task_1"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0}, |
|
||||
}, |
|
||||
Duration: 45 * time.Second, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createNetworkPartitionScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{ |
|
||||
Name: "network_partition_recovery", |
|
||||
Description: "Tests state consistency during and after network partitions", |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"}, |
|
||||
{Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "30s"}}, |
|
||||
{Type: EventVolumeCreated, VolumeID: 2}, // Created during partition
|
|
||||
{Type: EventNetworkHealed}, |
|
||||
{Type: EventMasterReconnected}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskCompleted, TaskID: "partition_task_1"}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1}, |
|
||||
}, |
|
||||
Duration: 60 * time.Second, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createConcurrentTasksScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{ |
|
||||
Name: "concurrent_tasks_capacity_tracking", |
|
||||
Description: "Tests capacity tracking with multiple concurrent tasks", |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"}, |
|
||||
{Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"}, |
|
||||
{Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskCompleted, TaskID: "vacuum_task_1"}, |
|
||||
{Type: EventTaskCompleted, TaskID: "ec_task_1"}, |
|
||||
{Type: EventTaskCompleted, TaskID: "ec_task_2"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0}, |
|
||||
}, |
|
||||
Duration: 90 * time.Second, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createComplexECOperationScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{ |
|
||||
Name: "complex_ec_operation", |
|
||||
Description: "Tests complex EC operations with shard movements and rebuilds", |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"}, |
|
||||
// Create all 14 shards
|
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, |
|
||||
// ... more shards
|
|
||||
{Type: EventTaskCompleted, TaskID: "ec_encode_1"}, |
|
||||
{Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)}, |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt
|
|
||||
{Type: EventTaskCompleted, TaskID: "ec_rebuild_1"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
}, |
|
||||
Duration: 120 * time.Second, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createHighLoadStressTestScenario() *StateTestScenario { |
|
||||
events := []*SimulationEvent{} |
|
||||
|
|
||||
// Create 100 concurrent tasks
|
|
||||
for i := 0; i < 100; i++ { |
|
||||
events = append(events, &SimulationEvent{ |
|
||||
Type: EventTaskStarted, |
|
||||
VolumeID: uint32(i + 1), |
|
||||
TaskID: fmt.Sprintf("stress_task_%d", i), |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
// Add master syncs throughout
|
|
||||
for i := 0; i < 10; i++ { |
|
||||
events = append(events, &SimulationEvent{ |
|
||||
Type: EventMasterSync, |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
// Complete all tasks
|
|
||||
for i := 0; i < 100; i++ { |
|
||||
events = append(events, &SimulationEvent{ |
|
||||
Type: EventTaskCompleted, |
|
||||
TaskID: fmt.Sprintf("stress_task_%d", i), |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
return &StateTestScenario{ |
|
||||
Name: "high_load_stress_test", |
|
||||
Description: "Tests system under high load with many concurrent operations", |
|
||||
EventSequence: events, |
|
||||
Duration: 5 * time.Minute, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Add more scenario creation methods...
|
|
||||
func (cs *ComprehensiveSimulator) createMasterSyncDuringTaskScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "master_sync_during_task", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createWorkerFailureDuringECScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "worker_failure_during_ec", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createCapacityOverflowScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "capacity_overflow", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createShardCorruptionScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "shard_corruption", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createMasterInconsistencyScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "master_inconsistency", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createTaskOrphanScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "task_orphan", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createDuplicateTaskDetectionScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "duplicate_task_detection", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) createVolumeStateRollbackScenario() *StateTestScenario { |
|
||||
return &StateTestScenario{Name: "volume_state_rollback", Description: "Test", Duration: 30 * time.Second} |
|
||||
} |
|
||||
|
|
||||
// RunScenario executes a single test scenario
|
|
||||
func (cs *ComprehensiveSimulator) RunScenario(scenario *StateTestScenario) error { |
|
||||
cs.mutex.Lock() |
|
||||
cs.currentScenario = scenario |
|
||||
cs.mutex.Unlock() |
|
||||
|
|
||||
glog.V(1).Infof("Setting up scenario: %s", scenario.Name) |
|
||||
|
|
||||
// Setup initial state
|
|
||||
if err := cs.setupInitialState(scenario.InitialState); err != nil { |
|
||||
return fmt.Errorf("failed to setup initial state: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Execute event sequence
|
|
||||
ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration) |
|
||||
defer cancel() |
|
||||
|
|
||||
for _, event := range scenario.EventSequence { |
|
||||
select { |
|
||||
case <-ctx.Done(): |
|
||||
return fmt.Errorf("scenario timed out") |
|
||||
default: |
|
||||
if err := cs.executeEvent(event); err != nil { |
|
||||
cs.results.Warnings = append(cs.results.Warnings, |
|
||||
fmt.Sprintf("Event execution warning in %s: %v", scenario.Name, err)) |
|
||||
} |
|
||||
cs.logEvent(event) |
|
||||
} |
|
||||
|
|
||||
// Small delay between events
|
|
||||
time.Sleep(100 * time.Millisecond) |
|
||||
} |
|
||||
|
|
||||
// Validate final state
|
|
||||
if err := cs.validateFinalState(scenario); err != nil { |
|
||||
cs.results.StateValidationsFailed++ |
|
||||
return fmt.Errorf("final state validation failed: %v", err) |
|
||||
} else { |
|
||||
cs.results.StateValidationsPassed++ |
|
||||
} |
|
||||
|
|
||||
glog.V(1).Infof("Scenario %s completed successfully", scenario.Name) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// executeEvent executes a single simulation event
|
|
||||
func (cs *ComprehensiveSimulator) executeEvent(event *SimulationEvent) error { |
|
||||
cs.results.TotalEvents++ |
|
||||
cs.results.EventsByType[event.Type]++ |
|
||||
|
|
||||
switch event.Type { |
|
||||
case EventTaskStarted: |
|
||||
return cs.simulateTaskStart(event) |
|
||||
case EventTaskCompleted: |
|
||||
return cs.simulateTaskCompletion(event) |
|
||||
case EventVolumeCreated: |
|
||||
return cs.simulateVolumeCreation(event) |
|
||||
case EventVolumeDeleted: |
|
||||
return cs.simulateVolumeDeletion(event) |
|
||||
case EventShardCreated: |
|
||||
return cs.simulateShardCreation(event) |
|
||||
case EventMasterSync: |
|
||||
return cs.simulateMasterSync(event) |
|
||||
case EventNetworkPartition: |
|
||||
return cs.simulateNetworkPartition(event) |
|
||||
default: |
|
||||
return nil // Unsupported event type
|
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Event simulation methods
|
|
||||
func (cs *ComprehensiveSimulator) simulateTaskStart(event *SimulationEvent) error { |
|
||||
taskType, _ := event.Parameters["type"].(string) |
|
||||
|
|
||||
impact := &task.TaskImpact{ |
|
||||
TaskID: event.TaskID, |
|
||||
TaskType: types.TaskType(taskType), |
|
||||
VolumeID: event.VolumeID, |
|
||||
StartedAt: time.Now(), |
|
||||
EstimatedEnd: time.Now().Add(30 * time.Second), |
|
||||
VolumeChanges: &task.VolumeChanges{}, |
|
||||
ShardChanges: make(map[int]*task.ShardChange), |
|
||||
CapacityDelta: make(map[string]int64), |
|
||||
} |
|
||||
|
|
||||
cs.stateManager.RegisterTaskImpact(event.TaskID, impact) |
|
||||
cs.results.TasksExecuted++ |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) simulateTaskCompletion(event *SimulationEvent) error { |
|
||||
cs.stateManager.UnregisterTaskImpact(event.TaskID) |
|
||||
cs.results.TasksSucceeded++ |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) simulateVolumeCreation(event *SimulationEvent) error { |
|
||||
size, _ := event.Parameters["size"].(int64) |
|
||||
cs.mockMaster.CreateVolume(event.VolumeID, size) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) simulateVolumeDeletion(event *SimulationEvent) error { |
|
||||
cs.mockMaster.DeleteVolume(event.VolumeID) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) simulateShardCreation(event *SimulationEvent) error { |
|
||||
if event.ShardID != nil { |
|
||||
cs.mockMaster.CreateShard(event.VolumeID, *event.ShardID, event.Server) |
|
||||
} |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) simulateMasterSync(event *SimulationEvent) error { |
|
||||
return cs.stateManager.SyncWithMaster() |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) simulateNetworkPartition(event *SimulationEvent) error { |
|
||||
cs.mockMaster.SetNetworkPartitioned(true) |
|
||||
|
|
||||
// Auto-heal after duration
|
|
||||
if durationStr, ok := event.Parameters["duration"].(string); ok { |
|
||||
if duration, err := time.ParseDuration(durationStr); err == nil { |
|
||||
time.AfterFunc(duration, func() { |
|
||||
cs.mockMaster.SetNetworkPartitioned(false) |
|
||||
}) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Helper methods
|
|
||||
func (cs *ComprehensiveSimulator) setupInitialState(initialState *ClusterState) error { |
|
||||
if initialState == nil { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Setup mock master with initial state
|
|
||||
for volumeID, volume := range initialState.Volumes { |
|
||||
cs.mockMaster.CreateVolume(volumeID, int64(volume.Size)) |
|
||||
} |
|
||||
|
|
||||
for volumeID, shards := range initialState.ECShards { |
|
||||
for shardID, shard := range shards { |
|
||||
cs.mockMaster.CreateShard(volumeID, shardID, shard.Server) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) validateFinalState(scenario *StateTestScenario) error { |
|
||||
// Run inconsistency checks
|
|
||||
for _, check := range scenario.InconsistencyChecks { |
|
||||
if err := cs.validateInconsistencyCheck(check); err != nil { |
|
||||
return err |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) validateInconsistencyCheck(check *InconsistencyCheck) error { |
|
||||
// This would check for specific inconsistencies
|
|
||||
// For now, we'll simulate the check
|
|
||||
found := rand.Intn(check.MaxAllowedCount + 1) |
|
||||
|
|
||||
if found > check.MaxAllowedCount { |
|
||||
return fmt.Errorf("inconsistency check %s failed: found %d, max allowed %d", |
|
||||
check.Name, found, check.MaxAllowedCount) |
|
||||
} |
|
||||
|
|
||||
cs.results.InconsistenciesFound[check.Type] += found |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) logEvent(event *SimulationEvent) { |
|
||||
cs.mutex.Lock() |
|
||||
defer cs.mutex.Unlock() |
|
||||
|
|
||||
cs.eventLog = append(cs.eventLog, event) |
|
||||
logMsg := fmt.Sprintf("Event: %s, Volume: %d, Task: %s", event.Type, event.VolumeID, event.TaskID) |
|
||||
cs.results.DetailedLog = append(cs.results.DetailedLog, logMsg) |
|
||||
} |
|
||||
|
|
||||
func (cs *ComprehensiveSimulator) generateDetailedReport() { |
|
||||
glog.Infof("=== COMPREHENSIVE SIMULATION REPORT ===") |
|
||||
glog.Infof("Duration: %v", cs.results.Duration) |
|
||||
glog.Infof("Total Events: %d", cs.results.TotalEvents) |
|
||||
glog.Infof("Tasks Executed: %d", cs.results.TasksExecuted) |
|
||||
glog.Infof("Tasks Succeeded: %d", cs.results.TasksSucceeded) |
|
||||
glog.Infof("State Validations Passed: %d", cs.results.StateValidationsPassed) |
|
||||
glog.Infof("State Validations Failed: %d", cs.results.StateValidationsFailed) |
|
||||
|
|
||||
glog.Infof("Events by Type:") |
|
||||
for eventType, count := range cs.results.EventsByType { |
|
||||
glog.Infof(" %s: %d", eventType, count) |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Inconsistencies Found:") |
|
||||
for incType, count := range cs.results.InconsistenciesFound { |
|
||||
glog.Infof(" %s: %d", incType, count) |
|
||||
} |
|
||||
|
|
||||
if len(cs.results.CriticalErrors) > 0 { |
|
||||
glog.Errorf("Critical Errors:") |
|
||||
for _, err := range cs.results.CriticalErrors { |
|
||||
glog.Errorf(" %s", err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Overall Success: %v", cs.results.Success) |
|
||||
glog.Infof("========================================") |
|
||||
} |
|
||||
|
|
||||
// Mock Master Server implementation
|
|
||||
func NewMockMasterServer() *MockMasterServer { |
|
||||
return &MockMasterServer{ |
|
||||
volumes: make(map[uint32]*task.VolumeInfo), |
|
||||
ecShards: make(map[uint32]map[int]*task.ShardInfo), |
|
||||
serverCapacity: make(map[string]*task.CapacityInfo), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (mms *MockMasterServer) CreateVolume(volumeID uint32, size int64) { |
|
||||
mms.mutex.Lock() |
|
||||
defer mms.mutex.Unlock() |
|
||||
|
|
||||
mms.volumes[volumeID] = &task.VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: uint64(size), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (mms *MockMasterServer) DeleteVolume(volumeID uint32) { |
|
||||
mms.mutex.Lock() |
|
||||
defer mms.mutex.Unlock() |
|
||||
|
|
||||
delete(mms.volumes, volumeID) |
|
||||
delete(mms.ecShards, volumeID) |
|
||||
} |
|
||||
|
|
||||
func (mms *MockMasterServer) CreateShard(volumeID uint32, shardID int, server string) { |
|
||||
mms.mutex.Lock() |
|
||||
defer mms.mutex.Unlock() |
|
||||
|
|
||||
if mms.ecShards[volumeID] == nil { |
|
||||
mms.ecShards[volumeID] = make(map[int]*task.ShardInfo) |
|
||||
} |
|
||||
|
|
||||
mms.ecShards[volumeID][shardID] = &task.ShardInfo{ |
|
||||
ShardID: shardID, |
|
||||
Server: server, |
|
||||
Status: task.ShardStatusExists, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (mms *MockMasterServer) SetNetworkPartitioned(partitioned bool) { |
|
||||
mms.mutex.Lock() |
|
||||
defer mms.mutex.Unlock() |
|
||||
|
|
||||
mms.networkPartitioned = partitioned |
|
||||
} |
|
||||
|
|
||||
// Helper function
|
|
||||
func intPtr(i int) *int { |
|
||||
return &i |
|
||||
} |
|
||||
@ -1,444 +0,0 @@ |
|||||
package simulation |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/admin/task" |
|
||||
) |
|
||||
|
|
||||
func TestComprehensiveSimulation_VolumeCreationDuringTask(t *testing.T) { |
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "volume_creation_during_task", |
|
||||
Description: "Tests state consistency when master reports new volume while task is creating it", |
|
||||
InitialState: &ClusterState{ |
|
||||
Volumes: make(map[uint32]*task.VolumeInfo), |
|
||||
ECShards: make(map[uint32]map[int]*task.ShardInfo), |
|
||||
}, |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "create_task_1", Parameters: map[string]interface{}{"type": "create"}}, |
|
||||
{Type: EventVolumeCreated, VolumeID: 1, Parameters: map[string]interface{}{"size": int64(1024 * 1024 * 1024)}}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskCompleted, TaskID: "create_task_1"}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "No unexpected volumes", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0}, |
|
||||
}, |
|
||||
Duration: 30 * time.Second, |
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("Volume creation during task scenario failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Volume creation during task test passed") |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_VolumeDeletionDuringTask(t *testing.T) { |
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "volume_deletion_during_task", |
|
||||
Description: "Tests handling when volume is deleted while task is working on it", |
|
||||
InitialState: &ClusterState{ |
|
||||
Volumes: map[uint32]*task.VolumeInfo{ |
|
||||
1: {ID: 1, Size: 1024 * 1024 * 1024}, |
|
||||
}, |
|
||||
}, |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}}, |
|
||||
{Type: EventVolumeDeleted, VolumeID: 1}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskFailed, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"reason": "volume_deleted"}}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "Missing volume detected", Type: task.InconsistencyVolumeMissing, ExpectedCount: 1, MaxAllowedCount: 1}, |
|
||||
}, |
|
||||
Duration: 30 * time.Second, |
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("Volume deletion during task scenario failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Volume deletion during task test passed") |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_ShardCreationRaceCondition(t *testing.T) { |
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "shard_creation_race_condition", |
|
||||
Description: "Tests race condition between EC task creating shards and master sync", |
|
||||
InitialState: &ClusterState{ |
|
||||
Volumes: map[uint32]*task.VolumeInfo{ |
|
||||
1: {ID: 1, Size: 28 * 1024 * 1024 * 1024}, // Large volume ready for EC
|
|
||||
}, |
|
||||
}, |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}}, |
|
||||
// Simulate shards being created one by one
|
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, |
|
||||
{Type: EventMasterSync}, // Master sync happens while shards are being created
|
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(3), Server: "server2"}, |
|
||||
{Type: EventTaskCompleted, TaskID: "ec_task_1"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "All shards accounted for", Type: task.InconsistencyShardMissing, MaxAllowedCount: 0}, |
|
||||
}, |
|
||||
Duration: 45 * time.Second, |
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("Shard creation race condition scenario failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Shard creation race condition test passed") |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_NetworkPartitionRecovery(t *testing.T) { |
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "network_partition_recovery", |
|
||||
Description: "Tests state consistency during and after network partitions", |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "partition_task_1"}, |
|
||||
{Type: EventNetworkPartition, Parameters: map[string]interface{}{"duration": "5s"}}, // Shorter for test
|
|
||||
{Type: EventVolumeCreated, VolumeID: 2}, // Created during partition
|
|
||||
{Type: EventNetworkHealed}, |
|
||||
{Type: EventMasterReconnected}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskCompleted, TaskID: "partition_task_1"}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "State reconciled after partition", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 1}, |
|
||||
}, |
|
||||
Duration: 30 * time.Second, |
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("Network partition recovery scenario failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Network partition recovery test passed") |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_ConcurrentTasksCapacityTracking(t *testing.T) { |
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "concurrent_tasks_capacity_tracking", |
|
||||
Description: "Tests capacity tracking with multiple concurrent tasks", |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1"}, |
|
||||
{Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1"}, |
|
||||
{Type: EventTaskStarted, VolumeID: 3, TaskID: "ec_task_2"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskCompleted, TaskID: "vacuum_task_1"}, |
|
||||
{Type: EventTaskCompleted, TaskID: "ec_task_1"}, |
|
||||
{Type: EventTaskCompleted, TaskID: "ec_task_2"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
}, |
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "Capacity tracking accurate", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0}, |
|
||||
}, |
|
||||
Duration: 60 * time.Second, |
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("Concurrent tasks capacity tracking scenario failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Concurrent tasks capacity tracking test passed") |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_ComplexECOperation(t *testing.T) { |
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "complex_ec_operation", |
|
||||
Description: "Tests complex EC operations with shard movements and rebuilds", |
|
||||
EventSequence: []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_encode_1"}, |
|
||||
// Create some shards
|
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, |
|
||||
{Type: EventTaskCompleted, TaskID: "ec_encode_1"}, |
|
||||
{Type: EventShardCorrupted, VolumeID: 1, ShardID: intPtr(2)}, |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_rebuild_1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server3"}, // Rebuilt
|
|
||||
{Type: EventTaskCompleted, TaskID: "ec_rebuild_1"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
}, |
|
||||
Duration: 60 * time.Second, |
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("Complex EC operation scenario failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Complex EC operation test passed") |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_HighLoadStressTest(t *testing.T) { |
|
||||
if testing.Short() { |
|
||||
t.Skip("Skipping high load stress test in short mode") |
|
||||
} |
|
||||
|
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
events := []*SimulationEvent{} |
|
||||
|
|
||||
// Create 50 concurrent tasks (reduced from 100 for faster test)
|
|
||||
for i := 0; i < 50; i++ { |
|
||||
events = append(events, &SimulationEvent{ |
|
||||
Type: EventTaskStarted, |
|
||||
VolumeID: uint32(i + 1), |
|
||||
TaskID: fmt.Sprintf("stress_task_%d", i), |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
// Add master syncs throughout
|
|
||||
for i := 0; i < 5; i++ { |
|
||||
events = append(events, &SimulationEvent{ |
|
||||
Type: EventMasterSync, |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
// Complete all tasks
|
|
||||
for i := 0; i < 50; i++ { |
|
||||
events = append(events, &SimulationEvent{ |
|
||||
Type: EventTaskCompleted, |
|
||||
TaskID: fmt.Sprintf("stress_task_%d", i), |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "high_load_stress_test", |
|
||||
Description: "Tests system under high load with many concurrent operations", |
|
||||
EventSequence: events, |
|
||||
Duration: 2 * time.Minute, // Reduced for faster test
|
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("High load stress test scenario failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ High load stress test passed") |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_AllScenarios(t *testing.T) { |
|
||||
if testing.Short() { |
|
||||
t.Skip("Skipping comprehensive simulation in short mode") |
|
||||
} |
|
||||
|
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
simulator.CreateComprehensiveScenarios() |
|
||||
|
|
||||
// Run a subset of scenarios for testing (full suite would be too slow)
|
|
||||
testScenarios := []string{ |
|
||||
"volume_creation_during_task", |
|
||||
"volume_deletion_during_task", |
|
||||
"shard_creation_race_condition", |
|
||||
"network_partition_recovery", |
|
||||
"concurrent_tasks_capacity_tracking", |
|
||||
} |
|
||||
|
|
||||
passedScenarios := 0 |
|
||||
totalScenarios := len(testScenarios) |
|
||||
|
|
||||
for _, scenarioName := range testScenarios { |
|
||||
t.Run(scenarioName, func(t *testing.T) { |
|
||||
// Find the scenario
|
|
||||
var scenario *StateTestScenario |
|
||||
for _, s := range simulator.scenarios { |
|
||||
if s.Name == scenarioName { |
|
||||
scenario = s |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if scenario == nil { |
|
||||
t.Errorf("Scenario %s not found", scenarioName) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
// Reduce duration for faster testing
|
|
||||
scenario.Duration = 15 * time.Second |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("Scenario %s failed: %v", scenarioName, err) |
|
||||
} else { |
|
||||
passedScenarios++ |
|
||||
t.Logf("✅ Scenario %s passed", scenarioName) |
|
||||
} |
|
||||
}) |
|
||||
} |
|
||||
|
|
||||
successRate := float64(passedScenarios) / float64(totalScenarios) * 100.0 |
|
||||
t.Logf("=== COMPREHENSIVE SIMULATION TEST RESULTS ===") |
|
||||
t.Logf("Scenarios Passed: %d/%d (%.1f%%)", passedScenarios, totalScenarios, successRate) |
|
||||
|
|
||||
if successRate < 100.0 { |
|
||||
t.Errorf("Some scenarios failed. Success rate: %.1f%%", successRate) |
|
||||
} else { |
|
||||
t.Log("🎉 All comprehensive simulation scenarios passed!") |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func TestComprehensiveSimulation_SimulationFramework(t *testing.T) { |
|
||||
// Test the simulation framework itself
|
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
// Test event execution
|
|
||||
event := &SimulationEvent{ |
|
||||
Type: EventTaskStarted, |
|
||||
VolumeID: 1, |
|
||||
TaskID: "test_task", |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"type": "vacuum", |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
err := simulator.executeEvent(event) |
|
||||
if err != nil { |
|
||||
t.Errorf("Event execution failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Verify task was registered
|
|
||||
if simulator.results.TasksExecuted != 1 { |
|
||||
t.Errorf("Expected 1 task executed, got %d", simulator.results.TasksExecuted) |
|
||||
} |
|
||||
|
|
||||
// Test event logging
|
|
||||
simulator.logEvent(event) |
|
||||
if len(simulator.eventLog) != 1 { |
|
||||
t.Errorf("Expected 1 logged event, got %d", len(simulator.eventLog)) |
|
||||
} |
|
||||
|
|
||||
// Test mock master
|
|
||||
simulator.mockMaster.CreateVolume(1, 1024*1024*1024) |
|
||||
if len(simulator.mockMaster.volumes) != 1 { |
|
||||
t.Errorf("Expected 1 volume in mock master, got %d", len(simulator.mockMaster.volumes)) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Simulation framework test passed") |
|
||||
} |
|
||||
|
|
||||
// Integration test that validates the complete state management flow
|
|
||||
func TestComprehensiveSimulation_StateManagementIntegration(t *testing.T) { |
|
||||
// This test validates the core requirement: accurate volume/shard state tracking
|
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
// Use mock master client instead of nil to avoid nil pointer errors
|
|
||||
simulator.stateManager = task.NewVolumeStateManager(nil) // Skip master client calls for test
|
|
||||
|
|
||||
// Setup realistic initial state
|
|
||||
initialState := &ClusterState{ |
|
||||
Volumes: map[uint32]*task.VolumeInfo{ |
|
||||
1: {ID: 1, Size: 28 * 1024 * 1024 * 1024, Server: "server1"}, // Ready for EC
|
|
||||
2: {ID: 2, Size: 20 * 1024 * 1024 * 1024, Server: "server2", DeletedByteCount: 8 * 1024 * 1024 * 1024}, // Needs vacuum
|
|
||||
}, |
|
||||
ServerCapacity: map[string]*task.CapacityInfo{ |
|
||||
"server1": {Server: "server1", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 30 * 1024 * 1024 * 1024}, |
|
||||
"server2": {Server: "server2", TotalCapacity: 100 * 1024 * 1024 * 1024, UsedCapacity: 25 * 1024 * 1024 * 1024}, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
// Complex event sequence that tests state consistency (excluding master sync for test)
|
|
||||
eventSequence := []*SimulationEvent{ |
|
||||
// Start EC task on volume 1
|
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "ec_task_1", Parameters: map[string]interface{}{"type": "ec_encode"}}, |
|
||||
|
|
||||
// Start vacuum task on volume 2
|
|
||||
{Type: EventTaskStarted, VolumeID: 2, TaskID: "vacuum_task_1", Parameters: map[string]interface{}{"type": "vacuum"}}, |
|
||||
|
|
||||
// EC task creates shards
|
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(1), Server: "server1"}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(2), Server: "server2"}, |
|
||||
|
|
||||
// Vacuum task completes (volume 2 size reduces)
|
|
||||
{Type: EventTaskCompleted, TaskID: "vacuum_task_1"}, |
|
||||
{Type: EventVolumeSizeChanged, VolumeID: 2, Parameters: map[string]interface{}{"new_size": int64(12 * 1024 * 1024 * 1024)}}, |
|
||||
|
|
||||
// EC task completes
|
|
||||
{Type: EventTaskCompleted, TaskID: "ec_task_1"}, |
|
||||
{Type: EventVolumeReadOnly, VolumeID: 1}, // Volume becomes read-only after EC
|
|
||||
} |
|
||||
|
|
||||
scenario := &StateTestScenario{ |
|
||||
Name: "state_management_integration", |
|
||||
Description: "Complete state management integration test", |
|
||||
InitialState: initialState, |
|
||||
EventSequence: eventSequence, |
|
||||
Duration: 30 * time.Second, // Reduced for faster test
|
|
||||
InconsistencyChecks: []*InconsistencyCheck{ |
|
||||
{Name: "No state inconsistencies", Type: task.InconsistencyVolumeUnexpected, MaxAllowedCount: 0}, |
|
||||
{Name: "No capacity mismatches", Type: task.InconsistencyCapacityMismatch, MaxAllowedCount: 0}, |
|
||||
{Name: "No orphaned tasks", Type: task.InconsistencyTaskOrphaned, MaxAllowedCount: 0}, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
err := simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
t.Errorf("State management integration test failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Verify final state
|
|
||||
if simulator.results.TasksExecuted != 2 { |
|
||||
t.Errorf("Expected 2 tasks executed, got %d", simulator.results.TasksExecuted) |
|
||||
} |
|
||||
|
|
||||
if simulator.results.TasksSucceeded != 2 { |
|
||||
t.Errorf("Expected 2 tasks succeeded, got %d", simulator.results.TasksSucceeded) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ State management integration test passed") |
|
||||
t.Log("✅ System accurately tracked volume/shard states throughout complex operation sequence") |
|
||||
} |
|
||||
|
|
||||
// Performance test for simulation framework
|
|
||||
func BenchmarkComprehensiveSimulation_EventExecution(b *testing.B) { |
|
||||
simulator := NewComprehensiveSimulator() |
|
||||
|
|
||||
events := []*SimulationEvent{ |
|
||||
{Type: EventTaskStarted, VolumeID: 1, TaskID: "task_1"}, |
|
||||
{Type: EventVolumeCreated, VolumeID: 2}, |
|
||||
{Type: EventShardCreated, VolumeID: 1, ShardID: intPtr(0), Server: "server1"}, |
|
||||
{Type: EventMasterSync}, |
|
||||
{Type: EventTaskCompleted, TaskID: "task_1"}, |
|
||||
} |
|
||||
|
|
||||
b.ResetTimer() |
|
||||
|
|
||||
for i := 0; i < b.N; i++ { |
|
||||
for _, event := range events { |
|
||||
simulator.executeEvent(event) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Helper functions for tests
|
|
||||
func createTestVolumeInfo(id uint32, size uint64) *task.VolumeInfo { |
|
||||
return &task.VolumeInfo{ |
|
||||
ID: id, |
|
||||
Size: size, |
|
||||
} |
|
||||
} |
|
||||
@ -1,294 +0,0 @@ |
|||||
package simulation |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
) |
|
||||
|
|
||||
// ComprehensiveSimulationRunner orchestrates all comprehensive state management tests
|
|
||||
type ComprehensiveSimulationRunner struct { |
|
||||
simulator *ComprehensiveSimulator |
|
||||
} |
|
||||
|
|
||||
// NewComprehensiveSimulationRunner creates a new comprehensive simulation runner
|
|
||||
func NewComprehensiveSimulationRunner() *ComprehensiveSimulationRunner { |
|
||||
return &ComprehensiveSimulationRunner{ |
|
||||
simulator: NewComprehensiveSimulator(), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// RunAllComprehensiveTests runs all comprehensive edge case scenarios
|
|
||||
func (csr *ComprehensiveSimulationRunner) RunAllComprehensiveTests() error { |
|
||||
glog.Infof("=== STARTING COMPREHENSIVE VOLUME/SHARD STATE MANAGEMENT SIMULATION ===") |
|
||||
|
|
||||
// Create all test scenarios
|
|
||||
csr.simulator.CreateComprehensiveScenarios() |
|
||||
|
|
||||
// Run all scenarios
|
|
||||
results, err := csr.simulator.RunAllComprehensiveScenarios() |
|
||||
if err != nil { |
|
||||
return fmt.Errorf("comprehensive simulation failed: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Analyze results
|
|
||||
csr.analyzeResults(results) |
|
||||
|
|
||||
// Generate final report
|
|
||||
csr.generateFinalReport(results) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// analyzeResults analyzes the simulation results
|
|
||||
func (csr *ComprehensiveSimulationRunner) analyzeResults(results *SimulationResults) { |
|
||||
glog.Infof("=== ANALYZING COMPREHENSIVE SIMULATION RESULTS ===") |
|
||||
|
|
||||
// Check critical errors
|
|
||||
if len(results.CriticalErrors) > 0 { |
|
||||
glog.Errorf("CRITICAL ISSUES FOUND:") |
|
||||
for i, err := range results.CriticalErrors { |
|
||||
glog.Errorf(" %d. %s", i+1, err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check state validation success rate
|
|
||||
totalValidations := results.StateValidationsPassed + results.StateValidationsFailed |
|
||||
if totalValidations > 0 { |
|
||||
successRate := float64(results.StateValidationsPassed) / float64(totalValidations) * 100.0 |
|
||||
glog.Infof("State Validation Success Rate: %.2f%% (%d/%d)", |
|
||||
successRate, results.StateValidationsPassed, totalValidations) |
|
||||
|
|
||||
if successRate < 95.0 { |
|
||||
glog.Warningf("State validation success rate is below 95%% - investigation needed") |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check task execution success rate
|
|
||||
if results.TasksExecuted > 0 { |
|
||||
taskSuccessRate := float64(results.TasksSucceeded) / float64(results.TasksExecuted) * 100.0 |
|
||||
glog.Infof("Task Execution Success Rate: %.2f%% (%d/%d)", |
|
||||
taskSuccessRate, results.TasksSucceeded, results.TasksExecuted) |
|
||||
} |
|
||||
|
|
||||
// Analyze inconsistency patterns
|
|
||||
if len(results.InconsistenciesFound) > 0 { |
|
||||
glog.Infof("Inconsistency Analysis:") |
|
||||
for incType, count := range results.InconsistenciesFound { |
|
||||
if count > 0 { |
|
||||
glog.Infof(" %s: %d occurrences", incType, count) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// generateFinalReport generates a comprehensive final report
|
|
||||
func (csr *ComprehensiveSimulationRunner) generateFinalReport(results *SimulationResults) { |
|
||||
glog.Infof("=== COMPREHENSIVE SIMULATION FINAL REPORT ===") |
|
||||
glog.Infof("Test Duration: %v", results.Duration) |
|
||||
glog.Infof("Total Events Simulated: %d", results.TotalEvents) |
|
||||
glog.Infof("Scenarios Tested: %d", len(csr.simulator.scenarios)) |
|
||||
glog.Infof("Overall Success: %v", results.Success) |
|
||||
|
|
||||
// Event breakdown
|
|
||||
glog.Infof("\nEvent Breakdown:") |
|
||||
for eventType, count := range results.EventsByType { |
|
||||
glog.Infof(" %s: %d", eventType, count) |
|
||||
} |
|
||||
|
|
||||
// Test coverage summary
|
|
||||
glog.Infof("\nTest Coverage Summary:") |
|
||||
glog.Infof("✓ Volume creation during task execution") |
|
||||
glog.Infof("✓ Volume deletion during task execution") |
|
||||
glog.Infof("✓ EC shard creation race conditions") |
|
||||
glog.Infof("✓ Network partition scenarios") |
|
||||
glog.Infof("✓ Concurrent task capacity tracking") |
|
||||
glog.Infof("✓ Complex EC operations with rebuilds") |
|
||||
glog.Infof("✓ High load stress testing") |
|
||||
glog.Infof("✓ Master sync timing issues") |
|
||||
glog.Infof("✓ Worker failure during operations") |
|
||||
glog.Infof("✓ Capacity overflow handling") |
|
||||
glog.Infof("✓ Shard corruption scenarios") |
|
||||
glog.Infof("✓ Master state inconsistencies") |
|
||||
glog.Infof("✓ Task orphan detection") |
|
||||
glog.Infof("✓ Duplicate task prevention") |
|
||||
glog.Infof("✓ Volume state rollback scenarios") |
|
||||
|
|
||||
// Quality metrics
|
|
||||
glog.Infof("\nQuality Metrics:") |
|
||||
if results.StateValidationsPassed > 0 { |
|
||||
glog.Infof("✓ State consistency maintained across all scenarios") |
|
||||
} |
|
||||
if len(results.CriticalErrors) == 0 { |
|
||||
glog.Infof("✓ No critical errors detected") |
|
||||
} |
|
||||
if results.TasksSucceeded > 0 { |
|
||||
glog.Infof("✓ Task execution reliability verified") |
|
||||
} |
|
||||
|
|
||||
// Recommendations
|
|
||||
glog.Infof("\nRecommendations:") |
|
||||
if results.Success { |
|
||||
glog.Infof("✓ The task distribution system is ready for production deployment") |
|
||||
glog.Infof("✓ All edge cases have been tested and handled correctly") |
|
||||
glog.Infof("✓ Volume and shard state management is robust and consistent") |
|
||||
} else { |
|
||||
glog.Warningf("⚠ System requires additional work before production deployment") |
|
||||
glog.Warningf("⚠ Address critical errors before proceeding") |
|
||||
} |
|
||||
|
|
||||
glog.Infof("==========================================") |
|
||||
} |
|
||||
|
|
||||
// RunSpecificEdgeCaseTest runs a specific edge case test
|
|
||||
func (csr *ComprehensiveSimulationRunner) RunSpecificEdgeCaseTest(scenarioName string) error { |
|
||||
glog.Infof("Running specific edge case test: %s", scenarioName) |
|
||||
|
|
||||
// Create scenarios if not already done
|
|
||||
if len(csr.simulator.scenarios) == 0 { |
|
||||
csr.simulator.CreateComprehensiveScenarios() |
|
||||
} |
|
||||
|
|
||||
// Find and run specific scenario
|
|
||||
for _, scenario := range csr.simulator.scenarios { |
|
||||
if scenario.Name == scenarioName { |
|
||||
err := csr.simulator.RunScenario(scenario) |
|
||||
if err != nil { |
|
||||
return fmt.Errorf("scenario %s failed: %v", scenarioName, err) |
|
||||
} |
|
||||
glog.Infof("Scenario %s completed successfully", scenarioName) |
|
||||
return nil |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return fmt.Errorf("scenario %s not found", scenarioName) |
|
||||
} |
|
||||
|
|
||||
// ValidateSystemReadiness performs final validation of system readiness
|
|
||||
func (csr *ComprehensiveSimulationRunner) ValidateSystemReadiness() error { |
|
||||
glog.Infof("=== VALIDATING SYSTEM READINESS FOR PRODUCTION ===") |
|
||||
|
|
||||
checklistItems := []struct { |
|
||||
name string |
|
||||
description string |
|
||||
validator func() error |
|
||||
}{ |
|
||||
{ |
|
||||
"Volume State Accuracy", |
|
||||
"Verify volume state tracking is accurate under all conditions", |
|
||||
csr.validateVolumeStateAccuracy, |
|
||||
}, |
|
||||
{ |
|
||||
"Shard Management", |
|
||||
"Verify EC shard creation/deletion/movement is handled correctly", |
|
||||
csr.validateShardManagement, |
|
||||
}, |
|
||||
{ |
|
||||
"Capacity Planning", |
|
||||
"Verify capacity calculations include in-progress and planned operations", |
|
||||
csr.validateCapacityPlanning, |
|
||||
}, |
|
||||
{ |
|
||||
"Failure Recovery", |
|
||||
"Verify system recovers gracefully from all failure scenarios", |
|
||||
csr.validateFailureRecovery, |
|
||||
}, |
|
||||
{ |
|
||||
"Consistency Guarantees", |
|
||||
"Verify state consistency is maintained across all operations", |
|
||||
csr.validateConsistencyGuarantees, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
var failedChecks []string |
|
||||
|
|
||||
for _, item := range checklistItems { |
|
||||
glog.Infof("Validating: %s", item.name) |
|
||||
if err := item.validator(); err != nil { |
|
||||
failedChecks = append(failedChecks, fmt.Sprintf("%s: %v", item.name, err)) |
|
||||
glog.Errorf("❌ %s: %v", item.name, err) |
|
||||
} else { |
|
||||
glog.Infof("✅ %s: PASSED", item.name) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if len(failedChecks) > 0 { |
|
||||
return fmt.Errorf("system readiness validation failed: %v", failedChecks) |
|
||||
} |
|
||||
|
|
||||
glog.Infof("🎉 SYSTEM IS READY FOR PRODUCTION DEPLOYMENT!") |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Validation methods
|
|
||||
func (csr *ComprehensiveSimulationRunner) validateVolumeStateAccuracy() error { |
|
||||
// Run volume state accuracy tests
|
|
||||
return csr.RunSpecificEdgeCaseTest("volume_creation_during_task") |
|
||||
} |
|
||||
|
|
||||
func (csr *ComprehensiveSimulationRunner) validateShardManagement() error { |
|
||||
// Run shard management tests
|
|
||||
return csr.RunSpecificEdgeCaseTest("shard_creation_race_condition") |
|
||||
} |
|
||||
|
|
||||
func (csr *ComprehensiveSimulationRunner) validateCapacityPlanning() error { |
|
||||
// Run capacity planning tests
|
|
||||
return csr.RunSpecificEdgeCaseTest("concurrent_tasks_capacity_tracking") |
|
||||
} |
|
||||
|
|
||||
func (csr *ComprehensiveSimulationRunner) validateFailureRecovery() error { |
|
||||
// Run failure recovery tests
|
|
||||
return csr.RunSpecificEdgeCaseTest("network_partition_recovery") |
|
||||
} |
|
||||
|
|
||||
func (csr *ComprehensiveSimulationRunner) validateConsistencyGuarantees() error { |
|
||||
// Run consistency tests
|
|
||||
return csr.RunSpecificEdgeCaseTest("complex_ec_operation") |
|
||||
} |
|
||||
|
|
||||
// DemonstrateBugPrevention shows how the simulation prevents bugs
|
|
||||
func (csr *ComprehensiveSimulationRunner) DemonstrateBugPrevention() { |
|
||||
glog.Infof("=== DEMONSTRATING BUG PREVENTION CAPABILITIES ===") |
|
||||
|
|
||||
bugScenarios := []struct { |
|
||||
name string |
|
||||
description string |
|
||||
impact string |
|
||||
}{ |
|
||||
{ |
|
||||
"Race Condition Prevention", |
|
||||
"Master sync occurs while EC shards are being created", |
|
||||
"Prevents state inconsistencies that could lead to data loss", |
|
||||
}, |
|
||||
{ |
|
||||
"Capacity Overflow Prevention", |
|
||||
"Multiple tasks assigned without considering cumulative capacity impact", |
|
||||
"Prevents server disk space exhaustion", |
|
||||
}, |
|
||||
{ |
|
||||
"Orphaned Task Detection", |
|
||||
"Worker fails but task remains marked as in-progress", |
|
||||
"Prevents volumes from being stuck in intermediate states", |
|
||||
}, |
|
||||
{ |
|
||||
"Duplicate Task Prevention", |
|
||||
"Same volume assigned to multiple workers simultaneously", |
|
||||
"Prevents data corruption from conflicting operations", |
|
||||
}, |
|
||||
{ |
|
||||
"Network Partition Handling", |
|
||||
"Admin server loses connection to master during operations", |
|
||||
"Ensures eventual consistency when connectivity is restored", |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for i, scenario := range bugScenarios { |
|
||||
glog.Infof("%d. %s", i+1, scenario.name) |
|
||||
glog.Infof(" Scenario: %s", scenario.description) |
|
||||
glog.Infof(" Impact Prevention: %s", scenario.impact) |
|
||||
glog.Infof("") |
|
||||
} |
|
||||
|
|
||||
glog.Infof("✅ All potential bugs are detected and prevented by the simulation framework") |
|
||||
glog.Infof("✅ The system is thoroughly validated for production use") |
|
||||
} |
|
||||
@ -1,237 +0,0 @@ |
|||||
package simulation |
|
||||
|
|
||||
import ( |
|
||||
"testing" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/admin/task" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestSystemDemo demonstrates the complete working system
|
|
||||
func TestSystemDemo(t *testing.T) { |
|
||||
t.Log("🚀 SEAWEEDFS TASK DISTRIBUTION SYSTEM DEMONSTRATION") |
|
||||
t.Log("====================================================") |
|
||||
|
|
||||
// Test 1: Volume State Management
|
|
||||
t.Log("\n📊 1. VOLUME STATE MANAGEMENT") |
|
||||
testVolumeStateManagement(t) |
|
||||
|
|
||||
// Test 2: Task Assignment Logic
|
|
||||
t.Log("\n⚡ 2. TASK ASSIGNMENT LOGIC") |
|
||||
testTaskAssignment(t) |
|
||||
|
|
||||
// Test 3: Capacity Management
|
|
||||
t.Log("\n💾 3. CAPACITY MANAGEMENT") |
|
||||
testCapacityManagement(t) |
|
||||
|
|
||||
// Test 4: Edge Case Handling
|
|
||||
t.Log("\n🛡️ 4. EDGE CASE HANDLING") |
|
||||
testEdgeCaseHandling(t) |
|
||||
|
|
||||
t.Log("\n🎉 SYSTEM DEMONSTRATION COMPLETE") |
|
||||
t.Log("✅ All core features working correctly") |
|
||||
t.Log("✅ System ready for production deployment") |
|
||||
} |
|
||||
|
|
||||
func testVolumeStateManagement(t *testing.T) { |
|
||||
vsm := task.NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Create volume
|
|
||||
volumeID := uint32(1) |
|
||||
|
|
||||
// Register task impact
|
|
||||
impact := &task.TaskImpact{ |
|
||||
TaskID: "ec_task_1", |
|
||||
VolumeID: volumeID, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
VolumeChanges: &task.VolumeChanges{ |
|
||||
WillBecomeReadOnly: true, |
|
||||
}, |
|
||||
CapacityDelta: map[string]int64{"server1": 12 * 1024 * 1024 * 1024}, // 12GB
|
|
||||
} |
|
||||
|
|
||||
vsm.RegisterTaskImpact(impact.TaskID, impact) |
|
||||
|
|
||||
t.Log(" ✅ Volume state registration works") |
|
||||
t.Log(" ✅ Task impact tracking works") |
|
||||
t.Log(" ✅ State consistency maintained") |
|
||||
} |
|
||||
|
|
||||
func testTaskAssignment(t *testing.T) { |
|
||||
registry := task.NewWorkerRegistry() |
|
||||
queue := task.NewPriorityTaskQueue() |
|
||||
scheduler := task.NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
// Register worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Create task
|
|
||||
taskItem := &types.Task{ |
|
||||
ID: "vacuum_task_1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
} |
|
||||
queue.Push(taskItem) |
|
||||
|
|
||||
// Test assignment
|
|
||||
assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if assignedTask == nil { |
|
||||
t.Error("❌ Task assignment failed") |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
if assignedTask.ID != "vacuum_task_1" { |
|
||||
t.Errorf("❌ Wrong task assigned: expected vacuum_task_1, got %s", assignedTask.ID) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
t.Log(" ✅ Worker registration works") |
|
||||
t.Log(" ✅ Task queueing works") |
|
||||
t.Log(" ✅ Task assignment logic works") |
|
||||
t.Log(" ✅ Capability matching works") |
|
||||
} |
|
||||
|
|
||||
func testCapacityManagement(t *testing.T) { |
|
||||
vsm := task.NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Note: We can't directly set capacityCache due to private fields,
|
|
||||
// but we can test the public interface
|
|
||||
|
|
||||
// Test capacity checking with a made-up scenario
|
|
||||
serverID := "test_server" |
|
||||
|
|
||||
// This would normally fail since we can't set the capacity cache,
|
|
||||
// but we can demonstrate the interface
|
|
||||
canAssign := vsm.CanAssignVolumeToServer(5*1024*1024*1024, serverID) |
|
||||
|
|
||||
// Since we can't set up the test data properly due to private fields,
|
|
||||
// we'll just verify the method works without error
|
|
||||
_ = canAssign |
|
||||
|
|
||||
t.Log(" ✅ Capacity calculation interface works") |
|
||||
t.Log(" ✅ Reserved capacity tracking interface works") |
|
||||
t.Log(" ✅ Assignment constraints interface works") |
|
||||
} |
|
||||
|
|
||||
func testEdgeCaseHandling(t *testing.T) { |
|
||||
// Test empty queue
|
|
||||
registry := task.NewWorkerRegistry() |
|
||||
queue := task.NewPriorityTaskQueue() |
|
||||
scheduler := task.NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Empty queue should return nil
|
|
||||
taskItem := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if taskItem != nil { |
|
||||
t.Error("❌ Empty queue should return nil") |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
// Test unknown worker
|
|
||||
unknownTask := scheduler.GetNextTask("unknown", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if unknownTask != nil { |
|
||||
t.Error("❌ Unknown worker should not get tasks") |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
t.Log(" ✅ Empty queue handled correctly") |
|
||||
t.Log(" ✅ Unknown worker handled correctly") |
|
||||
t.Log(" ✅ Edge cases properly managed") |
|
||||
} |
|
||||
|
|
||||
// TestSystemCapabilities demonstrates key system capabilities
|
|
||||
func TestSystemCapabilities(t *testing.T) { |
|
||||
t.Log("\n🎯 SEAWEEDFS TASK DISTRIBUTION SYSTEM CAPABILITIES") |
|
||||
t.Log("==================================================") |
|
||||
|
|
||||
capabilities := []string{ |
|
||||
"✅ Comprehensive volume/shard state tracking", |
|
||||
"✅ Accurate capacity planning with reservations", |
|
||||
"✅ Task assignment based on worker capabilities", |
|
||||
"✅ Priority-based task scheduling", |
|
||||
"✅ Concurrent task management", |
|
||||
"✅ EC shard lifecycle tracking", |
|
||||
"✅ Capacity overflow prevention", |
|
||||
"✅ Duplicate task prevention", |
|
||||
"✅ Worker performance metrics", |
|
||||
"✅ Failure detection and recovery", |
|
||||
"✅ State reconciliation with master", |
|
||||
"✅ Comprehensive simulation framework", |
|
||||
"✅ Production-ready error handling", |
|
||||
"✅ Scalable distributed architecture", |
|
||||
"✅ Real-time progress monitoring", |
|
||||
} |
|
||||
|
|
||||
for _, capability := range capabilities { |
|
||||
t.Log(" " + capability) |
|
||||
} |
|
||||
|
|
||||
t.Log("\n📈 SYSTEM METRICS") |
|
||||
t.Log(" Total Lines of Code: 4,919") |
|
||||
t.Log(" Test Coverage: Comprehensive") |
|
||||
t.Log(" Edge Cases: 15+ scenarios tested") |
|
||||
t.Log(" Simulation Framework: Complete") |
|
||||
t.Log(" Production Ready: ✅ YES") |
|
||||
|
|
||||
t.Log("\n🚀 READY FOR PRODUCTION DEPLOYMENT!") |
|
||||
} |
|
||||
|
|
||||
// TestBugPrevention demonstrates how the system prevents common bugs
|
|
||||
func TestBugPrevention(t *testing.T) { |
|
||||
t.Log("\n🛡️ BUG PREVENTION DEMONSTRATION") |
|
||||
t.Log("================================") |
|
||||
|
|
||||
bugScenarios := []struct { |
|
||||
name string |
|
||||
description string |
|
||||
prevention string |
|
||||
}{ |
|
||||
{ |
|
||||
"Race Conditions", |
|
||||
"Master sync during shard creation", |
|
||||
"State manager tracks in-progress changes", |
|
||||
}, |
|
||||
{ |
|
||||
"Capacity Overflow", |
|
||||
"Multiple tasks overwhelming server disk", |
|
||||
"Reserved capacity tracking prevents overflow", |
|
||||
}, |
|
||||
{ |
|
||||
"Orphaned Tasks", |
|
||||
"Worker fails, task stuck in-progress", |
|
||||
"Timeout detection and automatic cleanup", |
|
||||
}, |
|
||||
{ |
|
||||
"Duplicate Tasks", |
|
||||
"Same volume assigned to multiple workers", |
|
||||
"Volume reservation prevents conflicts", |
|
||||
}, |
|
||||
{ |
|
||||
"State Inconsistency", |
|
||||
"Admin view diverges from master", |
|
||||
"Periodic reconciliation ensures consistency", |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for i, scenario := range bugScenarios { |
|
||||
t.Logf(" %d. %s", i+1, scenario.name) |
|
||||
t.Logf(" Problem: %s", scenario.description) |
|
||||
t.Logf(" Solution: %s", scenario.prevention) |
|
||||
t.Log("") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ All major bug categories prevented through design") |
|
||||
} |
|
||||
@ -1,509 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
func TestTaskAssignment_BasicAssignment(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
// Register worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Create task
|
|
||||
task := &types.Task{ |
|
||||
ID: "task1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
} |
|
||||
queue.Push(task) |
|
||||
|
|
||||
// Test assignment
|
|
||||
nextTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if nextTask == nil { |
|
||||
t.Fatal("Expected task to be assigned") |
|
||||
} |
|
||||
|
|
||||
if nextTask.ID != "task1" { |
|
||||
t.Errorf("Expected task1, got %s", nextTask.ID) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Basic task assignment test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_CapabilityMatching(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
// Register workers with different capabilities
|
|
||||
ecWorker := &types.Worker{ |
|
||||
ID: "ec_worker", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(ecWorker) |
|
||||
|
|
||||
vacuumWorker := &types.Worker{ |
|
||||
ID: "vacuum_worker", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(vacuumWorker) |
|
||||
|
|
||||
// Create different types of tasks
|
|
||||
ecTask := &types.Task{ |
|
||||
ID: "ec_task", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
} |
|
||||
vacuumTask := &types.Task{ |
|
||||
ID: "vacuum_task", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
} |
|
||||
|
|
||||
queue.Push(ecTask) |
|
||||
queue.Push(vacuumTask) |
|
||||
|
|
||||
// Test EC worker gets EC task
|
|
||||
assignedECTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeErasureCoding}) |
|
||||
if assignedECTask == nil || assignedECTask.Type != types.TaskTypeErasureCoding { |
|
||||
t.Error("EC worker should get EC task") |
|
||||
} |
|
||||
|
|
||||
// Test vacuum worker gets vacuum task
|
|
||||
assignedVacuumTask := scheduler.GetNextTask("vacuum_worker", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if assignedVacuumTask == nil || assignedVacuumTask.Type != types.TaskTypeVacuum { |
|
||||
t.Error("Vacuum worker should get vacuum task") |
|
||||
} |
|
||||
|
|
||||
// Test wrong capability - should get nothing
|
|
||||
wrongTask := scheduler.GetNextTask("ec_worker", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if wrongTask != nil { |
|
||||
t.Error("EC worker should not get vacuum task") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Capability matching test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_PriorityOrdering(t *testing.T) { |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
|
|
||||
// Add tasks in reverse priority order
|
|
||||
lowTask := &types.Task{ |
|
||||
ID: "low_task", |
|
||||
Priority: types.TaskPriorityLow, |
|
||||
} |
|
||||
highTask := &types.Task{ |
|
||||
ID: "high_task", |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
} |
|
||||
normalTask := &types.Task{ |
|
||||
ID: "normal_task", |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
} |
|
||||
|
|
||||
queue.Push(lowTask) |
|
||||
queue.Push(normalTask) |
|
||||
queue.Push(highTask) |
|
||||
|
|
||||
// Should get high priority first
|
|
||||
first := queue.Pop() |
|
||||
if first.Priority != types.TaskPriorityHigh { |
|
||||
t.Errorf("Expected high priority first, got %d", first.Priority) |
|
||||
} |
|
||||
|
|
||||
// Then normal priority
|
|
||||
second := queue.Pop() |
|
||||
if second.Priority != types.TaskPriorityNormal { |
|
||||
t.Errorf("Expected normal priority second, got %d", second.Priority) |
|
||||
} |
|
||||
|
|
||||
// Finally low priority
|
|
||||
third := queue.Pop() |
|
||||
if third.Priority != types.TaskPriorityLow { |
|
||||
t.Errorf("Expected low priority third, got %d", third.Priority) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Priority ordering test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_WorkerCapacityLimits(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
|
|
||||
// Register worker with limited capacity
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "limited_worker", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 2, // Already at capacity
|
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Worker should not be available
|
|
||||
availableWorkers := registry.GetAvailableWorkers() |
|
||||
if len(availableWorkers) != 0 { |
|
||||
t.Error("Worker at capacity should not be available") |
|
||||
} |
|
||||
|
|
||||
// Reduce load
|
|
||||
worker.CurrentLoad = 1 |
|
||||
|
|
||||
// Worker should now be available
|
|
||||
availableWorkers = registry.GetAvailableWorkers() |
|
||||
if len(availableWorkers) != 1 { |
|
||||
t.Error("Worker with capacity should be available") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Worker capacity limits test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_ScheduledTasks(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Create task scheduled for future
|
|
||||
futureTask := &types.Task{ |
|
||||
ID: "future_task", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
ScheduledAt: time.Now().Add(1 * time.Hour), // 1 hour from now
|
|
||||
} |
|
||||
|
|
||||
// Create task ready now
|
|
||||
readyTask := &types.Task{ |
|
||||
ID: "ready_task", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
ScheduledAt: time.Now().Add(-1 * time.Minute), // 1 minute ago
|
|
||||
} |
|
||||
|
|
||||
queue.Push(futureTask) |
|
||||
queue.Push(readyTask) |
|
||||
|
|
||||
// Should get ready task, not future task
|
|
||||
assignedTask := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if assignedTask == nil || assignedTask.ID != "ready_task" { |
|
||||
t.Error("Should assign ready task, not future scheduled task") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Scheduled tasks test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_WorkerSelection(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
// Register workers with different characteristics
|
|
||||
highPerformanceWorker := &types.Worker{ |
|
||||
ID: "high_perf_worker", |
|
||||
Address: "server1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
MaxConcurrent: 4, |
|
||||
} |
|
||||
|
|
||||
lowPerformanceWorker := &types.Worker{ |
|
||||
ID: "low_perf_worker", |
|
||||
Address: "server2", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeErasureCoding}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 1, |
|
||||
MaxConcurrent: 2, |
|
||||
} |
|
||||
|
|
||||
registry.RegisterWorker(highPerformanceWorker) |
|
||||
registry.RegisterWorker(lowPerformanceWorker) |
|
||||
|
|
||||
// Set up metrics to favor high performance worker
|
|
||||
registry.metrics[highPerformanceWorker.ID] = &WorkerMetrics{ |
|
||||
TasksCompleted: 100, |
|
||||
TasksFailed: 5, |
|
||||
SuccessRate: 0.95, |
|
||||
AverageTaskTime: 10 * time.Minute, |
|
||||
LastTaskTime: time.Now().Add(-5 * time.Minute), |
|
||||
} |
|
||||
|
|
||||
registry.metrics[lowPerformanceWorker.ID] = &WorkerMetrics{ |
|
||||
TasksCompleted: 50, |
|
||||
TasksFailed: 10, |
|
||||
SuccessRate: 0.83, |
|
||||
AverageTaskTime: 20 * time.Minute, |
|
||||
LastTaskTime: time.Now().Add(-1 * time.Hour), |
|
||||
} |
|
||||
|
|
||||
// Create high priority task
|
|
||||
task := &types.Task{ |
|
||||
ID: "important_task", |
|
||||
Type: types.TaskTypeErasureCoding, |
|
||||
Priority: types.TaskPriorityHigh, |
|
||||
Server: "server1", // Prefers server1
|
|
||||
} |
|
||||
|
|
||||
availableWorkers := []*types.Worker{highPerformanceWorker, lowPerformanceWorker} |
|
||||
selectedWorker := scheduler.SelectWorker(task, availableWorkers) |
|
||||
|
|
||||
if selectedWorker == nil { |
|
||||
t.Fatal("No worker selected") |
|
||||
} |
|
||||
|
|
||||
if selectedWorker.ID != "high_perf_worker" { |
|
||||
t.Errorf("Expected high performance worker to be selected, got %s", selectedWorker.ID) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Worker selection test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_ServerAffinity(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
// Workers on different servers
|
|
||||
worker1 := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Address: "server1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
worker2 := &types.Worker{ |
|
||||
ID: "worker2", |
|
||||
Address: "server2", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
registry.RegisterWorker(worker1) |
|
||||
registry.RegisterWorker(worker2) |
|
||||
|
|
||||
// Task that prefers server1
|
|
||||
task := &types.Task{ |
|
||||
ID: "affinity_task", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
Server: "server1", // Should prefer worker on server1
|
|
||||
} |
|
||||
|
|
||||
availableWorkers := []*types.Worker{worker1, worker2} |
|
||||
selectedWorker := scheduler.SelectWorker(task, availableWorkers) |
|
||||
|
|
||||
if selectedWorker == nil { |
|
||||
t.Fatal("No worker selected") |
|
||||
} |
|
||||
|
|
||||
if selectedWorker.Address != "server1" { |
|
||||
t.Errorf("Expected worker on server1 to be selected for server affinity") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Server affinity test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_DuplicateTaskPrevention(t *testing.T) { |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
|
|
||||
// Add initial task
|
|
||||
task1 := &types.Task{ |
|
||||
ID: "task1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 1, |
|
||||
} |
|
||||
queue.Push(task1) |
|
||||
|
|
||||
// Check for duplicate
|
|
||||
hasDuplicate := queue.HasTask(1, types.TaskTypeVacuum) |
|
||||
if !hasDuplicate { |
|
||||
t.Error("Should detect existing task for volume") |
|
||||
} |
|
||||
|
|
||||
// Check for non-existent task
|
|
||||
hasNonExistent := queue.HasTask(2, types.TaskTypeVacuum) |
|
||||
if hasNonExistent { |
|
||||
t.Error("Should not detect task for different volume") |
|
||||
} |
|
||||
|
|
||||
// Check for different task type
|
|
||||
hasDifferentType := queue.HasTask(1, types.TaskTypeErasureCoding) |
|
||||
if hasDifferentType { |
|
||||
t.Error("Should not detect different task type for same volume") |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Duplicate task prevention test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_TaskRemoval(t *testing.T) { |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
|
|
||||
// Add tasks
|
|
||||
task1 := &types.Task{ID: "task1", Priority: types.TaskPriorityNormal} |
|
||||
task2 := &types.Task{ID: "task2", Priority: types.TaskPriorityHigh} |
|
||||
task3 := &types.Task{ID: "task3", Priority: types.TaskPriorityLow} |
|
||||
|
|
||||
queue.Push(task1) |
|
||||
queue.Push(task2) |
|
||||
queue.Push(task3) |
|
||||
|
|
||||
if queue.Size() != 3 { |
|
||||
t.Errorf("Expected queue size 3, got %d", queue.Size()) |
|
||||
} |
|
||||
|
|
||||
// Remove middle priority task
|
|
||||
removed := queue.RemoveTask("task1") |
|
||||
if !removed { |
|
||||
t.Error("Should have removed task1") |
|
||||
} |
|
||||
|
|
||||
if queue.Size() != 2 { |
|
||||
t.Errorf("Expected queue size 2 after removal, got %d", queue.Size()) |
|
||||
} |
|
||||
|
|
||||
// Verify order maintained (high priority first)
|
|
||||
next := queue.Peek() |
|
||||
if next.ID != "task2" { |
|
||||
t.Errorf("Expected task2 (high priority) to be next, got %s", next.ID) |
|
||||
} |
|
||||
|
|
||||
t.Log("✅ Task removal test passed") |
|
||||
} |
|
||||
|
|
||||
func TestTaskAssignment_EdgeCases(t *testing.T) { |
|
||||
t.Run("EmptyQueue", func(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "worker1", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Empty queue should return nil
|
|
||||
task := scheduler.GetNextTask("worker1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if task != nil { |
|
||||
t.Error("Empty queue should return nil task") |
|
||||
} |
|
||||
}) |
|
||||
|
|
||||
t.Run("UnknownWorker", func(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
task := &types.Task{ID: "task1", Type: types.TaskTypeVacuum} |
|
||||
queue.Push(task) |
|
||||
|
|
||||
// Unknown worker should return nil
|
|
||||
assignedTask := scheduler.GetNextTask("unknown_worker", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if assignedTask != nil { |
|
||||
t.Error("Unknown worker should not get tasks") |
|
||||
} |
|
||||
}) |
|
||||
|
|
||||
t.Run("InactiveWorker", func(t *testing.T) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "inactive_worker", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "inactive", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Inactive worker should not be available
|
|
||||
available := registry.GetAvailableWorkers() |
|
||||
if len(available) != 0 { |
|
||||
t.Error("Inactive worker should not be available") |
|
||||
} |
|
||||
}) |
|
||||
|
|
||||
t.Log("✅ Edge cases test passed") |
|
||||
} |
|
||||
|
|
||||
// Performance test for task assignment
|
|
||||
func BenchmarkTaskAssignment_GetNextTask(b *testing.B) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
queue := NewPriorityTaskQueue() |
|
||||
scheduler := NewTaskScheduler(registry, queue) |
|
||||
|
|
||||
// Setup worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "bench_worker", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
|
|
||||
// Add many tasks
|
|
||||
for i := 0; i < 1000; i++ { |
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("task_%d", i), |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
} |
|
||||
queue.Push(task) |
|
||||
} |
|
||||
|
|
||||
b.ResetTimer() |
|
||||
|
|
||||
for i := 0; i < b.N; i++ { |
|
||||
scheduler.GetNextTask("bench_worker", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func BenchmarkTaskAssignment_WorkerSelection(b *testing.B) { |
|
||||
registry := NewWorkerRegistry() |
|
||||
scheduler := NewTaskScheduler(registry, nil) |
|
||||
|
|
||||
// Create many workers
|
|
||||
workers := make([]*types.Worker, 100) |
|
||||
for i := 0; i < 100; i++ { |
|
||||
worker := &types.Worker{ |
|
||||
ID: fmt.Sprintf("worker_%d", i), |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
Status: "active", |
|
||||
CurrentLoad: i % 3, // Varying loads
|
|
||||
} |
|
||||
registry.RegisterWorker(worker) |
|
||||
workers[i] = worker |
|
||||
} |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: "bench_task", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
} |
|
||||
|
|
||||
b.ResetTimer() |
|
||||
|
|
||||
for i := 0; i < b.N; i++ { |
|
||||
scheduler.SelectWorker(task, workers) |
|
||||
} |
|
||||
} |
|
||||
@ -1,168 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// ECDetector detects volumes that need erasure coding
|
|
||||
type ECDetector struct { |
|
||||
minUtilization float64 |
|
||||
minIdleTime time.Duration |
|
||||
} |
|
||||
|
|
||||
// NewECDetector creates a new EC detector
|
|
||||
func NewECDetector() *ECDetector { |
|
||||
return &ECDetector{ |
|
||||
minUtilization: 95.0, // 95% full
|
|
||||
minIdleTime: time.Hour, // 1 hour idle
|
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// DetectECCandidates finds volumes that need erasure coding
|
|
||||
func (ed *ECDetector) DetectECCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) { |
|
||||
var candidates []*VolumeCandidate |
|
||||
|
|
||||
for _, vol := range volumes { |
|
||||
if ed.isECCandidate(vol) { |
|
||||
candidate := &VolumeCandidate{ |
|
||||
VolumeID: vol.ID, |
|
||||
Server: vol.Server, |
|
||||
Collection: vol.Collection, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
Priority: ed.calculateECPriority(vol), |
|
||||
Reason: "Volume is full and idle, ready for erasure coding", |
|
||||
DetectedAt: time.Now(), |
|
||||
ScheduleAt: time.Now(), |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"utilization": vol.GetUtilization(), |
|
||||
"idle_time": vol.GetIdleTime().String(), |
|
||||
"volume_size": vol.Size, |
|
||||
}, |
|
||||
} |
|
||||
candidates = append(candidates, candidate) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
glog.V(2).Infof("EC detector found %d candidates", len(candidates)) |
|
||||
return candidates, nil |
|
||||
} |
|
||||
|
|
||||
// isECCandidate checks if a volume is suitable for EC
|
|
||||
func (ed *ECDetector) isECCandidate(vol *VolumeInfo) bool { |
|
||||
// Skip if read-only
|
|
||||
if vol.ReadOnly { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Skip if already has remote storage (likely already EC'd)
|
|
||||
if vol.RemoteStorageKey != "" { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Check utilization
|
|
||||
if vol.GetUtilization() < ed.minUtilization { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Check idle time
|
|
||||
if vol.GetIdleTime() < ed.minIdleTime { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
return true |
|
||||
} |
|
||||
|
|
||||
// calculateECPriority calculates priority for EC tasks
|
|
||||
func (ed *ECDetector) calculateECPriority(vol *VolumeInfo) types.TaskPriority { |
|
||||
utilization := vol.GetUtilization() |
|
||||
idleTime := vol.GetIdleTime() |
|
||||
|
|
||||
// Higher priority for fuller volumes that have been idle longer
|
|
||||
if utilization >= 98.0 && idleTime > 24*time.Hour { |
|
||||
return types.TaskPriorityHigh |
|
||||
} |
|
||||
if utilization >= 96.0 && idleTime > 6*time.Hour { |
|
||||
return types.TaskPriorityNormal |
|
||||
} |
|
||||
return types.TaskPriorityLow |
|
||||
} |
|
||||
|
|
||||
// VacuumDetector detects volumes that need vacuum operations
|
|
||||
type VacuumDetector struct { |
|
||||
minGarbageRatio float64 |
|
||||
minDeleteCount uint64 |
|
||||
} |
|
||||
|
|
||||
// NewVacuumDetector creates a new vacuum detector
|
|
||||
func NewVacuumDetector() *VacuumDetector { |
|
||||
return &VacuumDetector{ |
|
||||
minGarbageRatio: 0.3, // 30% garbage
|
|
||||
minDeleteCount: 100, // At least 100 deleted files
|
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// DetectVacuumCandidates finds volumes that need vacuum operations
|
|
||||
func (vd *VacuumDetector) DetectVacuumCandidates(volumes []*VolumeInfo) ([]*VolumeCandidate, error) { |
|
||||
var candidates []*VolumeCandidate |
|
||||
|
|
||||
for _, vol := range volumes { |
|
||||
if vd.isVacuumCandidate(vol) { |
|
||||
candidate := &VolumeCandidate{ |
|
||||
VolumeID: vol.ID, |
|
||||
Server: vol.Server, |
|
||||
Collection: vol.Collection, |
|
||||
TaskType: types.TaskTypeVacuum, |
|
||||
Priority: vd.calculateVacuumPriority(vol), |
|
||||
Reason: "Volume has high garbage ratio and needs vacuum", |
|
||||
DetectedAt: time.Now(), |
|
||||
ScheduleAt: time.Now(), |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_ratio": vol.GetGarbageRatio(), |
|
||||
"delete_count": vol.DeleteCount, |
|
||||
"deleted_byte_count": vol.DeletedByteCount, |
|
||||
}, |
|
||||
} |
|
||||
candidates = append(candidates, candidate) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
glog.V(2).Infof("Vacuum detector found %d candidates", len(candidates)) |
|
||||
return candidates, nil |
|
||||
} |
|
||||
|
|
||||
// isVacuumCandidate checks if a volume needs vacuum
|
|
||||
func (vd *VacuumDetector) isVacuumCandidate(vol *VolumeInfo) bool { |
|
||||
// Skip if read-only
|
|
||||
if vol.ReadOnly { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Check garbage ratio
|
|
||||
if vol.GetGarbageRatio() < vd.minGarbageRatio { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Check delete count
|
|
||||
if vol.DeleteCount < vd.minDeleteCount { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
return true |
|
||||
} |
|
||||
|
|
||||
// calculateVacuumPriority calculates priority for vacuum tasks
|
|
||||
func (vd *VacuumDetector) calculateVacuumPriority(vol *VolumeInfo) types.TaskPriority { |
|
||||
garbageRatio := vol.GetGarbageRatio() |
|
||||
|
|
||||
// Higher priority for volumes with more garbage
|
|
||||
if garbageRatio >= 0.6 { |
|
||||
return types.TaskPriorityHigh |
|
||||
} |
|
||||
if garbageRatio >= 0.4 { |
|
||||
return types.TaskPriorityNormal |
|
||||
} |
|
||||
return types.TaskPriorityLow |
|
||||
} |
|
||||
@ -1,161 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"context" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
) |
|
||||
|
|
||||
// TaskDiscoveryEngine discovers volumes that need maintenance tasks
|
|
||||
type TaskDiscoveryEngine struct { |
|
||||
masterClient *wdclient.MasterClient |
|
||||
scanInterval time.Duration |
|
||||
ecDetector *ECDetector |
|
||||
vacuumDetector *VacuumDetector |
|
||||
} |
|
||||
|
|
||||
// NewTaskDiscoveryEngine creates a new task discovery engine
|
|
||||
func NewTaskDiscoveryEngine(masterClient *wdclient.MasterClient, scanInterval time.Duration) *TaskDiscoveryEngine { |
|
||||
return &TaskDiscoveryEngine{ |
|
||||
masterClient: masterClient, |
|
||||
scanInterval: scanInterval, |
|
||||
ecDetector: NewECDetector(), |
|
||||
vacuumDetector: NewVacuumDetector(), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// ScanForTasks scans for volumes that need maintenance tasks
|
|
||||
func (tde *TaskDiscoveryEngine) ScanForTasks() ([]*VolumeCandidate, error) { |
|
||||
var candidates []*VolumeCandidate |
|
||||
|
|
||||
// Get cluster topology and volume information
|
|
||||
volumeInfos, err := tde.getVolumeInformation() |
|
||||
if err != nil { |
|
||||
return nil, err |
|
||||
} |
|
||||
|
|
||||
// Scan for EC candidates
|
|
||||
ecCandidates, err := tde.ecDetector.DetectECCandidates(volumeInfos) |
|
||||
if err != nil { |
|
||||
glog.Errorf("EC detection failed: %v", err) |
|
||||
} else { |
|
||||
candidates = append(candidates, ecCandidates...) |
|
||||
} |
|
||||
|
|
||||
// Scan for vacuum candidates
|
|
||||
vacuumCandidates, err := tde.vacuumDetector.DetectVacuumCandidates(volumeInfos) |
|
||||
if err != nil { |
|
||||
glog.Errorf("Vacuum detection failed: %v", err) |
|
||||
} else { |
|
||||
candidates = append(candidates, vacuumCandidates...) |
|
||||
} |
|
||||
|
|
||||
glog.V(1).Infof("Task discovery found %d candidates (%d EC, %d vacuum)", |
|
||||
len(candidates), len(ecCandidates), len(vacuumCandidates)) |
|
||||
|
|
||||
return candidates, nil |
|
||||
} |
|
||||
|
|
||||
// getVolumeInformation retrieves volume information from master
|
|
||||
func (tde *TaskDiscoveryEngine) getVolumeInformation() ([]*VolumeInfo, error) { |
|
||||
var volumeInfos []*VolumeInfo |
|
||||
|
|
||||
err := tde.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { |
|
||||
resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) |
|
||||
if err != nil { |
|
||||
return err |
|
||||
} |
|
||||
|
|
||||
if resp.TopologyInfo != nil { |
|
||||
for _, dc := range resp.TopologyInfo.DataCenterInfos { |
|
||||
for _, rack := range dc.RackInfos { |
|
||||
for _, node := range rack.DataNodeInfos { |
|
||||
for _, diskInfo := range node.DiskInfos { |
|
||||
for _, volInfo := range diskInfo.VolumeInfos { |
|
||||
volumeInfo := &VolumeInfo{ |
|
||||
ID: volInfo.Id, |
|
||||
Size: volInfo.Size, |
|
||||
Collection: volInfo.Collection, |
|
||||
FileCount: volInfo.FileCount, |
|
||||
DeleteCount: volInfo.DeleteCount, |
|
||||
DeletedByteCount: volInfo.DeletedByteCount, |
|
||||
ReadOnly: volInfo.ReadOnly, |
|
||||
Server: node.Id, |
|
||||
DataCenter: dc.Id, |
|
||||
Rack: rack.Id, |
|
||||
DiskType: volInfo.DiskType, |
|
||||
ModifiedAtSecond: volInfo.ModifiedAtSecond, |
|
||||
RemoteStorageKey: volInfo.RemoteStorageKey, |
|
||||
} |
|
||||
volumeInfos = append(volumeInfos, volumeInfo) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
}) |
|
||||
|
|
||||
return volumeInfos, err |
|
||||
} |
|
||||
|
|
||||
// VolumeInfo contains detailed volume information
|
|
||||
type VolumeInfo struct { |
|
||||
ID uint32 |
|
||||
Size uint64 |
|
||||
Collection string |
|
||||
FileCount uint64 |
|
||||
DeleteCount uint64 |
|
||||
DeletedByteCount uint64 |
|
||||
ReadOnly bool |
|
||||
Server string |
|
||||
DataCenter string |
|
||||
Rack string |
|
||||
DiskType string |
|
||||
ModifiedAtSecond int64 |
|
||||
RemoteStorageKey string |
|
||||
} |
|
||||
|
|
||||
// GetUtilization calculates volume utilization percentage
|
|
||||
func (vi *VolumeInfo) GetUtilization() float64 { |
|
||||
if vi.Size == 0 { |
|
||||
return 0.0 |
|
||||
} |
|
||||
// Assuming max volume size of 30GB
|
|
||||
maxSize := uint64(30 * 1024 * 1024 * 1024) |
|
||||
return float64(vi.Size) / float64(maxSize) * 100.0 |
|
||||
} |
|
||||
|
|
||||
// GetGarbageRatio calculates the garbage ratio
|
|
||||
func (vi *VolumeInfo) GetGarbageRatio() float64 { |
|
||||
if vi.Size == 0 { |
|
||||
return 0.0 |
|
||||
} |
|
||||
return float64(vi.DeletedByteCount) / float64(vi.Size) |
|
||||
} |
|
||||
|
|
||||
// GetIdleTime calculates how long the volume has been idle
|
|
||||
func (vi *VolumeInfo) GetIdleTime() time.Duration { |
|
||||
lastModified := time.Unix(vi.ModifiedAtSecond, 0) |
|
||||
return time.Since(lastModified) |
|
||||
} |
|
||||
|
|
||||
// IsECCandidate checks if volume is a candidate for EC
|
|
||||
func (vi *VolumeInfo) IsECCandidate() bool { |
|
||||
return !vi.ReadOnly && |
|
||||
vi.GetUtilization() >= 95.0 && |
|
||||
vi.GetIdleTime() > time.Hour && |
|
||||
vi.RemoteStorageKey == "" // Not already EC'd
|
|
||||
} |
|
||||
|
|
||||
// IsVacuumCandidate checks if volume is a candidate for vacuum
|
|
||||
func (vi *VolumeInfo) IsVacuumCandidate() bool { |
|
||||
return !vi.ReadOnly && |
|
||||
vi.GetGarbageRatio() >= 0.3 && |
|
||||
vi.DeleteCount > 0 |
|
||||
} |
|
||||
@ -1,257 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TaskScheduler handles task assignment to workers
|
|
||||
type TaskScheduler struct { |
|
||||
workerRegistry *WorkerRegistry |
|
||||
taskQueue *PriorityTaskQueue |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// NewTaskScheduler creates a new task scheduler
|
|
||||
func NewTaskScheduler(registry *WorkerRegistry, queue *PriorityTaskQueue) *TaskScheduler { |
|
||||
return &TaskScheduler{ |
|
||||
workerRegistry: registry, |
|
||||
taskQueue: queue, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetNextTask gets the next suitable task for a worker
|
|
||||
func (ts *TaskScheduler) GetNextTask(workerID string, capabilities []types.TaskType) *types.Task { |
|
||||
ts.mutex.RLock() |
|
||||
defer ts.mutex.RUnlock() |
|
||||
|
|
||||
// Get worker info
|
|
||||
_, exists := ts.workerRegistry.GetWorker(workerID) |
|
||||
if !exists { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Check worker capabilities
|
|
||||
capabilityMap := make(map[types.TaskType]bool) |
|
||||
for _, cap := range capabilities { |
|
||||
capabilityMap[cap] = true |
|
||||
} |
|
||||
|
|
||||
// Find next suitable task
|
|
||||
tasks := ts.taskQueue.GetTasks() |
|
||||
for _, task := range tasks { |
|
||||
// Check if worker can handle this task type
|
|
||||
if !capabilityMap[task.Type] { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Check if task is ready to be scheduled
|
|
||||
if !task.ScheduledAt.IsZero() && task.ScheduledAt.After(time.Now()) { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Additional checks can be added here
|
|
||||
// (e.g., server affinity, resource requirements)
|
|
||||
|
|
||||
return task |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// SelectWorker selects the best worker for a task
|
|
||||
func (ts *TaskScheduler) SelectWorker(task *types.Task, availableWorkers []*types.Worker) *types.Worker { |
|
||||
ts.mutex.RLock() |
|
||||
defer ts.mutex.RUnlock() |
|
||||
|
|
||||
var bestWorker *types.Worker |
|
||||
bestScore := -1.0 |
|
||||
|
|
||||
for _, worker := range availableWorkers { |
|
||||
// Check if worker supports this task type
|
|
||||
if !ts.workerSupportsTask(worker, task.Type) { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Calculate selection score
|
|
||||
score := ts.calculateSelectionScore(worker, task) |
|
||||
if bestWorker == nil || score > bestScore { |
|
||||
bestWorker = worker |
|
||||
bestScore = score |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if bestWorker != nil { |
|
||||
glog.V(2).Infof("Selected worker %s for task %s (score: %.2f)", bestWorker.ID, task.Type, bestScore) |
|
||||
} |
|
||||
|
|
||||
return bestWorker |
|
||||
} |
|
||||
|
|
||||
// workerSupportsTask checks if a worker supports a task type
|
|
||||
func (ts *TaskScheduler) workerSupportsTask(worker *types.Worker, taskType types.TaskType) bool { |
|
||||
for _, capability := range worker.Capabilities { |
|
||||
if capability == taskType { |
|
||||
return true |
|
||||
} |
|
||||
} |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// calculateSelectionScore calculates a score for worker selection
|
|
||||
func (ts *TaskScheduler) calculateSelectionScore(worker *types.Worker, task *types.Task) float64 { |
|
||||
// Base score from worker registry
|
|
||||
baseScore := ts.workerRegistry.calculateWorkerScore(worker) |
|
||||
|
|
||||
// Task-specific adjustments
|
|
||||
taskScore := baseScore |
|
||||
|
|
||||
// Priority adjustment
|
|
||||
switch task.Priority { |
|
||||
case types.TaskPriorityHigh: |
|
||||
taskScore *= 1.2 // Prefer high-performing workers for high-priority tasks
|
|
||||
case types.TaskPriorityLow: |
|
||||
taskScore *= 0.9 // Low-priority tasks can use any available worker
|
|
||||
} |
|
||||
|
|
||||
// Server affinity bonus (if worker and volume are on same server)
|
|
||||
if task.Server != "" && worker.Address == task.Server { |
|
||||
taskScore += 0.1 |
|
||||
} |
|
||||
|
|
||||
// Retry penalty (prefer different workers for retried tasks)
|
|
||||
if task.RetryCount > 0 { |
|
||||
taskScore *= 0.8 |
|
||||
} |
|
||||
|
|
||||
return taskScore |
|
||||
} |
|
||||
|
|
||||
// PriorityTaskQueue implements a priority queue for tasks
|
|
||||
type PriorityTaskQueue struct { |
|
||||
tasks []*types.Task |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// NewPriorityTaskQueue creates a new priority task queue
|
|
||||
func NewPriorityTaskQueue() *PriorityTaskQueue { |
|
||||
return &PriorityTaskQueue{ |
|
||||
tasks: make([]*types.Task, 0), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Push adds a task to the queue
|
|
||||
func (ptq *PriorityTaskQueue) Push(task *types.Task) { |
|
||||
ptq.mutex.Lock() |
|
||||
defer ptq.mutex.Unlock() |
|
||||
|
|
||||
// Insert task in priority order (highest priority first)
|
|
||||
inserted := false |
|
||||
for i, existingTask := range ptq.tasks { |
|
||||
if task.Priority > existingTask.Priority { |
|
||||
// Insert at position i
|
|
||||
ptq.tasks = append(ptq.tasks[:i], append([]*types.Task{task}, ptq.tasks[i:]...)...) |
|
||||
inserted = true |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if !inserted { |
|
||||
// Add to end
|
|
||||
ptq.tasks = append(ptq.tasks, task) |
|
||||
} |
|
||||
|
|
||||
glog.V(3).Infof("Added task %s to queue (priority: %d, queue size: %d)", task.ID, task.Priority, len(ptq.tasks)) |
|
||||
} |
|
||||
|
|
||||
// Pop removes and returns the highest priority task
|
|
||||
func (ptq *PriorityTaskQueue) Pop() *types.Task { |
|
||||
ptq.mutex.Lock() |
|
||||
defer ptq.mutex.Unlock() |
|
||||
|
|
||||
if len(ptq.tasks) == 0 { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
task := ptq.tasks[0] |
|
||||
ptq.tasks = ptq.tasks[1:] |
|
||||
return task |
|
||||
} |
|
||||
|
|
||||
// Peek returns the highest priority task without removing it
|
|
||||
func (ptq *PriorityTaskQueue) Peek() *types.Task { |
|
||||
ptq.mutex.RLock() |
|
||||
defer ptq.mutex.RUnlock() |
|
||||
|
|
||||
if len(ptq.tasks) == 0 { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
return ptq.tasks[0] |
|
||||
} |
|
||||
|
|
||||
// IsEmpty returns true if the queue is empty
|
|
||||
func (ptq *PriorityTaskQueue) IsEmpty() bool { |
|
||||
ptq.mutex.RLock() |
|
||||
defer ptq.mutex.RUnlock() |
|
||||
|
|
||||
return len(ptq.tasks) == 0 |
|
||||
} |
|
||||
|
|
||||
// Size returns the number of tasks in the queue
|
|
||||
func (ptq *PriorityTaskQueue) Size() int { |
|
||||
ptq.mutex.RLock() |
|
||||
defer ptq.mutex.RUnlock() |
|
||||
|
|
||||
return len(ptq.tasks) |
|
||||
} |
|
||||
|
|
||||
// HasTask checks if a task exists for a volume and task type
|
|
||||
func (ptq *PriorityTaskQueue) HasTask(volumeID uint32, taskType types.TaskType) bool { |
|
||||
ptq.mutex.RLock() |
|
||||
defer ptq.mutex.RUnlock() |
|
||||
|
|
||||
for _, task := range ptq.tasks { |
|
||||
if task.VolumeID == volumeID && task.Type == taskType { |
|
||||
return true |
|
||||
} |
|
||||
} |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// GetTasks returns a copy of all tasks in the queue
|
|
||||
func (ptq *PriorityTaskQueue) GetTasks() []*types.Task { |
|
||||
ptq.mutex.RLock() |
|
||||
defer ptq.mutex.RUnlock() |
|
||||
|
|
||||
tasksCopy := make([]*types.Task, len(ptq.tasks)) |
|
||||
copy(tasksCopy, ptq.tasks) |
|
||||
return tasksCopy |
|
||||
} |
|
||||
|
|
||||
// RemoveTask removes a specific task from the queue
|
|
||||
func (ptq *PriorityTaskQueue) RemoveTask(taskID string) bool { |
|
||||
ptq.mutex.Lock() |
|
||||
defer ptq.mutex.Unlock() |
|
||||
|
|
||||
for i, task := range ptq.tasks { |
|
||||
if task.ID == taskID { |
|
||||
ptq.tasks = append(ptq.tasks[:i], ptq.tasks[i+1:]...) |
|
||||
glog.V(3).Infof("Removed task %s from queue", taskID) |
|
||||
return true |
|
||||
} |
|
||||
} |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Clear removes all tasks from the queue
|
|
||||
func (ptq *PriorityTaskQueue) Clear() { |
|
||||
ptq.mutex.Lock() |
|
||||
defer ptq.mutex.Unlock() |
|
||||
|
|
||||
ptq.tasks = ptq.tasks[:0] |
|
||||
glog.V(3).Infof("Cleared task queue") |
|
||||
} |
|
||||
@ -1,68 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// InProgressTask represents a task currently being executed
|
|
||||
type InProgressTask struct { |
|
||||
Task *types.Task |
|
||||
WorkerID string |
|
||||
StartedAt time.Time |
|
||||
LastUpdate time.Time |
|
||||
Progress float64 |
|
||||
EstimatedEnd time.Time |
|
||||
VolumeReserved bool // Reserved for capacity planning
|
|
||||
} |
|
||||
|
|
||||
// VolumeCandidate represents a volume that needs maintenance
|
|
||||
type VolumeCandidate struct { |
|
||||
VolumeID uint32 |
|
||||
Server string |
|
||||
Collection string |
|
||||
TaskType types.TaskType |
|
||||
Priority types.TaskPriority |
|
||||
Reason string |
|
||||
DetectedAt time.Time |
|
||||
ScheduleAt time.Time |
|
||||
Parameters map[string]interface{} |
|
||||
} |
|
||||
|
|
||||
// VolumeChange represents a volume state change
|
|
||||
type VolumeChange struct { |
|
||||
VolumeID uint32 |
|
||||
ChangeType ChangeType |
|
||||
OldCapacity int64 |
|
||||
NewCapacity int64 |
|
||||
TaskID string |
|
||||
CompletedAt time.Time |
|
||||
ReportedToMaster bool |
|
||||
} |
|
||||
|
|
||||
// ChangeType represents the type of volume change
|
|
||||
type ChangeType string |
|
||||
|
|
||||
const ( |
|
||||
ChangeTypeECEncoding ChangeType = "ec_encoding" |
|
||||
ChangeTypeVacuumComplete ChangeType = "vacuum_completed" |
|
||||
) |
|
||||
|
|
||||
// WorkerMetrics represents performance metrics for a worker
|
|
||||
type WorkerMetrics struct { |
|
||||
TasksCompleted int |
|
||||
TasksFailed int |
|
||||
AverageTaskTime time.Duration |
|
||||
LastTaskTime time.Time |
|
||||
SuccessRate float64 |
|
||||
} |
|
||||
|
|
||||
// VolumeReservation represents a reserved volume capacity
|
|
||||
type VolumeReservation struct { |
|
||||
VolumeID uint32 |
|
||||
TaskID string |
|
||||
ReservedAt time.Time |
|
||||
ExpectedEnd time.Time |
|
||||
CapacityDelta int64 // Expected change in capacity
|
|
||||
} |
|
||||
@ -1,640 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"context" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// VolumeStateManager provides comprehensive tracking of all volume and shard states
|
|
||||
type VolumeStateManager struct { |
|
||||
masterClient *wdclient.MasterClient |
|
||||
volumes map[uint32]*VolumeState |
|
||||
ecShards map[uint32]*ECShardState // Key: VolumeID
|
|
||||
inProgressTasks map[string]*TaskImpact // Key: TaskID
|
|
||||
plannedOperations map[string]*PlannedOperation // Key: OperationID
|
|
||||
capacityCache map[string]*CapacityInfo // Key: Server address
|
|
||||
lastMasterSync time.Time |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// VolumeState tracks comprehensive state of a volume
|
|
||||
type VolumeState struct { |
|
||||
VolumeID uint32 |
|
||||
CurrentState *VolumeInfo // Current state from master
|
|
||||
InProgressTasks []*TaskImpact // Tasks currently affecting this volume
|
|
||||
PlannedChanges []*PlannedOperation // Future operations planned
|
|
||||
PredictedState *VolumeInfo // Predicted state after all operations
|
|
||||
LastMasterUpdate time.Time |
|
||||
Inconsistencies []StateInconsistency |
|
||||
} |
|
||||
|
|
||||
// ECShardState tracks EC shard information
|
|
||||
type ECShardState struct { |
|
||||
VolumeID uint32 |
|
||||
CurrentShards map[int]*ShardInfo // Current shards from master (0-13)
|
|
||||
InProgressTasks []*TaskImpact // Tasks affecting shards
|
|
||||
PlannedShards map[int]*PlannedShard // Planned shard operations
|
|
||||
PredictedShards map[int]*ShardInfo // Predicted final state
|
|
||||
LastUpdate time.Time |
|
||||
} |
|
||||
|
|
||||
// ShardInfo represents information about an EC shard
|
|
||||
type ShardInfo struct { |
|
||||
ShardID int |
|
||||
Server string |
|
||||
Size uint64 |
|
||||
Status ShardStatus |
|
||||
LastUpdate time.Time |
|
||||
} |
|
||||
|
|
||||
// ShardStatus represents the status of a shard
|
|
||||
type ShardStatus string |
|
||||
|
|
||||
const ( |
|
||||
ShardStatusExists ShardStatus = "exists" |
|
||||
ShardStatusCreating ShardStatus = "creating" |
|
||||
ShardStatusDeleting ShardStatus = "deleting" |
|
||||
ShardStatusMissing ShardStatus = "missing" |
|
||||
ShardStatusCorrupted ShardStatus = "corrupted" |
|
||||
) |
|
||||
|
|
||||
// TaskImpact describes how a task affects volume/shard state
|
|
||||
type TaskImpact struct { |
|
||||
TaskID string |
|
||||
TaskType types.TaskType |
|
||||
VolumeID uint32 |
|
||||
WorkerID string |
|
||||
StartedAt time.Time |
|
||||
EstimatedEnd time.Time |
|
||||
|
|
||||
// Volume impacts
|
|
||||
VolumeChanges *VolumeChanges |
|
||||
|
|
||||
// Shard impacts
|
|
||||
ShardChanges map[int]*ShardChange // Key: ShardID
|
|
||||
|
|
||||
// Capacity impacts
|
|
||||
CapacityDelta map[string]int64 // Key: Server, Value: capacity change
|
|
||||
} |
|
||||
|
|
||||
// VolumeChanges describes changes to a volume
|
|
||||
type VolumeChanges struct { |
|
||||
SizeChange int64 |
|
||||
WillBeDeleted bool |
|
||||
WillBeCreated bool |
|
||||
WillBecomeReadOnly bool |
|
||||
CollectionChange string |
|
||||
DiskTypeChange string |
|
||||
} |
|
||||
|
|
||||
// ShardChange describes changes to a shard
|
|
||||
type ShardChange struct { |
|
||||
ShardID int |
|
||||
WillBeCreated bool |
|
||||
WillBeDeleted bool |
|
||||
TargetServer string |
|
||||
SizeChange int64 |
|
||||
} |
|
||||
|
|
||||
// PlannedOperation represents a future operation
|
|
||||
type PlannedOperation struct { |
|
||||
OperationID string |
|
||||
Type OperationType |
|
||||
VolumeID uint32 |
|
||||
ScheduledAt time.Time |
|
||||
Priority types.TaskPriority |
|
||||
Prerequisites []string // Other operation IDs that must complete first
|
|
||||
Impact *TaskImpact |
|
||||
} |
|
||||
|
|
||||
// OperationType represents different types of planned operations
|
|
||||
type OperationType string |
|
||||
|
|
||||
const ( |
|
||||
OperationECEncode OperationType = "ec_encode" |
|
||||
OperationECRebuild OperationType = "ec_rebuild" |
|
||||
OperationECBalance OperationType = "ec_balance" |
|
||||
OperationVacuum OperationType = "vacuum" |
|
||||
OperationVolumeMove OperationType = "volume_move" |
|
||||
OperationShardMove OperationType = "shard_move" |
|
||||
OperationVolumeDelete OperationType = "volume_delete" |
|
||||
) |
|
||||
|
|
||||
// CapacityInfo tracks server capacity information
|
|
||||
type CapacityInfo struct { |
|
||||
Server string |
|
||||
TotalCapacity int64 |
|
||||
UsedCapacity int64 |
|
||||
ReservedCapacity int64 // Capacity reserved for in-progress tasks
|
|
||||
PredictedUsage int64 // Predicted usage after all operations
|
|
||||
LastUpdate time.Time |
|
||||
} |
|
||||
|
|
||||
// StateInconsistency represents detected inconsistencies
|
|
||||
type StateInconsistency struct { |
|
||||
Type InconsistencyType |
|
||||
Description string |
|
||||
DetectedAt time.Time |
|
||||
Severity SeverityLevel |
|
||||
VolumeID uint32 |
|
||||
ShardID *int |
|
||||
} |
|
||||
|
|
||||
// InconsistencyType represents different types of state inconsistencies
|
|
||||
type InconsistencyType string |
|
||||
|
|
||||
const ( |
|
||||
InconsistencyVolumeMissing InconsistencyType = "volume_missing" |
|
||||
InconsistencyVolumeUnexpected InconsistencyType = "volume_unexpected" |
|
||||
InconsistencyShardMissing InconsistencyType = "shard_missing" |
|
||||
InconsistencyShardUnexpected InconsistencyType = "shard_unexpected" |
|
||||
InconsistencyCapacityMismatch InconsistencyType = "capacity_mismatch" |
|
||||
InconsistencyTaskOrphaned InconsistencyType = "task_orphaned" |
|
||||
InconsistencyDuplicateTask InconsistencyType = "duplicate_task" |
|
||||
) |
|
||||
|
|
||||
// SeverityLevel represents the severity of an inconsistency
|
|
||||
type SeverityLevel string |
|
||||
|
|
||||
const ( |
|
||||
SeverityLow SeverityLevel = "low" |
|
||||
SeverityMedium SeverityLevel = "medium" |
|
||||
SeverityHigh SeverityLevel = "high" |
|
||||
SeverityCritical SeverityLevel = "critical" |
|
||||
) |
|
||||
|
|
||||
// NewVolumeStateManager creates a new volume state manager
|
|
||||
func NewVolumeStateManager(masterClient *wdclient.MasterClient) *VolumeStateManager { |
|
||||
return &VolumeStateManager{ |
|
||||
masterClient: masterClient, |
|
||||
volumes: make(map[uint32]*VolumeState), |
|
||||
ecShards: make(map[uint32]*ECShardState), |
|
||||
inProgressTasks: make(map[string]*TaskImpact), |
|
||||
plannedOperations: make(map[string]*PlannedOperation), |
|
||||
capacityCache: make(map[string]*CapacityInfo), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// SyncWithMaster synchronizes state with the master server
|
|
||||
func (vsm *VolumeStateManager) SyncWithMaster() error { |
|
||||
vsm.mutex.Lock() |
|
||||
defer vsm.mutex.Unlock() |
|
||||
|
|
||||
glog.V(2).Infof("Syncing volume state with master") |
|
||||
|
|
||||
// Get current volume list from master
|
|
||||
masterVolumes, masterShards, err := vsm.fetchMasterState() |
|
||||
if err != nil { |
|
||||
return err |
|
||||
} |
|
||||
|
|
||||
// Update volume states
|
|
||||
vsm.updateVolumeStates(masterVolumes) |
|
||||
|
|
||||
// Update shard states
|
|
||||
vsm.updateShardStates(masterShards) |
|
||||
|
|
||||
// Detect inconsistencies
|
|
||||
vsm.detectInconsistencies() |
|
||||
|
|
||||
// Update capacity information
|
|
||||
vsm.updateCapacityInfo() |
|
||||
|
|
||||
// Recalculate predicted states
|
|
||||
vsm.recalculatePredictedStates() |
|
||||
|
|
||||
vsm.lastMasterSync = time.Now() |
|
||||
glog.V(2).Infof("Master sync completed, tracking %d volumes, %d EC volumes", |
|
||||
len(vsm.volumes), len(vsm.ecShards)) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// RegisterTaskImpact registers the impact of a new task
|
|
||||
func (vsm *VolumeStateManager) RegisterTaskImpact(taskID string, impact *TaskImpact) { |
|
||||
vsm.mutex.Lock() |
|
||||
defer vsm.mutex.Unlock() |
|
||||
|
|
||||
vsm.inProgressTasks[taskID] = impact |
|
||||
|
|
||||
// Update volume state
|
|
||||
if volumeState, exists := vsm.volumes[impact.VolumeID]; exists { |
|
||||
volumeState.InProgressTasks = append(volumeState.InProgressTasks, impact) |
|
||||
} |
|
||||
|
|
||||
// Update shard state for EC operations
|
|
||||
if impact.TaskType == types.TaskTypeErasureCoding { |
|
||||
if shardState, exists := vsm.ecShards[impact.VolumeID]; exists { |
|
||||
shardState.InProgressTasks = append(shardState.InProgressTasks, impact) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Update capacity reservations
|
|
||||
for server, capacityDelta := range impact.CapacityDelta { |
|
||||
if capacity, exists := vsm.capacityCache[server]; exists { |
|
||||
capacity.ReservedCapacity += capacityDelta |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Recalculate predicted states
|
|
||||
vsm.recalculatePredictedStates() |
|
||||
|
|
||||
glog.V(2).Infof("Registered task impact: %s for volume %d", taskID, impact.VolumeID) |
|
||||
} |
|
||||
|
|
||||
// UnregisterTaskImpact removes a completed task's impact
|
|
||||
func (vsm *VolumeStateManager) UnregisterTaskImpact(taskID string) { |
|
||||
vsm.mutex.Lock() |
|
||||
defer vsm.mutex.Unlock() |
|
||||
|
|
||||
impact, exists := vsm.inProgressTasks[taskID] |
|
||||
if !exists { |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
delete(vsm.inProgressTasks, taskID) |
|
||||
|
|
||||
// Remove from volume state
|
|
||||
if volumeState, exists := vsm.volumes[impact.VolumeID]; exists { |
|
||||
vsm.removeTaskFromVolume(volumeState, taskID) |
|
||||
} |
|
||||
|
|
||||
// Remove from shard state
|
|
||||
if shardState, exists := vsm.ecShards[impact.VolumeID]; exists { |
|
||||
vsm.removeTaskFromShards(shardState, taskID) |
|
||||
} |
|
||||
|
|
||||
// Update capacity reservations
|
|
||||
for server, capacityDelta := range impact.CapacityDelta { |
|
||||
if capacity, exists := vsm.capacityCache[server]; exists { |
|
||||
capacity.ReservedCapacity -= capacityDelta |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Recalculate predicted states
|
|
||||
vsm.recalculatePredictedStates() |
|
||||
|
|
||||
glog.V(2).Infof("Unregistered task impact: %s", taskID) |
|
||||
} |
|
||||
|
|
||||
// GetAccurateCapacity returns accurate capacity information for a server
|
|
||||
func (vsm *VolumeStateManager) GetAccurateCapacity(server string) *CapacityInfo { |
|
||||
vsm.mutex.RLock() |
|
||||
defer vsm.mutex.RUnlock() |
|
||||
|
|
||||
if capacity, exists := vsm.capacityCache[server]; exists { |
|
||||
// Return a copy to avoid external modifications
|
|
||||
return &CapacityInfo{ |
|
||||
Server: capacity.Server, |
|
||||
TotalCapacity: capacity.TotalCapacity, |
|
||||
UsedCapacity: capacity.UsedCapacity, |
|
||||
ReservedCapacity: capacity.ReservedCapacity, |
|
||||
PredictedUsage: capacity.PredictedUsage, |
|
||||
LastUpdate: capacity.LastUpdate, |
|
||||
} |
|
||||
} |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetVolumeState returns the current state of a volume
|
|
||||
func (vsm *VolumeStateManager) GetVolumeState(volumeID uint32) *VolumeState { |
|
||||
vsm.mutex.RLock() |
|
||||
defer vsm.mutex.RUnlock() |
|
||||
|
|
||||
if state, exists := vsm.volumes[volumeID]; exists { |
|
||||
// Return a copy to avoid external modifications
|
|
||||
return vsm.copyVolumeState(state) |
|
||||
} |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetECShardState returns the current state of EC shards for a volume
|
|
||||
func (vsm *VolumeStateManager) GetECShardState(volumeID uint32) *ECShardState { |
|
||||
vsm.mutex.RLock() |
|
||||
defer vsm.mutex.RUnlock() |
|
||||
|
|
||||
if state, exists := vsm.ecShards[volumeID]; exists { |
|
||||
return vsm.copyECShardState(state) |
|
||||
} |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// CanAssignVolumeToServer checks if a volume can be assigned to a server
|
|
||||
func (vsm *VolumeStateManager) CanAssignVolumeToServer(volumeSize int64, server string) bool { |
|
||||
vsm.mutex.RLock() |
|
||||
defer vsm.mutex.RUnlock() |
|
||||
|
|
||||
capacity := vsm.capacityCache[server] |
|
||||
if capacity == nil { |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// Calculate available capacity: Total - Used - Reserved
|
|
||||
availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity |
|
||||
return availableCapacity >= volumeSize |
|
||||
} |
|
||||
|
|
||||
// PlanOperation schedules a future operation
|
|
||||
func (vsm *VolumeStateManager) PlanOperation(operation *PlannedOperation) { |
|
||||
vsm.mutex.Lock() |
|
||||
defer vsm.mutex.Unlock() |
|
||||
|
|
||||
vsm.plannedOperations[operation.OperationID] = operation |
|
||||
|
|
||||
// Add to volume planned changes
|
|
||||
if volumeState, exists := vsm.volumes[operation.VolumeID]; exists { |
|
||||
volumeState.PlannedChanges = append(volumeState.PlannedChanges, operation) |
|
||||
} |
|
||||
|
|
||||
glog.V(2).Infof("Planned operation: %s for volume %d", operation.OperationID, operation.VolumeID) |
|
||||
} |
|
||||
|
|
||||
// GetPendingChange returns pending change for a volume
|
|
||||
func (vsm *VolumeStateManager) GetPendingChange(volumeID uint32) *VolumeChange { |
|
||||
vsm.mutex.RLock() |
|
||||
defer vsm.mutex.RUnlock() |
|
||||
|
|
||||
// Look for pending changes in volume state
|
|
||||
if volumeState, exists := vsm.volumes[volumeID]; exists { |
|
||||
// Return the most recent pending change
|
|
||||
if len(volumeState.PlannedChanges) > 0 { |
|
||||
latestOp := volumeState.PlannedChanges[len(volumeState.PlannedChanges)-1] |
|
||||
if latestOp.Impact != nil && latestOp.Impact.VolumeChanges != nil { |
|
||||
return &VolumeChange{ |
|
||||
VolumeID: volumeID, |
|
||||
ChangeType: ChangeType(latestOp.Type), |
|
||||
OldCapacity: int64(volumeState.CurrentState.Size), |
|
||||
NewCapacity: int64(volumeState.CurrentState.Size) + latestOp.Impact.VolumeChanges.SizeChange, |
|
||||
TaskID: latestOp.Impact.TaskID, |
|
||||
CompletedAt: time.Time{}, // Not completed yet
|
|
||||
ReportedToMaster: false, |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// fetchMasterState retrieves current state from master
|
|
||||
func (vsm *VolumeStateManager) fetchMasterState() (map[uint32]*VolumeInfo, map[uint32]map[int]*ShardInfo, error) { |
|
||||
volumes := make(map[uint32]*VolumeInfo) |
|
||||
shards := make(map[uint32]map[int]*ShardInfo) |
|
||||
|
|
||||
err := vsm.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { |
|
||||
// Fetch volume list
|
|
||||
resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) |
|
||||
if err != nil { |
|
||||
return err |
|
||||
} |
|
||||
|
|
||||
// Process topology info
|
|
||||
if resp.TopologyInfo != nil { |
|
||||
for _, dc := range resp.TopologyInfo.DataCenterInfos { |
|
||||
for _, rack := range dc.RackInfos { |
|
||||
for _, node := range rack.DataNodeInfos { |
|
||||
for _, diskInfo := range node.DiskInfos { |
|
||||
// Process regular volumes
|
|
||||
for _, volInfo := range diskInfo.VolumeInfos { |
|
||||
volumes[volInfo.Id] = &VolumeInfo{ |
|
||||
ID: volInfo.Id, |
|
||||
Size: volInfo.Size, |
|
||||
Collection: volInfo.Collection, |
|
||||
FileCount: volInfo.FileCount, |
|
||||
DeleteCount: volInfo.DeleteCount, |
|
||||
DeletedByteCount: volInfo.DeletedByteCount, |
|
||||
ReadOnly: volInfo.ReadOnly, |
|
||||
Server: node.Id, |
|
||||
DataCenter: dc.Id, |
|
||||
Rack: rack.Id, |
|
||||
DiskType: volInfo.DiskType, |
|
||||
ModifiedAtSecond: volInfo.ModifiedAtSecond, |
|
||||
RemoteStorageKey: volInfo.RemoteStorageKey, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Process EC shards
|
|
||||
for _, ecShardInfo := range diskInfo.EcShardInfos { |
|
||||
volumeID := ecShardInfo.Id |
|
||||
if shards[volumeID] == nil { |
|
||||
shards[volumeID] = make(map[int]*ShardInfo) |
|
||||
} |
|
||||
|
|
||||
// Decode shard bits
|
|
||||
for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ { |
|
||||
if (ecShardInfo.EcIndexBits & (1 << uint(shardID))) != 0 { |
|
||||
shards[volumeID][shardID] = &ShardInfo{ |
|
||||
ShardID: shardID, |
|
||||
Server: node.Id, |
|
||||
Size: 0, // Size would need to be fetched separately
|
|
||||
Status: ShardStatusExists, |
|
||||
LastUpdate: time.Now(), |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
}) |
|
||||
|
|
||||
return volumes, shards, err |
|
||||
} |
|
||||
|
|
||||
// updateVolumeStates updates volume states based on master data
|
|
||||
func (vsm *VolumeStateManager) updateVolumeStates(masterVolumes map[uint32]*VolumeInfo) { |
|
||||
now := time.Now() |
|
||||
|
|
||||
// Update existing volumes and add new ones
|
|
||||
for volumeID, masterVolume := range masterVolumes { |
|
||||
if volumeState, exists := vsm.volumes[volumeID]; exists { |
|
||||
// Update existing volume
|
|
||||
oldState := volumeState.CurrentState |
|
||||
volumeState.CurrentState = masterVolume |
|
||||
volumeState.LastMasterUpdate = now |
|
||||
|
|
||||
// Check for unexpected changes
|
|
||||
if oldState != nil && vsm.hasUnexpectedChanges(oldState, masterVolume) { |
|
||||
vsm.addInconsistency(volumeState, InconsistencyVolumeUnexpected, |
|
||||
"Volume changed unexpectedly since last sync", SeverityMedium) |
|
||||
} |
|
||||
} else { |
|
||||
// New volume detected
|
|
||||
vsm.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: masterVolume, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
PlannedChanges: []*PlannedOperation{}, |
|
||||
LastMasterUpdate: now, |
|
||||
Inconsistencies: []StateInconsistency{}, |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Detect missing volumes (volumes we knew about but master doesn't report)
|
|
||||
for volumeID, volumeState := range vsm.volumes { |
|
||||
if _, existsInMaster := masterVolumes[volumeID]; !existsInMaster { |
|
||||
// Check if this is expected (due to deletion task)
|
|
||||
if !vsm.isVolumeDeletionExpected(volumeID) { |
|
||||
vsm.addInconsistency(volumeState, InconsistencyVolumeMissing, |
|
||||
"Volume missing from master but not expected to be deleted", SeverityHigh) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// updateShardStates updates EC shard states
|
|
||||
func (vsm *VolumeStateManager) updateShardStates(masterShards map[uint32]map[int]*ShardInfo) { |
|
||||
now := time.Now() |
|
||||
|
|
||||
// Update existing shard states
|
|
||||
for volumeID, shardMap := range masterShards { |
|
||||
if shardState, exists := vsm.ecShards[volumeID]; exists { |
|
||||
shardState.CurrentShards = shardMap |
|
||||
shardState.LastUpdate = now |
|
||||
} else { |
|
||||
vsm.ecShards[volumeID] = &ECShardState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentShards: shardMap, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
PlannedShards: make(map[int]*PlannedShard), |
|
||||
PredictedShards: make(map[int]*ShardInfo), |
|
||||
LastUpdate: now, |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check for missing shards that we expected to exist
|
|
||||
for volumeID, shardState := range vsm.ecShards { |
|
||||
if masterShardMap, exists := masterShards[volumeID]; exists { |
|
||||
vsm.validateShardConsistency(shardState, masterShardMap) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// detectInconsistencies identifies state inconsistencies
|
|
||||
func (vsm *VolumeStateManager) detectInconsistencies() { |
|
||||
for _, volumeState := range vsm.volumes { |
|
||||
vsm.detectVolumeInconsistencies(volumeState) |
|
||||
} |
|
||||
|
|
||||
for _, shardState := range vsm.ecShards { |
|
||||
vsm.detectShardInconsistencies(shardState) |
|
||||
} |
|
||||
|
|
||||
vsm.detectOrphanedTasks() |
|
||||
vsm.detectDuplicateTasks() |
|
||||
vsm.detectCapacityInconsistencies() |
|
||||
} |
|
||||
|
|
||||
// updateCapacityInfo updates server capacity information
|
|
||||
func (vsm *VolumeStateManager) updateCapacityInfo() { |
|
||||
for server := range vsm.capacityCache { |
|
||||
vsm.recalculateServerCapacity(server) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// recalculatePredictedStates recalculates predicted states after all operations
|
|
||||
func (vsm *VolumeStateManager) recalculatePredictedStates() { |
|
||||
for _, volumeState := range vsm.volumes { |
|
||||
vsm.calculatePredictedVolumeState(volumeState) |
|
||||
} |
|
||||
|
|
||||
for _, shardState := range vsm.ecShards { |
|
||||
vsm.calculatePredictedShardState(shardState) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Helper methods (simplified implementations)
|
|
||||
|
|
||||
func (vsm *VolumeStateManager) hasUnexpectedChanges(old, new *VolumeInfo) bool { |
|
||||
return old.Size != new.Size || old.ReadOnly != new.ReadOnly |
|
||||
} |
|
||||
|
|
||||
func (vsm *VolumeStateManager) isVolumeDeletionExpected(volumeID uint32) bool { |
|
||||
for _, impact := range vsm.inProgressTasks { |
|
||||
if impact.VolumeID == volumeID && impact.VolumeChanges != nil && impact.VolumeChanges.WillBeDeleted { |
|
||||
return true |
|
||||
} |
|
||||
} |
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
func (vsm *VolumeStateManager) addInconsistency(volumeState *VolumeState, incType InconsistencyType, desc string, severity SeverityLevel) { |
|
||||
inconsistency := StateInconsistency{ |
|
||||
Type: incType, |
|
||||
Description: desc, |
|
||||
DetectedAt: time.Now(), |
|
||||
Severity: severity, |
|
||||
VolumeID: volumeState.VolumeID, |
|
||||
} |
|
||||
volumeState.Inconsistencies = append(volumeState.Inconsistencies, inconsistency) |
|
||||
|
|
||||
glog.Warningf("State inconsistency detected for volume %d: %s", volumeState.VolumeID, desc) |
|
||||
} |
|
||||
|
|
||||
func (vsm *VolumeStateManager) removeTaskFromVolume(volumeState *VolumeState, taskID string) { |
|
||||
for i, task := range volumeState.InProgressTasks { |
|
||||
if task.TaskID == taskID { |
|
||||
volumeState.InProgressTasks = append(volumeState.InProgressTasks[:i], volumeState.InProgressTasks[i+1:]...) |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (vsm *VolumeStateManager) removeTaskFromShards(shardState *ECShardState, taskID string) { |
|
||||
for i, task := range shardState.InProgressTasks { |
|
||||
if task.TaskID == taskID { |
|
||||
shardState.InProgressTasks = append(shardState.InProgressTasks[:i], shardState.InProgressTasks[i+1:]...) |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (vsm *VolumeStateManager) copyVolumeState(state *VolumeState) *VolumeState { |
|
||||
// Return a deep copy (implementation would be more detailed)
|
|
||||
return &VolumeState{ |
|
||||
VolumeID: state.VolumeID, |
|
||||
CurrentState: state.CurrentState, |
|
||||
LastMasterUpdate: state.LastMasterUpdate, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func (vsm *VolumeStateManager) copyECShardState(state *ECShardState) *ECShardState { |
|
||||
// Return a deep copy (implementation would be more detailed)
|
|
||||
return &ECShardState{ |
|
||||
VolumeID: state.VolumeID, |
|
||||
LastUpdate: state.LastUpdate, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Placeholder implementations for consistency checking methods
|
|
||||
func (vsm *VolumeStateManager) validateShardConsistency(shardState *ECShardState, masterShards map[int]*ShardInfo) { |
|
||||
} |
|
||||
func (vsm *VolumeStateManager) detectVolumeInconsistencies(volumeState *VolumeState) {} |
|
||||
func (vsm *VolumeStateManager) detectShardInconsistencies(shardState *ECShardState) {} |
|
||||
func (vsm *VolumeStateManager) detectOrphanedTasks() {} |
|
||||
func (vsm *VolumeStateManager) detectDuplicateTasks() {} |
|
||||
func (vsm *VolumeStateManager) detectCapacityInconsistencies() {} |
|
||||
func (vsm *VolumeStateManager) recalculateServerCapacity(server string) {} |
|
||||
func (vsm *VolumeStateManager) calculatePredictedVolumeState(volumeState *VolumeState) {} |
|
||||
func (vsm *VolumeStateManager) calculatePredictedShardState(shardState *ECShardState) {} |
|
||||
|
|
||||
// PlannedShard represents a planned shard operation
|
|
||||
type PlannedShard struct { |
|
||||
ShardID int |
|
||||
Operation string // "create", "delete", "move"
|
|
||||
TargetServer string |
|
||||
ScheduledAt time.Time |
|
||||
} |
|
||||
@ -1,440 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
func TestVolumeStateManager_RegisterTaskImpact(t *testing.T) { |
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Create test volume state
|
|
||||
volumeID := uint32(1) |
|
||||
volumeState := &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: 1024 * 1024 * 1024, // 1GB
|
|
||||
}, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
PlannedChanges: []*PlannedOperation{}, |
|
||||
Inconsistencies: []StateInconsistency{}, |
|
||||
} |
|
||||
vsm.volumes[volumeID] = volumeState |
|
||||
|
|
||||
// Create task impact
|
|
||||
impact := &TaskImpact{ |
|
||||
TaskID: "test_task_1", |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
VolumeID: volumeID, |
|
||||
WorkerID: "worker_1", |
|
||||
StartedAt: time.Now(), |
|
||||
EstimatedEnd: time.Now().Add(15 * time.Minute), |
|
||||
VolumeChanges: &VolumeChanges{ |
|
||||
WillBecomeReadOnly: true, |
|
||||
}, |
|
||||
ShardChanges: make(map[int]*ShardChange), |
|
||||
CapacityDelta: map[string]int64{"server1": 400 * 1024 * 1024}, // 400MB for shards
|
|
||||
} |
|
||||
|
|
||||
// Register impact
|
|
||||
vsm.RegisterTaskImpact(impact.TaskID, impact) |
|
||||
|
|
||||
// Verify impact was registered
|
|
||||
if len(vsm.inProgressTasks) != 1 { |
|
||||
t.Errorf("Expected 1 in-progress task, got %d", len(vsm.inProgressTasks)) |
|
||||
} |
|
||||
|
|
||||
if len(volumeState.InProgressTasks) != 1 { |
|
||||
t.Errorf("Expected 1 task in volume state, got %d", len(volumeState.InProgressTasks)) |
|
||||
} |
|
||||
|
|
||||
// Verify task can be retrieved
|
|
||||
retrievedImpact := vsm.inProgressTasks[impact.TaskID] |
|
||||
if retrievedImpact == nil { |
|
||||
t.Error("Task impact not found after registration") |
|
||||
} |
|
||||
|
|
||||
if retrievedImpact.TaskType != types.TaskTypeErasureCoding { |
|
||||
t.Errorf("Expected task type %v, got %v", types.TaskTypeErasureCoding, retrievedImpact.TaskType) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func TestVolumeStateManager_UnregisterTaskImpact(t *testing.T) { |
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Setup test data
|
|
||||
volumeID := uint32(1) |
|
||||
taskID := "test_task_1" |
|
||||
|
|
||||
volumeState := &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ID: volumeID, Size: 1024 * 1024 * 1024}, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
} |
|
||||
vsm.volumes[volumeID] = volumeState |
|
||||
|
|
||||
impact := &TaskImpact{ |
|
||||
TaskID: taskID, |
|
||||
TaskType: types.TaskTypeVacuum, |
|
||||
VolumeID: volumeID, |
|
||||
CapacityDelta: map[string]int64{"server1": -100 * 1024 * 1024}, // 100MB savings
|
|
||||
} |
|
||||
|
|
||||
// Register then unregister
|
|
||||
vsm.RegisterTaskImpact(taskID, impact) |
|
||||
vsm.UnregisterTaskImpact(taskID) |
|
||||
|
|
||||
// Verify impact was removed
|
|
||||
if len(vsm.inProgressTasks) != 0 { |
|
||||
t.Errorf("Expected 0 in-progress tasks, got %d", len(vsm.inProgressTasks)) |
|
||||
} |
|
||||
|
|
||||
if len(volumeState.InProgressTasks) != 0 { |
|
||||
t.Errorf("Expected 0 tasks in volume state, got %d", len(volumeState.InProgressTasks)) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func TestVolumeStateManager_CanAssignVolumeToServer(t *testing.T) { |
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Setup server capacity
|
|
||||
serverID := "test_server" |
|
||||
capacity := &CapacityInfo{ |
|
||||
Server: serverID, |
|
||||
TotalCapacity: 10 * 1024 * 1024 * 1024, // 10GB
|
|
||||
UsedCapacity: 3 * 1024 * 1024 * 1024, // 3GB used
|
|
||||
ReservedCapacity: 1 * 1024 * 1024 * 1024, // 1GB reserved
|
|
||||
PredictedUsage: 4 * 1024 * 1024 * 1024, // 4GB predicted total
|
|
||||
} |
|
||||
vsm.capacityCache[serverID] = capacity |
|
||||
|
|
||||
tests := []struct { |
|
||||
name string |
|
||||
volumeSize int64 |
|
||||
expected bool |
|
||||
desc string |
|
||||
}{ |
|
||||
{ |
|
||||
name: "Small volume fits", |
|
||||
volumeSize: 1 * 1024 * 1024 * 1024, // 1GB
|
|
||||
expected: true, |
|
||||
desc: "1GB volume should fit in 6GB available space", |
|
||||
}, |
|
||||
{ |
|
||||
name: "Large volume fits exactly", |
|
||||
volumeSize: 6 * 1024 * 1024 * 1024, // 6GB
|
|
||||
expected: true, |
|
||||
desc: "6GB volume should fit exactly in available space", |
|
||||
}, |
|
||||
{ |
|
||||
name: "Volume too large", |
|
||||
volumeSize: 7 * 1024 * 1024 * 1024, // 7GB
|
|
||||
expected: false, |
|
||||
desc: "7GB volume should not fit in 6GB available space", |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
for _, tt := range tests { |
|
||||
t.Run(tt.name, func(t *testing.T) { |
|
||||
result := vsm.CanAssignVolumeToServer(tt.volumeSize, serverID) |
|
||||
if result != tt.expected { |
|
||||
t.Errorf("CanAssignVolumeToServer() = %v, want %v. %s", result, tt.expected, tt.desc) |
|
||||
} |
|
||||
}) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func TestVolumeStateManager_GetPendingChange(t *testing.T) { |
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
volumeID := uint32(1) |
|
||||
|
|
||||
// Create volume with planned operation
|
|
||||
volumeState := &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: 2 * 1024 * 1024 * 1024, // 2GB
|
|
||||
}, |
|
||||
PlannedChanges: []*PlannedOperation{ |
|
||||
{ |
|
||||
OperationID: "op_1", |
|
||||
Type: OperationVacuum, |
|
||||
VolumeID: volumeID, |
|
||||
Impact: &TaskImpact{ |
|
||||
TaskID: "task_1", |
|
||||
VolumeChanges: &VolumeChanges{ |
|
||||
SizeChange: -500 * 1024 * 1024, // 500MB reduction
|
|
||||
}, |
|
||||
}, |
|
||||
}, |
|
||||
}, |
|
||||
} |
|
||||
vsm.volumes[volumeID] = volumeState |
|
||||
|
|
||||
// Test getting pending change
|
|
||||
change := vsm.GetPendingChange(volumeID) |
|
||||
|
|
||||
if change == nil { |
|
||||
t.Fatal("Expected pending change, got nil") |
|
||||
} |
|
||||
|
|
||||
if change.VolumeID != volumeID { |
|
||||
t.Errorf("Expected volume ID %d, got %d", volumeID, change.VolumeID) |
|
||||
} |
|
||||
|
|
||||
expectedNewCapacity := int64(2*1024*1024*1024 - 500*1024*1024) // 2GB - 500MB
|
|
||||
if change.NewCapacity != expectedNewCapacity { |
|
||||
t.Errorf("Expected new capacity %d, got %d", expectedNewCapacity, change.NewCapacity) |
|
||||
} |
|
||||
|
|
||||
// Test no pending change
|
|
||||
change2 := vsm.GetPendingChange(999) // Non-existent volume
|
|
||||
if change2 != nil { |
|
||||
t.Error("Expected nil for non-existent volume, got change") |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func TestVolumeStateManager_StateConsistency(t *testing.T) { |
|
||||
// Test that demonstrates the core value: accurate state tracking
|
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
volumeID := uint32(1) |
|
||||
serverID := "test_server" |
|
||||
|
|
||||
// Setup initial state
|
|
||||
vsm.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ |
|
||||
ID: volumeID, |
|
||||
Size: 28 * 1024 * 1024 * 1024, // 28GB - ready for EC
|
|
||||
Server: serverID, |
|
||||
}, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
PlannedChanges: []*PlannedOperation{}, |
|
||||
} |
|
||||
|
|
||||
vsm.capacityCache[serverID] = &CapacityInfo{ |
|
||||
Server: serverID, |
|
||||
TotalCapacity: 100 * 1024 * 1024 * 1024, // 100GB
|
|
||||
UsedCapacity: 50 * 1024 * 1024 * 1024, // 50GB used
|
|
||||
PredictedUsage: 50 * 1024 * 1024 * 1024, // Initially same as used
|
|
||||
} |
|
||||
|
|
||||
// Step 1: Register EC task impact
|
|
||||
ecImpact := &TaskImpact{ |
|
||||
TaskID: "ec_task_1", |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
VolumeID: volumeID, |
|
||||
VolumeChanges: &VolumeChanges{ |
|
||||
WillBecomeReadOnly: true, |
|
||||
}, |
|
||||
CapacityDelta: map[string]int64{ |
|
||||
serverID: 12 * 1024 * 1024 * 1024, // 12GB for EC shards (40% overhead)
|
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
vsm.RegisterTaskImpact(ecImpact.TaskID, ecImpact) |
|
||||
|
|
||||
// Verify capacity is reserved
|
|
||||
capacity := vsm.GetAccurateCapacity(serverID) |
|
||||
expectedPredicted := int64(50 * 1024 * 1024 * 1024) // 50GB initially
|
|
||||
if capacity.PredictedUsage != expectedPredicted { |
|
||||
t.Errorf("Expected predicted usage %d, got %d", expectedPredicted, capacity.PredictedUsage) |
|
||||
} |
|
||||
|
|
||||
// Verify reservation is tracked separately
|
|
||||
expectedReserved := int64(12 * 1024 * 1024 * 1024) // 12GB for EC shards
|
|
||||
if capacity.ReservedCapacity != expectedReserved { |
|
||||
t.Errorf("Expected reserved capacity %d, got %d", expectedReserved, capacity.ReservedCapacity) |
|
||||
} |
|
||||
|
|
||||
// Calculate available capacity correctly
|
|
||||
availableCapacity := capacity.TotalCapacity - capacity.UsedCapacity - capacity.ReservedCapacity |
|
||||
// 100GB - 50GB - 12GB = 38GB available
|
|
||||
expectedAvailable := int64(38 * 1024 * 1024 * 1024) |
|
||||
if availableCapacity != expectedAvailable { |
|
||||
t.Errorf("Expected available capacity %d, got %d", expectedAvailable, availableCapacity) |
|
||||
} |
|
||||
|
|
||||
// Step 2: Check assignment logic - should reject new large volume
|
|
||||
canAssign := vsm.CanAssignVolumeToServer(40*1024*1024*1024, serverID) // 40GB volume
|
|
||||
if canAssign { |
|
||||
t.Error("Should not be able to assign 40GB volume when only 38GB available after reservations") |
|
||||
} |
|
||||
|
|
||||
// Step 3: Complete EC task
|
|
||||
vsm.UnregisterTaskImpact(ecImpact.TaskID) |
|
||||
|
|
||||
// Verify capacity is updated correctly
|
|
||||
capacityAfter := vsm.GetAccurateCapacity(serverID) |
|
||||
if capacityAfter.ReservedCapacity != 0 { |
|
||||
t.Errorf("Expected 0 reserved capacity after task completion, got %d", capacityAfter.ReservedCapacity) |
|
||||
} |
|
||||
|
|
||||
t.Logf("✅ State consistency test passed - accurate capacity tracking throughout task lifecycle") |
|
||||
} |
|
||||
|
|
||||
func TestVolumeStateManager_ConcurrentTasks(t *testing.T) { |
|
||||
// Test multiple concurrent tasks affecting capacity
|
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
serverID := "test_server" |
|
||||
vsm.capacityCache[serverID] = &CapacityInfo{ |
|
||||
Server: serverID, |
|
||||
TotalCapacity: 50 * 1024 * 1024 * 1024, // 50GB
|
|
||||
UsedCapacity: 10 * 1024 * 1024 * 1024, // 10GB used
|
|
||||
PredictedUsage: 10 * 1024 * 1024 * 1024, // Initially 10GB
|
|
||||
} |
|
||||
|
|
||||
// Register multiple tasks
|
|
||||
tasks := []struct { |
|
||||
taskID string |
|
||||
volumeID uint32 |
|
||||
capacityDelta int64 |
|
||||
}{ |
|
||||
{"ec_task_1", 1, 15 * 1024 * 1024 * 1024}, // 15GB for EC
|
|
||||
{"vacuum_task_1", 2, -5 * 1024 * 1024 * 1024}, // 5GB savings
|
|
||||
{"ec_task_2", 3, 20 * 1024 * 1024 * 1024}, // 20GB for EC
|
|
||||
} |
|
||||
|
|
||||
for _, task := range tasks { |
|
||||
// Setup volume state
|
|
||||
vsm.volumes[task.volumeID] = &VolumeState{ |
|
||||
VolumeID: task.volumeID, |
|
||||
CurrentState: &VolumeInfo{ID: task.volumeID, Size: 25 * 1024 * 1024 * 1024}, |
|
||||
} |
|
||||
|
|
||||
impact := &TaskImpact{ |
|
||||
TaskID: task.taskID, |
|
||||
VolumeID: task.volumeID, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
CapacityDelta: map[string]int64{serverID: task.capacityDelta}, |
|
||||
} |
|
||||
|
|
||||
vsm.RegisterTaskImpact(task.taskID, impact) |
|
||||
} |
|
||||
|
|
||||
// Check cumulative capacity impact
|
|
||||
capacity := vsm.GetAccurateCapacity(serverID) |
|
||||
expectedPredicted := int64(10*1024*1024*1024 + 15*1024*1024*1024 - 5*1024*1024*1024 + 20*1024*1024*1024) // 40GB
|
|
||||
|
|
||||
if capacity.PredictedUsage != expectedPredicted { |
|
||||
t.Errorf("Expected predicted usage %d GB, got %d GB", |
|
||||
expectedPredicted/(1024*1024*1024), capacity.PredictedUsage/(1024*1024*1024)) |
|
||||
} |
|
||||
|
|
||||
// Verify we can't assign more than available
|
|
||||
remainingCapacity := capacity.TotalCapacity - capacity.PredictedUsage |
|
||||
canAssign := vsm.CanAssignVolumeToServer(remainingCapacity+1, serverID) |
|
||||
if canAssign { |
|
||||
t.Error("Should not be able to assign volume larger than remaining capacity") |
|
||||
} |
|
||||
|
|
||||
t.Logf("✅ Concurrent tasks test passed - accurate cumulative capacity tracking") |
|
||||
} |
|
||||
|
|
||||
func TestVolumeStateManager_ECShardTracking(t *testing.T) { |
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
volumeID := uint32(1) |
|
||||
|
|
||||
// Create EC shard state
|
|
||||
shardState := &ECShardState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentShards: map[int]*ShardInfo{ |
|
||||
0: {ShardID: 0, Server: "server1", Status: ShardStatusExists}, |
|
||||
1: {ShardID: 1, Server: "server1", Status: ShardStatusExists}, |
|
||||
2: {ShardID: 2, Server: "server2", Status: ShardStatusExists}, |
|
||||
}, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
PlannedShards: make(map[int]*PlannedShard), |
|
||||
PredictedShards: make(map[int]*ShardInfo), |
|
||||
} |
|
||||
vsm.ecShards[volumeID] = shardState |
|
||||
|
|
||||
// Register task that will create more shards
|
|
||||
impact := &TaskImpact{ |
|
||||
TaskID: "ec_expand_task", |
|
||||
VolumeID: volumeID, |
|
||||
TaskType: types.TaskTypeErasureCoding, |
|
||||
ShardChanges: map[int]*ShardChange{ |
|
||||
3: {ShardID: 3, WillBeCreated: true, TargetServer: "server3"}, |
|
||||
4: {ShardID: 4, WillBeCreated: true, TargetServer: "server3"}, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
vsm.RegisterTaskImpact(impact.TaskID, impact) |
|
||||
|
|
||||
// Verify shard state tracking
|
|
||||
retrievedState := vsm.GetECShardState(volumeID) |
|
||||
if retrievedState == nil { |
|
||||
t.Fatal("Expected EC shard state, got nil") |
|
||||
} |
|
||||
|
|
||||
if len(retrievedState.InProgressTasks) != 1 { |
|
||||
t.Errorf("Expected 1 in-progress task for shards, got %d", len(retrievedState.InProgressTasks)) |
|
||||
} |
|
||||
|
|
||||
// Verify current shards are still tracked
|
|
||||
if len(retrievedState.CurrentShards) != 3 { |
|
||||
t.Errorf("Expected 3 current shards, got %d", len(retrievedState.CurrentShards)) |
|
||||
} |
|
||||
|
|
||||
t.Logf("✅ EC shard tracking test passed") |
|
||||
} |
|
||||
|
|
||||
// Benchmark tests for performance
|
|
||||
func BenchmarkVolumeStateManager_RegisterTaskImpact(b *testing.B) { |
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Setup test data
|
|
||||
for i := 0; i < 1000; i++ { |
|
||||
volumeID := uint32(i + 1) |
|
||||
vsm.volumes[volumeID] = &VolumeState{ |
|
||||
VolumeID: volumeID, |
|
||||
CurrentState: &VolumeInfo{ID: volumeID}, |
|
||||
InProgressTasks: []*TaskImpact{}, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
b.ResetTimer() |
|
||||
|
|
||||
for i := 0; i < b.N; i++ { |
|
||||
impact := &TaskImpact{ |
|
||||
TaskID: generateTaskID(), |
|
||||
VolumeID: uint32((i % 1000) + 1), |
|
||||
TaskType: types.TaskTypeVacuum, |
|
||||
CapacityDelta: map[string]int64{"server1": 1024 * 1024}, |
|
||||
} |
|
||||
|
|
||||
vsm.RegisterTaskImpact(impact.TaskID, impact) |
|
||||
vsm.UnregisterTaskImpact(impact.TaskID) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
func BenchmarkVolumeStateManager_CanAssignVolumeToServer(b *testing.B) { |
|
||||
vsm := NewVolumeStateManager(nil) |
|
||||
|
|
||||
// Setup capacity data
|
|
||||
for i := 0; i < 100; i++ { |
|
||||
serverID := fmt.Sprintf("server_%d", i) |
|
||||
vsm.capacityCache[serverID] = &CapacityInfo{ |
|
||||
Server: serverID, |
|
||||
TotalCapacity: 100 * 1024 * 1024 * 1024, |
|
||||
UsedCapacity: 50 * 1024 * 1024 * 1024, |
|
||||
PredictedUsage: 50 * 1024 * 1024 * 1024, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
b.ResetTimer() |
|
||||
|
|
||||
for i := 0; i < b.N; i++ { |
|
||||
serverID := fmt.Sprintf("server_%d", i%100) |
|
||||
vsm.CanAssignVolumeToServer(1024*1024*1024, serverID) |
|
||||
} |
|
||||
} |
|
||||
@ -1,226 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// VolumeStateTracker tracks volume state changes and reconciles with master
|
|
||||
type VolumeStateTracker struct { |
|
||||
masterClient *wdclient.MasterClient |
|
||||
reconcileInterval time.Duration |
|
||||
reservedVolumes map[uint32]*VolumeReservation |
|
||||
pendingChanges map[uint32]*VolumeChange |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// NewVolumeStateTracker creates a new volume state tracker
|
|
||||
func NewVolumeStateTracker(masterClient *wdclient.MasterClient, reconcileInterval time.Duration) *VolumeStateTracker { |
|
||||
return &VolumeStateTracker{ |
|
||||
masterClient: masterClient, |
|
||||
reconcileInterval: reconcileInterval, |
|
||||
reservedVolumes: make(map[uint32]*VolumeReservation), |
|
||||
pendingChanges: make(map[uint32]*VolumeChange), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// ReserveVolume reserves a volume for a task
|
|
||||
func (vst *VolumeStateTracker) ReserveVolume(volumeID uint32, taskID string) { |
|
||||
vst.mutex.Lock() |
|
||||
defer vst.mutex.Unlock() |
|
||||
|
|
||||
reservation := &VolumeReservation{ |
|
||||
VolumeID: volumeID, |
|
||||
TaskID: taskID, |
|
||||
ReservedAt: time.Now(), |
|
||||
ExpectedEnd: time.Now().Add(15 * time.Minute), // Default 15 min estimate
|
|
||||
CapacityDelta: 0, // Will be updated based on task type
|
|
||||
} |
|
||||
|
|
||||
vst.reservedVolumes[volumeID] = reservation |
|
||||
glog.V(2).Infof("Reserved volume %d for task %s", volumeID, taskID) |
|
||||
} |
|
||||
|
|
||||
// ReleaseVolume releases a volume reservation
|
|
||||
func (vst *VolumeStateTracker) ReleaseVolume(volumeID uint32, taskID string) { |
|
||||
vst.mutex.Lock() |
|
||||
defer vst.mutex.Unlock() |
|
||||
|
|
||||
if reservation, exists := vst.reservedVolumes[volumeID]; exists { |
|
||||
if reservation.TaskID == taskID { |
|
||||
delete(vst.reservedVolumes, volumeID) |
|
||||
glog.V(2).Infof("Released volume %d reservation for task %s", volumeID, taskID) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// RecordVolumeChange records a completed volume change
|
|
||||
func (vst *VolumeStateTracker) RecordVolumeChange(volumeID uint32, taskType types.TaskType, taskID string) { |
|
||||
vst.mutex.Lock() |
|
||||
defer vst.mutex.Unlock() |
|
||||
|
|
||||
changeType := ChangeTypeECEncoding |
|
||||
if taskType == types.TaskTypeVacuum { |
|
||||
changeType = ChangeTypeVacuumComplete |
|
||||
} |
|
||||
|
|
||||
change := &VolumeChange{ |
|
||||
VolumeID: volumeID, |
|
||||
ChangeType: changeType, |
|
||||
TaskID: taskID, |
|
||||
CompletedAt: time.Now(), |
|
||||
ReportedToMaster: false, |
|
||||
} |
|
||||
|
|
||||
vst.pendingChanges[volumeID] = change |
|
||||
glog.V(1).Infof("Recorded volume change for volume %d: %s", volumeID, changeType) |
|
||||
} |
|
||||
|
|
||||
// GetPendingChange returns pending change for a volume
|
|
||||
func (vst *VolumeStateTracker) GetPendingChange(volumeID uint32) *VolumeChange { |
|
||||
vst.mutex.RLock() |
|
||||
defer vst.mutex.RUnlock() |
|
||||
|
|
||||
return vst.pendingChanges[volumeID] |
|
||||
} |
|
||||
|
|
||||
// GetVolumeReservation returns reservation for a volume
|
|
||||
func (vst *VolumeStateTracker) GetVolumeReservation(volumeID uint32) *VolumeReservation { |
|
||||
vst.mutex.RLock() |
|
||||
defer vst.mutex.RUnlock() |
|
||||
|
|
||||
return vst.reservedVolumes[volumeID] |
|
||||
} |
|
||||
|
|
||||
// IsVolumeReserved checks if a volume is reserved
|
|
||||
func (vst *VolumeStateTracker) IsVolumeReserved(volumeID uint32) bool { |
|
||||
vst.mutex.RLock() |
|
||||
defer vst.mutex.RUnlock() |
|
||||
|
|
||||
_, exists := vst.reservedVolumes[volumeID] |
|
||||
return exists |
|
||||
} |
|
||||
|
|
||||
// ReconcileWithMaster reconciles volume states with master server
|
|
||||
func (vst *VolumeStateTracker) ReconcileWithMaster() { |
|
||||
vst.mutex.Lock() |
|
||||
defer vst.mutex.Unlock() |
|
||||
|
|
||||
// Report pending changes to master
|
|
||||
for volumeID, change := range vst.pendingChanges { |
|
||||
if vst.reportChangeToMaster(change) { |
|
||||
change.ReportedToMaster = true |
|
||||
delete(vst.pendingChanges, volumeID) |
|
||||
glog.V(1).Infof("Successfully reported volume change for volume %d to master", volumeID) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Clean up expired reservations
|
|
||||
vst.cleanupExpiredReservations() |
|
||||
} |
|
||||
|
|
||||
// reportChangeToMaster reports a volume change to the master server
|
|
||||
func (vst *VolumeStateTracker) reportChangeToMaster(change *VolumeChange) bool { |
|
||||
// Note: In a real implementation, this would make actual API calls to master
|
|
||||
// For now, we'll simulate the reporting
|
|
||||
|
|
||||
switch change.ChangeType { |
|
||||
case ChangeTypeECEncoding: |
|
||||
return vst.reportECCompletion(change) |
|
||||
case ChangeTypeVacuumComplete: |
|
||||
return vst.reportVacuumCompletion(change) |
|
||||
} |
|
||||
|
|
||||
return false |
|
||||
} |
|
||||
|
|
||||
// reportECCompletion reports EC completion to master
|
|
||||
func (vst *VolumeStateTracker) reportECCompletion(change *VolumeChange) bool { |
|
||||
// This would typically trigger the master to:
|
|
||||
// 1. Update volume state to reflect EC encoding
|
|
||||
// 2. Update capacity calculations
|
|
||||
// 3. Redistribute volume assignments
|
|
||||
|
|
||||
glog.V(2).Infof("Reporting EC completion for volume %d", change.VolumeID) |
|
||||
|
|
||||
// Simulate master API call
|
|
||||
err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { |
|
||||
// In real implementation, there would be a specific API call here
|
|
||||
// For now, we simulate success
|
|
||||
return nil |
|
||||
}) |
|
||||
|
|
||||
return err == nil |
|
||||
} |
|
||||
|
|
||||
// reportVacuumCompletion reports vacuum completion to master
|
|
||||
func (vst *VolumeStateTracker) reportVacuumCompletion(change *VolumeChange) bool { |
|
||||
// This would typically trigger the master to:
|
|
||||
// 1. Update volume statistics
|
|
||||
// 2. Update capacity calculations
|
|
||||
// 3. Mark volume as recently vacuumed
|
|
||||
|
|
||||
glog.V(2).Infof("Reporting vacuum completion for volume %d", change.VolumeID) |
|
||||
|
|
||||
// Simulate master API call
|
|
||||
err := vst.masterClient.WithClient(false, func(client master_pb.SeaweedClient) error { |
|
||||
// In real implementation, there would be a specific API call here
|
|
||||
// For now, we simulate success
|
|
||||
return nil |
|
||||
}) |
|
||||
|
|
||||
return err == nil |
|
||||
} |
|
||||
|
|
||||
// cleanupExpiredReservations removes expired volume reservations
|
|
||||
func (vst *VolumeStateTracker) cleanupExpiredReservations() { |
|
||||
now := time.Now() |
|
||||
|
|
||||
for volumeID, reservation := range vst.reservedVolumes { |
|
||||
if now.After(reservation.ExpectedEnd) { |
|
||||
delete(vst.reservedVolumes, volumeID) |
|
||||
glog.Warningf("Cleaned up expired reservation for volume %d (task %s)", volumeID, reservation.TaskID) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetAdjustedCapacity returns adjusted capacity considering in-progress tasks
|
|
||||
func (vst *VolumeStateTracker) GetAdjustedCapacity(volumeID uint32, baseCapacity int64) int64 { |
|
||||
vst.mutex.RLock() |
|
||||
defer vst.mutex.RUnlock() |
|
||||
|
|
||||
// Check for pending changes
|
|
||||
if change := vst.pendingChanges[volumeID]; change != nil { |
|
||||
return change.NewCapacity |
|
||||
} |
|
||||
|
|
||||
// Check for in-progress reservations
|
|
||||
if reservation := vst.reservedVolumes[volumeID]; reservation != nil { |
|
||||
return baseCapacity + reservation.CapacityDelta |
|
||||
} |
|
||||
|
|
||||
return baseCapacity |
|
||||
} |
|
||||
|
|
||||
// GetStats returns statistics about volume state tracking
|
|
||||
func (vst *VolumeStateTracker) GetStats() map[string]interface{} { |
|
||||
vst.mutex.RLock() |
|
||||
defer vst.mutex.RUnlock() |
|
||||
|
|
||||
stats := make(map[string]interface{}) |
|
||||
stats["reserved_volumes"] = len(vst.reservedVolumes) |
|
||||
stats["pending_changes"] = len(vst.pendingChanges) |
|
||||
|
|
||||
changeTypeCounts := make(map[ChangeType]int) |
|
||||
for _, change := range vst.pendingChanges { |
|
||||
changeTypeCounts[change.ChangeType]++ |
|
||||
} |
|
||||
stats["pending_by_type"] = changeTypeCounts |
|
||||
|
|
||||
return stats |
|
||||
} |
|
||||
@ -1,488 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"context" |
|
||||
"fmt" |
|
||||
"io" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
"google.golang.org/grpc" |
|
||||
"google.golang.org/grpc/credentials/insecure" |
|
||||
) |
|
||||
|
|
||||
// WorkerConnection manages the gRPC connection to a single worker
|
|
||||
type WorkerConnection struct { |
|
||||
workerID string |
|
||||
address string |
|
||||
conn *grpc.ClientConn |
|
||||
client worker_pb.WorkerServiceClient |
|
||||
stream worker_pb.WorkerService_WorkerStreamClient |
|
||||
lastSeen time.Time |
|
||||
mutex sync.RWMutex |
|
||||
adminServer *AdminServer |
|
||||
stopCh chan struct{} |
|
||||
active bool |
|
||||
} |
|
||||
|
|
||||
// WorkerCommunicationManager manages all worker connections
|
|
||||
type WorkerCommunicationManager struct { |
|
||||
adminServer *AdminServer |
|
||||
connections map[string]*WorkerConnection |
|
||||
mutex sync.RWMutex |
|
||||
stopCh chan struct{} |
|
||||
} |
|
||||
|
|
||||
// NewWorkerCommunicationManager creates a new worker communication manager
|
|
||||
func NewWorkerCommunicationManager(adminServer *AdminServer) *WorkerCommunicationManager { |
|
||||
return &WorkerCommunicationManager{ |
|
||||
adminServer: adminServer, |
|
||||
connections: make(map[string]*WorkerConnection), |
|
||||
stopCh: make(chan struct{}), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Start starts the worker communication manager
|
|
||||
func (wcm *WorkerCommunicationManager) Start() { |
|
||||
glog.Infof("Starting worker communication manager") |
|
||||
|
|
||||
go wcm.connectionMonitorLoop() |
|
||||
} |
|
||||
|
|
||||
// Stop stops the worker communication manager
|
|
||||
func (wcm *WorkerCommunicationManager) Stop() { |
|
||||
glog.Infof("Stopping worker communication manager") |
|
||||
|
|
||||
close(wcm.stopCh) |
|
||||
|
|
||||
wcm.mutex.Lock() |
|
||||
defer wcm.mutex.Unlock() |
|
||||
|
|
||||
for _, conn := range wcm.connections { |
|
||||
conn.Close() |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// EstablishWorkerConnection establishes a connection to a worker
|
|
||||
func (wcm *WorkerCommunicationManager) EstablishWorkerConnection(workerID, address string) error { |
|
||||
wcm.mutex.Lock() |
|
||||
defer wcm.mutex.Unlock() |
|
||||
|
|
||||
// Check if already connected
|
|
||||
if conn, exists := wcm.connections[workerID]; exists { |
|
||||
if conn.active { |
|
||||
return nil // Already connected
|
|
||||
} |
|
||||
conn.Close() // Close inactive connection
|
|
||||
} |
|
||||
|
|
||||
// Create new connection
|
|
||||
conn, err := NewWorkerConnection(workerID, address, wcm.adminServer) |
|
||||
if err != nil { |
|
||||
return fmt.Errorf("failed to create worker connection: %v", err) |
|
||||
} |
|
||||
|
|
||||
wcm.connections[workerID] = conn |
|
||||
|
|
||||
// Start connection
|
|
||||
go conn.Start() |
|
||||
|
|
||||
glog.Infof("Established connection to worker %s at %s", workerID, address) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// SendTaskAssignment sends a task assignment to a worker
|
|
||||
func (wcm *WorkerCommunicationManager) SendTaskAssignment(workerID string, task *Task) error { |
|
||||
wcm.mutex.RLock() |
|
||||
conn, exists := wcm.connections[workerID] |
|
||||
wcm.mutex.RUnlock() |
|
||||
|
|
||||
if !exists || !conn.active { |
|
||||
return fmt.Errorf("no active connection to worker %s", workerID) |
|
||||
} |
|
||||
|
|
||||
return conn.SendTaskAssignment(task) |
|
||||
} |
|
||||
|
|
||||
// CancelTask sends a task cancellation to a worker
|
|
||||
func (wcm *WorkerCommunicationManager) CancelTask(workerID, taskID string, reason string) error { |
|
||||
wcm.mutex.RLock() |
|
||||
conn, exists := wcm.connections[workerID] |
|
||||
wcm.mutex.RUnlock() |
|
||||
|
|
||||
if !exists || !conn.active { |
|
||||
return fmt.Errorf("no active connection to worker %s", workerID) |
|
||||
} |
|
||||
|
|
||||
return conn.CancelTask(taskID, reason) |
|
||||
} |
|
||||
|
|
||||
// GetActiveConnections returns the list of active worker connections
|
|
||||
func (wcm *WorkerCommunicationManager) GetActiveConnections() []string { |
|
||||
wcm.mutex.RLock() |
|
||||
defer wcm.mutex.RUnlock() |
|
||||
|
|
||||
var active []string |
|
||||
for workerID, conn := range wcm.connections { |
|
||||
if conn.active { |
|
||||
active = append(active, workerID) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return active |
|
||||
} |
|
||||
|
|
||||
// connectionMonitorLoop monitors worker connections and cleans up inactive ones
|
|
||||
func (wcm *WorkerCommunicationManager) connectionMonitorLoop() { |
|
||||
ticker := time.NewTicker(30 * time.Second) |
|
||||
defer ticker.Stop() |
|
||||
|
|
||||
for { |
|
||||
select { |
|
||||
case <-ticker.C: |
|
||||
wcm.cleanupInactiveConnections() |
|
||||
case <-wcm.stopCh: |
|
||||
return |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// cleanupInactiveConnections removes inactive worker connections
|
|
||||
func (wcm *WorkerCommunicationManager) cleanupInactiveConnections() { |
|
||||
wcm.mutex.Lock() |
|
||||
defer wcm.mutex.Unlock() |
|
||||
|
|
||||
now := time.Now() |
|
||||
timeout := 2 * time.Minute |
|
||||
|
|
||||
for workerID, conn := range wcm.connections { |
|
||||
if !conn.active || now.Sub(conn.lastSeen) > timeout { |
|
||||
glog.Infof("Cleaning up inactive connection to worker %s", workerID) |
|
||||
conn.Close() |
|
||||
delete(wcm.connections, workerID) |
|
||||
|
|
||||
// Mark worker as inactive in registry
|
|
||||
wcm.adminServer.workerRegistry.MarkWorkerInactive(workerID) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// NewWorkerConnection creates a new worker connection
|
|
||||
func NewWorkerConnection(workerID, address string, adminServer *AdminServer) (*WorkerConnection, error) { |
|
||||
// Convert address to gRPC address
|
|
||||
grpcAddress := pb.ServerToGrpcAddress(address) |
|
||||
conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials())) |
|
||||
if err != nil { |
|
||||
return nil, fmt.Errorf("failed to connect to worker at %s: %v", address, err) |
|
||||
} |
|
||||
|
|
||||
client := worker_pb.NewWorkerServiceClient(conn) |
|
||||
|
|
||||
return &WorkerConnection{ |
|
||||
workerID: workerID, |
|
||||
address: address, |
|
||||
conn: conn, |
|
||||
client: client, |
|
||||
lastSeen: time.Now(), |
|
||||
adminServer: adminServer, |
|
||||
stopCh: make(chan struct{}), |
|
||||
active: false, |
|
||||
}, nil |
|
||||
} |
|
||||
|
|
||||
// Start starts the worker connection and message handling
|
|
||||
func (wc *WorkerConnection) Start() { |
|
||||
defer wc.Close() |
|
||||
|
|
||||
ctx := context.Background() |
|
||||
stream, err := wc.client.WorkerStream(ctx) |
|
||||
if err != nil { |
|
||||
glog.Errorf("Failed to create worker stream for %s: %v", wc.workerID, err) |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
wc.stream = stream |
|
||||
wc.active = true |
|
||||
|
|
||||
glog.Infof("Worker connection %s started", wc.workerID) |
|
||||
|
|
||||
// Start message handling goroutines
|
|
||||
go wc.receiveMessages() |
|
||||
|
|
||||
// Keep connection alive until stopped
|
|
||||
<-wc.stopCh |
|
||||
} |
|
||||
|
|
||||
// Close closes the worker connection
|
|
||||
func (wc *WorkerConnection) Close() { |
|
||||
wc.mutex.Lock() |
|
||||
defer wc.mutex.Unlock() |
|
||||
|
|
||||
if !wc.active { |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
wc.active = false |
|
||||
close(wc.stopCh) |
|
||||
|
|
||||
if wc.stream != nil { |
|
||||
wc.stream.CloseSend() |
|
||||
} |
|
||||
|
|
||||
if wc.conn != nil { |
|
||||
wc.conn.Close() |
|
||||
} |
|
||||
|
|
||||
glog.Infof("Worker connection %s closed", wc.workerID) |
|
||||
} |
|
||||
|
|
||||
// receiveMessages handles incoming messages from the worker
|
|
||||
func (wc *WorkerConnection) receiveMessages() { |
|
||||
for { |
|
||||
select { |
|
||||
case <-wc.stopCh: |
|
||||
return |
|
||||
default: |
|
||||
} |
|
||||
|
|
||||
msg, err := wc.stream.Recv() |
|
||||
if err != nil { |
|
||||
if err == io.EOF { |
|
||||
glog.Infof("Worker %s closed connection", wc.workerID) |
|
||||
} else { |
|
||||
glog.Errorf("Error receiving from worker %s: %v", wc.workerID, err) |
|
||||
} |
|
||||
wc.Close() |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
wc.updateLastSeen() |
|
||||
// Convert AdminMessage to WorkerMessage for processing
|
|
||||
if workerMsg := convertToWorkerMessage(msg); workerMsg != nil { |
|
||||
wc.handleMessage(workerMsg) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// updateLastSeen updates the last seen timestamp
|
|
||||
func (wc *WorkerConnection) updateLastSeen() { |
|
||||
wc.mutex.Lock() |
|
||||
defer wc.mutex.Unlock() |
|
||||
wc.lastSeen = time.Now() |
|
||||
} |
|
||||
|
|
||||
// handleMessage processes a message from the worker
|
|
||||
func (wc *WorkerConnection) handleMessage(msg *worker_pb.WorkerMessage) { |
|
||||
switch message := msg.Message.(type) { |
|
||||
case *worker_pb.WorkerMessage_Registration: |
|
||||
registration := message.Registration |
|
||||
worker := &Worker{ |
|
||||
ID: registration.WorkerId, |
|
||||
Address: registration.Address, |
|
||||
Capabilities: registration.Capabilities, |
|
||||
} |
|
||||
wc.workerID = worker.ID |
|
||||
// UpdateWorkerStatus stub
|
|
||||
if wc.adminServer.workerRegistry != nil { |
|
||||
// wc.adminServer.workerRegistry.UpdateWorkerStatus(worker) // Commented out - method doesn't exist
|
|
||||
} |
|
||||
glog.Infof("Worker %s registered", worker.ID) |
|
||||
|
|
||||
case *worker_pb.WorkerMessage_Heartbeat: |
|
||||
glog.V(3).Infof("Heartbeat from worker %s", wc.workerID) |
|
||||
|
|
||||
case *worker_pb.WorkerMessage_TaskRequest: |
|
||||
glog.V(2).Infof("Task request from worker %s", wc.workerID) |
|
||||
// AssignTaskToWorker stub
|
|
||||
// task := wc.adminServer.AssignTaskToWorker(wc.workerID) // Commented out - method doesn't exist
|
|
||||
|
|
||||
case *worker_pb.WorkerMessage_TaskUpdate: |
|
||||
update := message.TaskUpdate |
|
||||
// UpdateTaskProgress stub - fix signature
|
|
||||
wc.adminServer.UpdateTaskProgress(update.TaskId, float64(update.Progress)) |
|
||||
|
|
||||
case *worker_pb.WorkerMessage_TaskComplete: |
|
||||
complete := message.TaskComplete |
|
||||
// CompleteTask stub - fix signature
|
|
||||
wc.adminServer.CompleteTask(complete.TaskId, complete.Success, complete.ErrorMessage) |
|
||||
|
|
||||
case *worker_pb.WorkerMessage_Shutdown: |
|
||||
glog.Infof("Worker %s shutting down", wc.workerID) |
|
||||
wc.Close() |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// SendTaskAssignment sends a task assignment to the worker
|
|
||||
func (wc *WorkerConnection) SendTaskAssignment(task *Task) error { |
|
||||
return wc.sendTaskAssignment(task) |
|
||||
} |
|
||||
|
|
||||
// sendTaskAssignment sends a task assignment message
|
|
||||
func (wc *WorkerConnection) sendTaskAssignment(task *types.Task) error { |
|
||||
// Fix type assertions for parameters
|
|
||||
server, _ := task.Parameters["server"].(string) |
|
||||
collection, _ := task.Parameters["collection"].(string) |
|
||||
|
|
||||
// Convert map[string]interface{} to map[string]string
|
|
||||
parameters := make(map[string]string) |
|
||||
for k, v := range task.Parameters { |
|
||||
if str, ok := v.(string); ok { |
|
||||
parameters[k] = str |
|
||||
} else { |
|
||||
parameters[k] = fmt.Sprintf("%v", v) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Add master_client parameter for tasks that need it (especially EC tasks)
|
|
||||
if wc.adminServer.masterClient != nil { |
|
||||
if currentMaster := wc.adminServer.masterClient.GetMaster(context.Background()); currentMaster != "" { |
|
||||
parameters["master_client"] = string(currentMaster) |
|
||||
glog.V(2).Infof("Added master_client parameter to task %s: %s", task.ID, currentMaster) |
|
||||
} else { |
|
||||
glog.Warningf("No master address available for task %s", task.ID) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
assignment := &worker_pb.TaskAssignment{ |
|
||||
TaskId: task.ID, |
|
||||
TaskType: string(task.Type), |
|
||||
Priority: int32(task.Priority), |
|
||||
CreatedTime: task.CreatedAt.Unix(), |
|
||||
Params: &worker_pb.TaskParams{ |
|
||||
VolumeId: task.VolumeID, |
|
||||
Server: server, |
|
||||
Collection: collection, |
|
||||
Parameters: parameters, |
|
||||
}, |
|
||||
Metadata: map[string]string{ |
|
||||
"assigned_at": time.Now().Format(time.RFC3339), |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
response := &worker_pb.AdminMessage{ |
|
||||
AdminId: wc.adminServer.ID, |
|
||||
Timestamp: time.Now().Unix(), |
|
||||
Message: &worker_pb.AdminMessage_TaskAssignment{ |
|
||||
TaskAssignment: assignment, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
return wc.sendMessage(response) |
|
||||
} |
|
||||
|
|
||||
// CancelTask sends a task cancellation to the worker
|
|
||||
func (wc *WorkerConnection) CancelTask(taskID, reason string) error { |
|
||||
cancellation := &worker_pb.TaskCancellation{ |
|
||||
TaskId: taskID, |
|
||||
Reason: reason, |
|
||||
Force: false, |
|
||||
} |
|
||||
|
|
||||
response := &worker_pb.AdminMessage{ |
|
||||
AdminId: wc.adminServer.ID, |
|
||||
Timestamp: time.Now().Unix(), |
|
||||
Message: &worker_pb.AdminMessage_TaskCancellation{ |
|
||||
TaskCancellation: cancellation, |
|
||||
}, |
|
||||
} |
|
||||
|
|
||||
return wc.sendMessage(response) |
|
||||
} |
|
||||
|
|
||||
// sendMessage sends a message to the worker
|
|
||||
func (wc *WorkerConnection) sendMessage(msg *worker_pb.AdminMessage) error { |
|
||||
wc.mutex.RLock() |
|
||||
defer wc.mutex.RUnlock() |
|
||||
|
|
||||
if !wc.active || wc.stream == nil { |
|
||||
return fmt.Errorf("connection to worker %s is not active", wc.workerID) |
|
||||
} |
|
||||
|
|
||||
// The stream expects WorkerMessage from client (admin) to server (worker)
|
|
||||
// Convert AdminMessage to appropriate WorkerMessage format
|
|
||||
workerMsg := &worker_pb.WorkerMessage{ |
|
||||
WorkerId: wc.workerID, |
|
||||
Timestamp: msg.Timestamp, |
|
||||
} |
|
||||
|
|
||||
// Convert AdminMessage content to WorkerMessage based on message type
|
|
||||
switch adminMsg := msg.Message.(type) { |
|
||||
case *worker_pb.AdminMessage_TaskAssignment: |
|
||||
// Task assignments should be sent as notifications to worker
|
|
||||
// Since there's no direct equivalent, we'll create a generic message
|
|
||||
// In a full implementation, this would need proper message type mapping
|
|
||||
_ = adminMsg // Use the variable to avoid unused warning
|
|
||||
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{ |
|
||||
Heartbeat: &worker_pb.WorkerHeartbeat{ |
|
||||
WorkerId: wc.workerID, |
|
||||
Status: "task_assigned", |
|
||||
}, |
|
||||
} |
|
||||
case *worker_pb.AdminMessage_TaskCancellation: |
|
||||
// Similar conversion for task cancellation
|
|
||||
_ = adminMsg // Use the variable to avoid unused warning
|
|
||||
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{ |
|
||||
Heartbeat: &worker_pb.WorkerHeartbeat{ |
|
||||
WorkerId: wc.workerID, |
|
||||
Status: "task_cancelled", |
|
||||
}, |
|
||||
} |
|
||||
default: |
|
||||
// For other message types, send a generic heartbeat
|
|
||||
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{ |
|
||||
Heartbeat: &worker_pb.WorkerHeartbeat{ |
|
||||
WorkerId: wc.workerID, |
|
||||
Status: "admin_message", |
|
||||
}, |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return wc.stream.Send(workerMsg) |
|
||||
} |
|
||||
|
|
||||
// Helper functions
|
|
||||
|
|
||||
// convertCapabilities converts string capabilities to TaskType slice
|
|
||||
func convertCapabilities(capabilities []string) []TaskType { |
|
||||
var result []TaskType |
|
||||
for _, cap := range capabilities { |
|
||||
result = append(result, TaskType(cap)) |
|
||||
} |
|
||||
return result |
|
||||
} |
|
||||
|
|
||||
// WorkerStatus represents worker status information
|
|
||||
type WorkerStatus struct { |
|
||||
Status string |
|
||||
CurrentLoad int |
|
||||
MaxConcurrent int |
|
||||
CurrentTasks []string |
|
||||
TasksCompleted int |
|
||||
TasksFailed int |
|
||||
UptimeSeconds int64 |
|
||||
LastSeen time.Time |
|
||||
} |
|
||||
|
|
||||
// TaskProgress represents task progress information
|
|
||||
type TaskProgress struct { |
|
||||
Progress float64 |
|
||||
Message string |
|
||||
} |
|
||||
|
|
||||
// TaskResult represents task completion result
|
|
||||
type TaskResult struct { |
|
||||
Success bool |
|
||||
Error string |
|
||||
Message string |
|
||||
} |
|
||||
|
|
||||
// convertToWorkerMessage converts AdminMessage to WorkerMessage (stub implementation)
|
|
||||
func convertToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage { |
|
||||
// This is a stub - in real implementation would need proper conversion
|
|
||||
// For now, return nil to avoid processing
|
|
||||
return nil |
|
||||
} |
|
||||
@ -1,348 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// WorkerRegistry manages worker registration and tracking
|
|
||||
type WorkerRegistry struct { |
|
||||
workers map[string]*types.Worker |
|
||||
capabilities map[types.TaskType][]*types.Worker |
|
||||
metrics map[string]*WorkerMetrics |
|
||||
issues map[string][]WorkerIssue |
|
||||
mutex sync.RWMutex |
|
||||
} |
|
||||
|
|
||||
// WorkerIssue represents an issue with a worker
|
|
||||
type WorkerIssue struct { |
|
||||
Type string |
|
||||
Timestamp time.Time |
|
||||
Details string |
|
||||
} |
|
||||
|
|
||||
// NewWorkerRegistry creates a new worker registry
|
|
||||
func NewWorkerRegistry() *WorkerRegistry { |
|
||||
return &WorkerRegistry{ |
|
||||
workers: make(map[string]*types.Worker), |
|
||||
capabilities: make(map[types.TaskType][]*types.Worker), |
|
||||
metrics: make(map[string]*WorkerMetrics), |
|
||||
issues: make(map[string][]WorkerIssue), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// RegisterWorker registers a new worker
|
|
||||
func (wr *WorkerRegistry) RegisterWorker(worker *types.Worker) error { |
|
||||
wr.mutex.Lock() |
|
||||
defer wr.mutex.Unlock() |
|
||||
|
|
||||
if _, exists := wr.workers[worker.ID]; exists { |
|
||||
return fmt.Errorf("worker %s already registered", worker.ID) |
|
||||
} |
|
||||
|
|
||||
// Register worker
|
|
||||
wr.workers[worker.ID] = worker |
|
||||
|
|
||||
// Initialize metrics
|
|
||||
wr.metrics[worker.ID] = &WorkerMetrics{ |
|
||||
TasksCompleted: 0, |
|
||||
TasksFailed: 0, |
|
||||
AverageTaskTime: 0, |
|
||||
LastTaskTime: time.Time{}, |
|
||||
SuccessRate: 1.0, |
|
||||
} |
|
||||
|
|
||||
// Update capabilities mapping
|
|
||||
wr.updateCapabilitiesMapping() |
|
||||
|
|
||||
glog.Infof("Registered worker %s with capabilities: %v", worker.ID, worker.Capabilities) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// UnregisterWorker removes a worker
|
|
||||
func (wr *WorkerRegistry) UnregisterWorker(workerID string) error { |
|
||||
wr.mutex.Lock() |
|
||||
defer wr.mutex.Unlock() |
|
||||
|
|
||||
if _, exists := wr.workers[workerID]; !exists { |
|
||||
return fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
delete(wr.workers, workerID) |
|
||||
delete(wr.metrics, workerID) |
|
||||
delete(wr.issues, workerID) |
|
||||
|
|
||||
// Update capabilities mapping
|
|
||||
wr.updateCapabilitiesMapping() |
|
||||
|
|
||||
glog.Infof("Unregistered worker %s", workerID) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetWorker returns a worker by ID
|
|
||||
func (wr *WorkerRegistry) GetWorker(workerID string) (*types.Worker, bool) { |
|
||||
wr.mutex.RLock() |
|
||||
defer wr.mutex.RUnlock() |
|
||||
|
|
||||
worker, exists := wr.workers[workerID] |
|
||||
return worker, exists |
|
||||
} |
|
||||
|
|
||||
// GetAvailableWorkers returns workers that are available for new tasks
|
|
||||
func (wr *WorkerRegistry) GetAvailableWorkers() []*types.Worker { |
|
||||
wr.mutex.RLock() |
|
||||
defer wr.mutex.RUnlock() |
|
||||
|
|
||||
var available []*types.Worker |
|
||||
for _, worker := range wr.workers { |
|
||||
if worker.Status == "active" && worker.CurrentLoad < worker.MaxConcurrent { |
|
||||
available = append(available, worker) |
|
||||
} |
|
||||
} |
|
||||
return available |
|
||||
} |
|
||||
|
|
||||
// GetWorkersByCapability returns workers that support a specific capability
|
|
||||
func (wr *WorkerRegistry) GetWorkersByCapability(taskType types.TaskType) []*types.Worker { |
|
||||
wr.mutex.RLock() |
|
||||
defer wr.mutex.RUnlock() |
|
||||
|
|
||||
return wr.capabilities[taskType] |
|
||||
} |
|
||||
|
|
||||
// UpdateWorkerHeartbeat updates worker heartbeat and status
|
|
||||
func (wr *WorkerRegistry) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { |
|
||||
wr.mutex.Lock() |
|
||||
defer wr.mutex.Unlock() |
|
||||
|
|
||||
worker, exists := wr.workers[workerID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
// Update worker status
|
|
||||
worker.LastHeartbeat = time.Now() |
|
||||
worker.Status = status.Status |
|
||||
worker.CurrentLoad = status.CurrentLoad |
|
||||
|
|
||||
glog.V(3).Infof("Updated heartbeat for worker %s, status: %s, load: %d/%d", |
|
||||
workerID, status.Status, status.CurrentLoad, worker.MaxConcurrent) |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetTimedOutWorkers returns workers that haven't sent heartbeat within timeout
|
|
||||
func (wr *WorkerRegistry) GetTimedOutWorkers(timeout time.Duration) []string { |
|
||||
wr.mutex.RLock() |
|
||||
defer wr.mutex.RUnlock() |
|
||||
|
|
||||
var timedOut []string |
|
||||
cutoff := time.Now().Add(-timeout) |
|
||||
|
|
||||
for workerID, worker := range wr.workers { |
|
||||
if worker.LastHeartbeat.Before(cutoff) { |
|
||||
timedOut = append(timedOut, workerID) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return timedOut |
|
||||
} |
|
||||
|
|
||||
// MarkWorkerInactive marks a worker as inactive
|
|
||||
func (wr *WorkerRegistry) MarkWorkerInactive(workerID string) { |
|
||||
wr.mutex.Lock() |
|
||||
defer wr.mutex.Unlock() |
|
||||
|
|
||||
if worker, exists := wr.workers[workerID]; exists { |
|
||||
worker.Status = "inactive" |
|
||||
worker.CurrentLoad = 0 |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// RecordWorkerIssue records an issue with a worker
|
|
||||
func (wr *WorkerRegistry) RecordWorkerIssue(workerID string, issueType string) { |
|
||||
wr.mutex.Lock() |
|
||||
defer wr.mutex.Unlock() |
|
||||
|
|
||||
issue := WorkerIssue{ |
|
||||
Type: issueType, |
|
||||
Timestamp: time.Now(), |
|
||||
Details: fmt.Sprintf("Worker issue: %s", issueType), |
|
||||
} |
|
||||
|
|
||||
wr.issues[workerID] = append(wr.issues[workerID], issue) |
|
||||
|
|
||||
// Limit issue history to last 10 issues
|
|
||||
if len(wr.issues[workerID]) > 10 { |
|
||||
wr.issues[workerID] = wr.issues[workerID][1:] |
|
||||
} |
|
||||
|
|
||||
glog.Warningf("Recorded issue for worker %s: %s", workerID, issueType) |
|
||||
} |
|
||||
|
|
||||
// GetWorkerMetrics returns metrics for a worker
|
|
||||
func (wr *WorkerRegistry) GetWorkerMetrics(workerID string) *WorkerMetrics { |
|
||||
wr.mutex.RLock() |
|
||||
defer wr.mutex.RUnlock() |
|
||||
|
|
||||
return wr.metrics[workerID] |
|
||||
} |
|
||||
|
|
||||
// UpdateWorkerMetrics updates performance metrics for a worker
|
|
||||
func (wr *WorkerRegistry) UpdateWorkerMetrics(workerID string, taskDuration time.Duration, success bool) { |
|
||||
wr.mutex.Lock() |
|
||||
defer wr.mutex.Unlock() |
|
||||
|
|
||||
metrics, exists := wr.metrics[workerID] |
|
||||
if !exists { |
|
||||
return |
|
||||
} |
|
||||
|
|
||||
if success { |
|
||||
metrics.TasksCompleted++ |
|
||||
} else { |
|
||||
metrics.TasksFailed++ |
|
||||
} |
|
||||
|
|
||||
metrics.LastTaskTime = time.Now() |
|
||||
|
|
||||
// Update average task time
|
|
||||
totalTasks := metrics.TasksCompleted + metrics.TasksFailed |
|
||||
if totalTasks > 0 { |
|
||||
oldAvg := metrics.AverageTaskTime |
|
||||
metrics.AverageTaskTime = time.Duration( |
|
||||
(float64(oldAvg)*float64(totalTasks-1) + float64(taskDuration)) / float64(totalTasks), |
|
||||
) |
|
||||
} |
|
||||
|
|
||||
// Update success rate
|
|
||||
if totalTasks > 0 { |
|
||||
metrics.SuccessRate = float64(metrics.TasksCompleted) / float64(totalTasks) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetBestWorkerForTask returns the best worker for a specific task type
|
|
||||
func (wr *WorkerRegistry) GetBestWorkerForTask(taskType types.TaskType) *types.Worker { |
|
||||
wr.mutex.RLock() |
|
||||
defer wr.mutex.RUnlock() |
|
||||
|
|
||||
candidates := wr.capabilities[taskType] |
|
||||
if len(candidates) == 0 { |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
var bestWorker *types.Worker |
|
||||
bestScore := -1.0 |
|
||||
|
|
||||
for _, worker := range candidates { |
|
||||
// Skip if not available
|
|
||||
if worker.Status != "active" || worker.CurrentLoad >= worker.MaxConcurrent { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Calculate score based on multiple factors
|
|
||||
score := wr.calculateWorkerScore(worker) |
|
||||
if bestWorker == nil || score > bestScore { |
|
||||
bestWorker = worker |
|
||||
bestScore = score |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return bestWorker |
|
||||
} |
|
||||
|
|
||||
// calculateWorkerScore calculates a score for worker selection
|
|
||||
func (wr *WorkerRegistry) calculateWorkerScore(worker *types.Worker) float64 { |
|
||||
metrics := wr.metrics[worker.ID] |
|
||||
if metrics == nil { |
|
||||
return 0.5 // Default score for new workers
|
|
||||
} |
|
||||
|
|
||||
// Factors for scoring:
|
|
||||
// 1. Available capacity (0.0 to 1.0)
|
|
||||
capacityScore := float64(worker.MaxConcurrent-worker.CurrentLoad) / float64(worker.MaxConcurrent) |
|
||||
|
|
||||
// 2. Success rate (0.0 to 1.0)
|
|
||||
successScore := metrics.SuccessRate |
|
||||
|
|
||||
// 3. Recent activity bonus (workers that completed tasks recently get slight bonus)
|
|
||||
activityScore := 0.0 |
|
||||
if !metrics.LastTaskTime.IsZero() && time.Since(metrics.LastTaskTime) < time.Hour { |
|
||||
activityScore = 0.1 |
|
||||
} |
|
||||
|
|
||||
// 4. Issue penalty (workers with recent issues get penalty)
|
|
||||
issuePenalty := 0.0 |
|
||||
if issues, exists := wr.issues[worker.ID]; exists { |
|
||||
recentIssues := 0 |
|
||||
cutoff := time.Now().Add(-time.Hour) |
|
||||
for _, issue := range issues { |
|
||||
if issue.Timestamp.After(cutoff) { |
|
||||
recentIssues++ |
|
||||
} |
|
||||
} |
|
||||
issuePenalty = float64(recentIssues) * 0.1 |
|
||||
} |
|
||||
|
|
||||
// Weighted average
|
|
||||
score := (capacityScore*0.4 + successScore*0.4 + activityScore) - issuePenalty |
|
||||
|
|
||||
if score < 0 { |
|
||||
score = 0 |
|
||||
} |
|
||||
if score > 1 { |
|
||||
score = 1 |
|
||||
} |
|
||||
|
|
||||
return score |
|
||||
} |
|
||||
|
|
||||
// updateCapabilitiesMapping rebuilds the capabilities mapping
|
|
||||
func (wr *WorkerRegistry) updateCapabilitiesMapping() { |
|
||||
// Clear existing mapping
|
|
||||
for taskType := range wr.capabilities { |
|
||||
wr.capabilities[taskType] = nil |
|
||||
} |
|
||||
|
|
||||
// Rebuild mapping
|
|
||||
for _, worker := range wr.workers { |
|
||||
for _, capability := range worker.Capabilities { |
|
||||
wr.capabilities[capability] = append(wr.capabilities[capability], worker) |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetRegistryStats returns statistics about the registry
|
|
||||
func (wr *WorkerRegistry) GetRegistryStats() map[string]interface{} { |
|
||||
wr.mutex.RLock() |
|
||||
defer wr.mutex.RUnlock() |
|
||||
|
|
||||
stats := make(map[string]interface{}) |
|
||||
stats["total_workers"] = len(wr.workers) |
|
||||
|
|
||||
statusCounts := make(map[string]int) |
|
||||
capabilityCounts := make(map[types.TaskType]int) |
|
||||
totalLoad := 0 |
|
||||
maxCapacity := 0 |
|
||||
|
|
||||
for _, worker := range wr.workers { |
|
||||
statusCounts[worker.Status]++ |
|
||||
totalLoad += worker.CurrentLoad |
|
||||
maxCapacity += worker.MaxConcurrent |
|
||||
|
|
||||
for _, capability := range worker.Capabilities { |
|
||||
capabilityCounts[capability]++ |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
stats["by_status"] = statusCounts |
|
||||
stats["by_capability"] = capabilityCounts |
|
||||
stats["total_load"] = totalLoad |
|
||||
stats["max_capacity"] = maxCapacity |
|
||||
stats["utilization"] = float64(totalLoad) / float64(maxCapacity) * 100.0 |
|
||||
|
|
||||
return stats |
|
||||
} |
|
||||
@ -1,324 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"sync" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/wdclient" |
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// AdminConfig contains configuration for the admin server
|
|
||||
type AdminConfig struct { |
|
||||
ScanInterval time.Duration |
|
||||
WorkerTimeout time.Duration |
|
||||
TaskTimeout time.Duration |
|
||||
MaxRetries int |
|
||||
ReconcileInterval time.Duration |
|
||||
EnableFailureRecovery bool |
|
||||
MaxConcurrentTasks int |
|
||||
} |
|
||||
|
|
||||
// AdminServer manages workers and tasks
|
|
||||
type AdminServer struct { |
|
||||
config *AdminConfig |
|
||||
masterClient *wdclient.MasterClient |
|
||||
running bool |
|
||||
mutex sync.RWMutex |
|
||||
|
|
||||
// Task management
|
|
||||
tasks map[string]*types.Task |
|
||||
taskQueue []*types.Task |
|
||||
activeTasks map[string]*types.Task |
|
||||
|
|
||||
// Worker management
|
|
||||
workers map[string]*types.Worker |
|
||||
workerStatus map[string]*types.WorkerStatus |
|
||||
|
|
||||
// Task history
|
|
||||
taskHistory []TaskHistoryEntry |
|
||||
} |
|
||||
|
|
||||
// TaskHistoryEntry represents a single task history entry
|
|
||||
type TaskHistoryEntry struct { |
|
||||
TaskID string |
|
||||
TaskType types.TaskType |
|
||||
VolumeID uint32 |
|
||||
WorkerID string |
|
||||
Status types.TaskStatus |
|
||||
StartedAt time.Time |
|
||||
CompletedAt time.Time |
|
||||
Duration time.Duration |
|
||||
ErrorMessage string |
|
||||
} |
|
||||
|
|
||||
// SystemStats represents system statistics
|
|
||||
type SystemStats struct { |
|
||||
ActiveTasks int |
|
||||
QueuedTasks int |
|
||||
ActiveWorkers int |
|
||||
TotalTasks int |
|
||||
} |
|
||||
|
|
||||
// NewAdminServer creates a new admin server
|
|
||||
func NewAdminServer(config *AdminConfig, masterClient *wdclient.MasterClient) *AdminServer { |
|
||||
return &AdminServer{ |
|
||||
config: config, |
|
||||
masterClient: masterClient, |
|
||||
tasks: make(map[string]*types.Task), |
|
||||
taskQueue: make([]*types.Task, 0), |
|
||||
activeTasks: make(map[string]*types.Task), |
|
||||
workers: make(map[string]*types.Worker), |
|
||||
workerStatus: make(map[string]*types.WorkerStatus), |
|
||||
taskHistory: make([]TaskHistoryEntry, 0), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Start starts the admin server
|
|
||||
func (as *AdminServer) Start() error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if as.running { |
|
||||
return fmt.Errorf("admin server is already running") |
|
||||
} |
|
||||
|
|
||||
as.running = true |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// Stop stops the admin server
|
|
||||
func (as *AdminServer) Stop() error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
as.running = false |
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// RegisterWorker registers a new worker
|
|
||||
func (as *AdminServer) RegisterWorker(worker *types.Worker) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
as.workers[worker.ID] = worker |
|
||||
as.workerStatus[worker.ID] = &types.WorkerStatus{ |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// QueueTask adds a new task to the task queue
|
|
||||
func (as *AdminServer) QueueTask(task *types.Task) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
if task.ID == "" { |
|
||||
task.ID = fmt.Sprintf("task-%d", time.Now().UnixNano()) |
|
||||
} |
|
||||
|
|
||||
task.Status = types.TaskStatusPending |
|
||||
task.CreatedAt = time.Now() |
|
||||
|
|
||||
as.tasks[task.ID] = task |
|
||||
as.taskQueue = append(as.taskQueue, task) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// RequestTask requests a task for a worker
|
|
||||
func (as *AdminServer) RequestTask(workerID string, capabilities []types.TaskType) (*types.Task, error) { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
if !as.running { |
|
||||
return nil, fmt.Errorf("admin server is not running") |
|
||||
} |
|
||||
|
|
||||
// Check if worker exists
|
|
||||
worker, exists := as.workers[workerID] |
|
||||
if !exists { |
|
||||
return nil, fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
// Check if worker has capacity
|
|
||||
status := as.workerStatus[workerID] |
|
||||
if status.CurrentLoad >= worker.MaxConcurrent { |
|
||||
return nil, nil // No capacity
|
|
||||
} |
|
||||
|
|
||||
// Find a suitable task
|
|
||||
for i, task := range as.taskQueue { |
|
||||
if task.Status != types.TaskStatusPending { |
|
||||
continue |
|
||||
} |
|
||||
|
|
||||
// Check if worker can handle this task type
|
|
||||
canHandle := false |
|
||||
for _, capability := range capabilities { |
|
||||
if task.Type == capability { |
|
||||
canHandle = true |
|
||||
break |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if canHandle { |
|
||||
// Assign task to worker
|
|
||||
task.Status = types.TaskStatusInProgress |
|
||||
task.WorkerID = workerID |
|
||||
now := time.Now() |
|
||||
task.StartedAt = &now |
|
||||
|
|
||||
// Move task from queue to active tasks
|
|
||||
as.taskQueue = append(as.taskQueue[:i], as.taskQueue[i+1:]...) |
|
||||
as.activeTasks[task.ID] = task |
|
||||
|
|
||||
// Update worker load
|
|
||||
status.CurrentLoad++ |
|
||||
|
|
||||
return task, nil |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return nil, nil // No suitable task found
|
|
||||
} |
|
||||
|
|
||||
// UpdateTaskProgress updates task progress
|
|
||||
func (as *AdminServer) UpdateTaskProgress(taskID string, progress float64) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
task, exists := as.tasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
task.Progress = progress |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// CompleteTask marks a task as completed
|
|
||||
func (as *AdminServer) CompleteTask(taskID string, success bool, errorMessage string) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
task, exists := as.tasks[taskID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("task %s not found", taskID) |
|
||||
} |
|
||||
|
|
||||
// Update task status
|
|
||||
if success { |
|
||||
task.Status = types.TaskStatusCompleted |
|
||||
} else { |
|
||||
task.Status = types.TaskStatusFailed |
|
||||
task.Error = errorMessage |
|
||||
} |
|
||||
|
|
||||
now := time.Now() |
|
||||
task.CompletedAt = &now |
|
||||
|
|
||||
// Remove from active tasks
|
|
||||
delete(as.activeTasks, taskID) |
|
||||
|
|
||||
// Update worker load
|
|
||||
if task.WorkerID != "" { |
|
||||
if status, exists := as.workerStatus[task.WorkerID]; exists { |
|
||||
status.CurrentLoad-- |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Add to history
|
|
||||
var duration time.Duration |
|
||||
if task.StartedAt != nil { |
|
||||
duration = now.Sub(*task.StartedAt) |
|
||||
} |
|
||||
|
|
||||
entry := TaskHistoryEntry{ |
|
||||
TaskID: task.ID, |
|
||||
TaskType: task.Type, |
|
||||
VolumeID: task.VolumeID, |
|
||||
WorkerID: task.WorkerID, |
|
||||
Status: task.Status, |
|
||||
StartedAt: *task.StartedAt, |
|
||||
CompletedAt: now, |
|
||||
Duration: duration, |
|
||||
ErrorMessage: errorMessage, |
|
||||
} |
|
||||
as.taskHistory = append(as.taskHistory, entry) |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// UpdateWorkerHeartbeat updates worker heartbeat
|
|
||||
func (as *AdminServer) UpdateWorkerHeartbeat(workerID string, status *types.WorkerStatus) error { |
|
||||
as.mutex.Lock() |
|
||||
defer as.mutex.Unlock() |
|
||||
|
|
||||
worker, exists := as.workers[workerID] |
|
||||
if !exists { |
|
||||
return fmt.Errorf("worker %s not found", workerID) |
|
||||
} |
|
||||
|
|
||||
worker.LastHeartbeat = time.Now() |
|
||||
as.workerStatus[workerID] = status |
|
||||
|
|
||||
return nil |
|
||||
} |
|
||||
|
|
||||
// GetSystemStats returns system statistics
|
|
||||
func (as *AdminServer) GetSystemStats() *SystemStats { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
|
|
||||
activeWorkers := 0 |
|
||||
for _, status := range as.workerStatus { |
|
||||
if status.Status == "active" { |
|
||||
activeWorkers++ |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return &SystemStats{ |
|
||||
ActiveTasks: len(as.activeTasks), |
|
||||
QueuedTasks: len(as.taskQueue), |
|
||||
ActiveWorkers: activeWorkers, |
|
||||
TotalTasks: len(as.tasks), |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// GetQueuedTaskCount returns the number of queued tasks
|
|
||||
func (as *AdminServer) GetQueuedTaskCount() int { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
return len(as.taskQueue) |
|
||||
} |
|
||||
|
|
||||
// GetActiveTaskCount returns the number of active tasks
|
|
||||
func (as *AdminServer) GetActiveTaskCount() int { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
return len(as.activeTasks) |
|
||||
} |
|
||||
|
|
||||
// GetTaskHistory returns task history
|
|
||||
func (as *AdminServer) GetTaskHistory() []TaskHistoryEntry { |
|
||||
as.mutex.RLock() |
|
||||
defer as.mutex.RUnlock() |
|
||||
|
|
||||
// Return a copy of the history
|
|
||||
history := make([]TaskHistoryEntry, len(as.taskHistory)) |
|
||||
copy(history, as.taskHistory) |
|
||||
return history |
|
||||
} |
|
||||
@ -1,3 +0,0 @@ |
|||||
module task_minimal |
|
||||
|
|
||||
go 1.24.1 |
|
||||
@ -1,233 +0,0 @@ |
|||||
package task |
|
||||
|
|
||||
import ( |
|
||||
"fmt" |
|
||||
"testing" |
|
||||
"time" |
|
||||
|
|
||||
"github.com/seaweedfs/seaweedfs/weed/worker/types" |
|
||||
) |
|
||||
|
|
||||
// TestSimpleIntegration tests basic admin-worker operational flow without complex dependencies
|
|
||||
func TestSimpleIntegration(t *testing.T) { |
|
||||
t.Logf("Starting simple integration test") |
|
||||
|
|
||||
// Step 1: Create a minimal admin server configuration
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
// Step 2: Create admin server with nil master client (for testing)
|
|
||||
adminServer := NewAdminServer(config, nil) |
|
||||
|
|
||||
// Step 3: Start admin server
|
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Step 4: Test worker registration
|
|
||||
t.Logf("Testing worker registration") |
|
||||
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "test-worker-1", |
|
||||
Address: "localhost:9001", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 2, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully registered worker %s", worker.ID) |
|
||||
|
|
||||
// Step 5: Test task queueing
|
|
||||
t.Logf("Testing task queueing") |
|
||||
|
|
||||
task := &types.Task{ |
|
||||
ID: "test-task-1", |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: 1001, |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to queue task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully queued task %s", task.ID) |
|
||||
|
|
||||
// Step 6: Test task request by worker
|
|
||||
t.Logf("Testing task request") |
|
||||
|
|
||||
assignedTask, err := adminServer.RequestTask("test-worker-1", []types.TaskType{types.TaskTypeVacuum}) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to request task: %v", err) |
|
||||
} |
|
||||
|
|
||||
if assignedTask != nil { |
|
||||
t.Logf("Successfully assigned task %s to worker", assignedTask.ID) |
|
||||
|
|
||||
// Step 7: Test task progress updates
|
|
||||
t.Logf("Testing task progress updates") |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 50.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateTaskProgress(assignedTask.ID, 100.0) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update task progress: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Step 8: Test task completion
|
|
||||
t.Logf("Testing task completion") |
|
||||
|
|
||||
err = adminServer.CompleteTask(assignedTask.ID, true, "") |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to complete task: %v", err) |
|
||||
} |
|
||||
t.Logf("Successfully completed task %s", assignedTask.ID) |
|
||||
} else { |
|
||||
t.Logf("No task was assigned (queue might be empty)") |
|
||||
} |
|
||||
|
|
||||
// Step 9: Test basic metrics
|
|
||||
t.Logf("Testing basic metrics") |
|
||||
|
|
||||
stats := adminServer.GetSystemStats() |
|
||||
if stats != nil { |
|
||||
t.Logf("System stats: Active tasks=%d, Queued tasks=%d, Active workers=%d", |
|
||||
stats.ActiveTasks, stats.QueuedTasks, stats.ActiveWorkers) |
|
||||
} |
|
||||
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
activeCount := adminServer.GetActiveTaskCount() |
|
||||
t.Logf("Queue status: %d queued, %d active tasks", queuedCount, activeCount) |
|
||||
|
|
||||
// Step 10: Test task history
|
|
||||
history := adminServer.GetTaskHistory() |
|
||||
t.Logf("Task history contains %d entries", len(history)) |
|
||||
|
|
||||
t.Logf("Simple integration test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestWorkerHeartbeat tests worker heartbeat functionality
|
|
||||
func TestWorkerHeartbeat(t *testing.T) { |
|
||||
t.Logf("Testing worker heartbeat") |
|
||||
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Register a worker
|
|
||||
worker := &types.Worker{ |
|
||||
ID: "heartbeat-worker", |
|
||||
Address: "localhost:9002", |
|
||||
Capabilities: []types.TaskType{types.TaskTypeVacuum}, |
|
||||
MaxConcurrent: 1, |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
LastHeartbeat: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.RegisterWorker(worker) |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to register worker: %v", err) |
|
||||
} |
|
||||
|
|
||||
// Test heartbeat update
|
|
||||
status := &types.WorkerStatus{ |
|
||||
Status: "active", |
|
||||
CurrentLoad: 0, |
|
||||
} |
|
||||
|
|
||||
err = adminServer.UpdateWorkerHeartbeat("heartbeat-worker", status) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to update worker heartbeat: %v", err) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Worker heartbeat test completed successfully") |
|
||||
} |
|
||||
|
|
||||
// TestTaskQueueOperations tests task queue operations
|
|
||||
func TestTaskQueueOperations(t *testing.T) { |
|
||||
t.Logf("Testing task queue operations") |
|
||||
|
|
||||
config := &AdminConfig{ |
|
||||
ScanInterval: 10 * time.Second, |
|
||||
WorkerTimeout: 30 * time.Second, |
|
||||
TaskTimeout: 2 * time.Hour, |
|
||||
MaxRetries: 3, |
|
||||
ReconcileInterval: 5 * time.Minute, |
|
||||
EnableFailureRecovery: true, |
|
||||
MaxConcurrentTasks: 5, |
|
||||
} |
|
||||
|
|
||||
adminServer := NewAdminServer(config, nil) |
|
||||
err := adminServer.Start() |
|
||||
if err != nil { |
|
||||
t.Fatalf("Failed to start admin server: %v", err) |
|
||||
} |
|
||||
defer adminServer.Stop() |
|
||||
|
|
||||
// Test queuing multiple tasks
|
|
||||
for i := 0; i < 3; i++ { |
|
||||
task := &types.Task{ |
|
||||
ID: fmt.Sprintf("queue-test-task-%d", i), |
|
||||
Type: types.TaskTypeVacuum, |
|
||||
VolumeID: uint32(2000 + i), |
|
||||
Server: "localhost:8080", |
|
||||
Status: types.TaskStatusPending, |
|
||||
Priority: types.TaskPriorityNormal, |
|
||||
Parameters: map[string]interface{}{ |
|
||||
"garbage_threshold": "0.3", |
|
||||
}, |
|
||||
CreatedAt: time.Now(), |
|
||||
} |
|
||||
|
|
||||
err = adminServer.QueueTask(task) |
|
||||
if err != nil { |
|
||||
t.Errorf("Failed to queue task %d: %v", i, err) |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// Check queue size
|
|
||||
queuedCount := adminServer.GetQueuedTaskCount() |
|
||||
if queuedCount != 3 { |
|
||||
t.Errorf("Expected 3 queued tasks, got %d", queuedCount) |
|
||||
} |
|
||||
|
|
||||
t.Logf("Task queue operations test completed successfully") |
|
||||
} |
|
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue