package task
import (
	"context"
	"fmt"
	"math/rand"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// TaskSimulator provides a comprehensive simulation framework for testing the task distribution system
type TaskSimulator struct {
	adminServer *AdminServer                   // admin server under test (not wired up in the visible simulation code)
	mockWorkers []*MockWorker                  // simulated workers that execute tasks
	mockMaster  *MockMasterClient              // simulated master holding generated volume state
	scenarios   map[string]*SimulationScenario // registered scenarios, keyed by scenario Name
	results     map[string]*SimulationResult   // results of completed runs, keyed by scenario Name
	mutex       sync.RWMutex                   // guards scenarios and results maps
}
// SimulationScenario defines a test scenario
type SimulationScenario struct {
	Name            string            // unique identifier (used as map key in the simulator)
	Description     string            // human-readable summary of what the scenario exercises
	WorkerCount     int               // number of mock workers to create
	VolumeCount     int               // number of mock volumes to generate
	Duration        time.Duration     // overall scenario timeout (bounds the run via context)
	FailurePatterns []*FailurePattern // per-worker failure modes; pattern i is assigned to worker i
	TestCases       []*TestCase       // explicit cases executed before continuous background load
}
// FailurePattern defines how failures occur during simulation
type FailurePattern struct {
	Type        FailureType   // which kind of failure to inject
	Probability float64       // 0.0 to 1.0 — chance per progress tick once timing constraints pass
	Timing      *TimingSpec   // When during task execution; nil means any time
	Duration    time.Duration // intended failure duration (not consulted by the visible injection code)
	Details     string        // free-form description
}
// TestCase defines specific test scenarios
type TestCase struct {
	Name            string          // case identifier, embedded in generated task IDs
	VolumeID        uint32          // volume the generated task targets
	TaskType        types.TaskType  // task kind, e.g. erasure coding or vacuum
	ExpectedOutcome string          // informational only — not verified by the visible simulation code
	FailureToInject *FailurePattern // optional failure injected during this case's task execution
}
// FailureType represents different types of failures
type FailureType string

const (
	FailureWorkerTimeout    FailureType = "worker_timeout"      // worker stops responding; its Status becomes "timeout"
	FailureTaskStuck        FailureType = "task_stuck"          // task stops making progress
	FailureTaskCrash        FailureType = "task_crash"          // task fails outright
	FailureDuplicate        FailureType = "duplicate_task"      // duplicate task submission detected
	FailureResourceExhaust  FailureType = "resource_exhaustion" // worker runs out of resources
	FailureNetworkPartition FailureType = "network_partition"   // worker becomes unreachable
)
// TimingSpec defines when a failure occurs
type TimingSpec struct {
	MinProgress float64       // Minimum progress (percent) before the failure can occur
	MaxProgress float64       // Maximum progress (percent) at which the failure can still occur
	Delay       time.Duration // Fixed minimum elapsed time before the failure can occur
}
// SimulationResult tracks the results of a simulation
type SimulationResult struct {
	ScenarioName         string        // name of the scenario that produced this result
	StartTime            time.Time     // when the run began
	EndTime              time.Time     // when the run finished
	Duration             time.Duration // EndTime - StartTime
	TasksCreated         int           // tasks generated (test cases + background load)
	TasksCompleted       int           // tasks that reached 100% progress
	TasksFailed          int           // tasks that crashed via failure injection
	TasksStuck           int           // tasks marked stuck via failure injection
	WorkerTimeouts       int           // worker-timeout failures injected
	DuplicatesFound      int           // duplicate-task failures injected
	StateInconsistencies int           // inconsistencies flagged by periodic checks
	Errors               []string      // fatal problems; any entry makes Success false
	Warnings             []string      // non-fatal observations
	Success              bool          // true iff Errors is empty at scenario end
}
// MockWorker simulates a worker with controllable behavior
type MockWorker struct {
	ID            string               // e.g. "worker_1"
	Capabilities  []types.TaskType     // task types this worker can run
	MaxConcurrent int                  // max simultaneous tasks
	CurrentTasks  map[string]*MockTask // in-flight tasks keyed by task ID; guarded by mutex
	Status        string               // "active", or a failure state like "timeout"/"partitioned"
	FailureMode   *FailurePattern      // optional always-on failure pattern for this worker
	mutex         sync.Mutex           // guards CurrentTasks
}
// MockTask represents a simulated task execution
type MockTask struct {
	Task      *types.Task // the underlying task definition
	StartTime time.Time   // when simulated execution began
	Progress  float64     // percent complete (0-100), advanced once per second
	Stuck     bool        // set by FailureTaskStuck injection
	Failed    bool        // set by FailureTaskCrash injection
	Completed bool        // set when progress reaches 100%
}
// MockMasterClient simulates master server interactions
type MockMasterClient struct {
	volumes map[uint32]*VolumeInfo // generated volume state keyed by volume ID
	// inconsistency is never set by the visible code — presumably intended to
	// toggle simulated master/admin state divergence; confirm before relying on it.
	inconsistency bool
	mutex         sync.RWMutex // guards volumes
}
// NewTaskSimulator constructs a TaskSimulator with empty scenario and result
// registries, ready for RegisterScenario / RunScenario calls.
func NewTaskSimulator() *TaskSimulator {
	sim := &TaskSimulator{}
	sim.scenarios = make(map[string]*SimulationScenario)
	sim.results = make(map[string]*SimulationResult)
	return sim
}
// RegisterScenario adds a scenario to the simulator's registry under its Name,
// replacing any previously registered scenario with the same name.
func (ts *TaskSimulator) RegisterScenario(scenario *SimulationScenario) {
	ts.mutex.Lock()
	ts.scenarios[scenario.Name] = scenario
	ts.mutex.Unlock()

	glog.Infof("Registered simulation scenario: %s", scenario.Name)
}
// RunScenario executes a registered simulation scenario and records its result.
// It returns an error when the scenario name is unknown or the environment
// cannot be prepared; otherwise the (possibly unsuccessful) result is stored
// in the results registry and returned. Success means no errors were recorded.
func (ts *TaskSimulator) RunScenario(scenarioName string) (*SimulationResult, error) {
	ts.mutex.RLock()
	scenario, ok := ts.scenarios[scenarioName]
	ts.mutex.RUnlock()
	if !ok {
		return nil, fmt.Errorf("scenario %s not found", scenarioName)
	}

	glog.Infof("Starting simulation scenario: %s", scenarioName)

	result := &SimulationResult{
		ScenarioName: scenarioName,
		StartTime:    time.Now(),
		Errors:       []string{},
		Warnings:     []string{},
	}

	// Build the mock master, volumes and workers for this run.
	if err := ts.setupEnvironment(scenario); err != nil {
		return nil, fmt.Errorf("failed to setup environment: %v", err)
	}

	// Bound the whole scenario by its configured duration.
	ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration)
	defer cancel()
	ts.executeScenario(ctx, scenario, result)

	// Tear down the mock environment before finalizing the result.
	ts.cleanup()

	result.EndTime = time.Now()
	result.Duration = result.EndTime.Sub(result.StartTime)
	result.Success = len(result.Errors) == 0

	ts.mutex.Lock()
	ts.results[scenarioName] = result
	ts.mutex.Unlock()

	glog.Infof("Completed simulation scenario: %s (success: %v)", scenarioName, result.Success)
	return result, nil
}
// setupEnvironment prepares the simulation environment: a mock master with
// randomly generated volumes and a pool of mock workers, with any scenario
// failure patterns applied one-per-worker in order.
func (ts *TaskSimulator) setupEnvironment(scenario *SimulationScenario) error {
	// Create mock master client
	ts.mockMaster = &MockMasterClient{
		volumes: make(map[uint32]*VolumeInfo),
	}

	// Generate mock volumes with randomized size, garbage amount and age.
	for i := uint32(1); i <= uint32(scenario.VolumeCount); i++ {
		volume := &VolumeInfo{
			ID: i,
			// Fix: use Int63n here — the untyped constant 30 GiB exceeds
			// math.MaxInt32, so rand.Intn(30<<30) fails to compile on
			// platforms where int is 32 bits.
			Size:             uint64(rand.Int63n(30 * 1024 * 1024 * 1024)), // Random size up to 30GB
			Collection:       fmt.Sprintf("collection_%d", (i%3)+1),
			DeletedByteCount: uint64(rand.Int63n(1024 * 1024 * 1024)), // Random garbage up to 1GB
			ReadOnly:         false,
			Server:           fmt.Sprintf("server_%d", (i%6)+1),
			ModifiedAtSecond: time.Now().Add(-time.Duration(rand.Intn(86400)) * time.Second).Unix(),
		}
		ts.mockMaster.volumes[i] = volume
	}

	// Create mock workers, each capable of EC and vacuum tasks.
	ts.mockWorkers = make([]*MockWorker, scenario.WorkerCount)
	for i := 0; i < scenario.WorkerCount; i++ {
		worker := &MockWorker{
			ID:            fmt.Sprintf("worker_%d", i+1),
			Capabilities:  []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
			MaxConcurrent: 2,
			CurrentTasks:  make(map[string]*MockTask),
			Status:        "active",
		}
		// Apply failure patterns: pattern i is assigned to worker i.
		if i < len(scenario.FailurePatterns) {
			worker.FailureMode = scenario.FailurePatterns[i]
		}
		ts.mockWorkers[i] = worker
	}

	// Initialize admin server (simplified for simulation)
	config := DefaultAdminConfig()
	config.ScanInterval = 10 * time.Second
	config.TaskTimeout = 30 * time.Second
	// Note: In a real implementation, this would use the actual master client
	// For simulation, we'd need to inject our mock
	return nil
}
// executeScenario drives a scenario: it first runs each explicit test case,
// then keeps generating background load until the context expires.
func (ts *TaskSimulator) executeScenario(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
	// Explicit, targeted test cases first.
	for _, tc := range scenario.TestCases {
		ts.executeTestCase(ctx, tc, result)
	}
	// Then ongoing random load for the remainder of the scenario duration.
	ts.runContinuousSimulation(ctx, scenario, result)
}
// executeTestCase builds a task for a single test case, picks an eligible
// worker and starts its simulated execution, applying the case's failure
// injection if one is configured. Failure to find a worker is recorded as a
// scenario error.
func (ts *TaskSimulator) executeTestCase(ctx context.Context, testCase *TestCase, result *SimulationResult) {
	glog.V(1).Infof("Executing test case: %s", testCase.Name)

	// Task ID embeds the case name and a nanosecond timestamp for uniqueness.
	task := &types.Task{
		ID:        fmt.Sprintf("test_%s_%d", testCase.Name, time.Now().UnixNano()),
		Type:      testCase.TaskType,
		VolumeID:  testCase.VolumeID,
		Priority:  types.TaskPriorityNormal,
		CreatedAt: time.Now(),
	}
	result.TasksCreated++

	worker := ts.selectWorkerForTask(task)
	if worker == nil {
		result.Errors = append(result.Errors, fmt.Sprintf("No available worker for test case %s", testCase.Name))
		return
	}

	ts.executeTaskOnWorker(ctx, task, worker, testCase.FailureToInject, result)
}
// runContinuousSimulation generates background task load and runs periodic
// consistency checks on a 5-second cadence until the scenario context is done.
func (ts *TaskSimulator) runContinuousSimulation(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			ts.simulateOngoingTasks(result)
			ts.checkForInconsistencies(result)
		case <-ctx.Done():
			return
		}
	}
}
// executeTaskOnWorker registers the task with the worker and launches its
// simulated execution in a background goroutine; the goroutine removes the
// task from the worker when it finishes.
func (ts *TaskSimulator) executeTaskOnWorker(ctx context.Context, task *types.Task, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
	mockTask := &MockTask{
		Task:      task,
		StartTime: time.Now(),
		Progress:  0.0,
	}

	worker.mutex.Lock()
	worker.CurrentTasks[task.ID] = mockTask
	worker.mutex.Unlock()

	// Run the simulated execution asynchronously.
	go ts.simulateTaskExecution(ctx, mockTask, worker, failurePattern, result)
}
// simulateTaskExecution simulates the execution of a single task.
//
// Progress advances once per second toward a fixed 20-second duration. Before
// each tick completes, the test-case failure pattern (if any) and then the
// worker's own failure mode (if any) get a chance to fire; either aborts the
// task. On every exit path the task is removed from the worker's CurrentTasks.
//
// NOTE(review): this goroutine writes mockTask fields and result counters
// without synchronization, while checkForInconsistencies reads mockTask under
// worker.mutex and RunScenario reads result after the context ends — this is
// a data race under `go test -race`; confirm whether the simulator is meant
// to be race-clean.
func (ts *TaskSimulator) simulateTaskExecution(ctx context.Context, mockTask *MockTask, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
	// Always deregister the task from its worker, whatever the outcome.
	defer func() {
		worker.mutex.Lock()
		delete(worker.CurrentTasks, mockTask.Task.ID)
		worker.mutex.Unlock()
	}()

	duration := 20 * time.Second // Base task duration
	progressTicker := time.NewTicker(time.Second)
	defer progressTicker.Stop()

	startTime := time.Now()
	for {
		select {
		case <-ctx.Done():
			// Scenario timed out or was cancelled; abandon the task.
			return
		case <-progressTicker.C:
			elapsed := time.Since(startTime)
			progress := float64(elapsed) / float64(duration) * 100.0
			if progress >= 100.0 {
				mockTask.Completed = true
				result.TasksCompleted++
				glog.V(2).Infof("Task %s completed successfully", mockTask.Task.ID)
				return
			}
			mockTask.Progress = progress
			// Check for failure injection (test-case pattern is checked first)
			if failurePattern != nil && ts.shouldInjectFailure(failurePattern, progress, elapsed) {
				ts.injectFailure(mockTask, worker, failurePattern, result)
				return
			}
			// Check for worker failure mode
			if worker.FailureMode != nil && ts.shouldInjectFailure(worker.FailureMode, progress, elapsed) {
				ts.injectFailure(mockTask, worker, worker.FailureMode, result)
				return
			}
		}
	}
}
// shouldInjectFailure decides whether a failure fires at the current tick.
// Timing constraints (progress window and minimum elapsed delay) are checked
// first; once they are satisfied, the pattern's probability is rolled per tick.
func (ts *TaskSimulator) shouldInjectFailure(pattern *FailurePattern, progress float64, elapsed time.Duration) bool {
	if t := pattern.Timing; t != nil {
		inWindow := progress >= t.MinProgress && progress <= t.MaxProgress
		if !inWindow || elapsed < t.Delay {
			return false
		}
	}
	return rand.Float64() < pattern.Probability
}
// injectFailure applies a failure pattern's effect and tallies it on the
// result. Task-level failures mark the MockTask; worker-level failures flip
// the worker's Status so worker selection subsequently skips it.
func (ts *TaskSimulator) injectFailure(mockTask *MockTask, worker *MockWorker, pattern *FailurePattern, result *SimulationResult) {
	glog.Warningf("Injecting failure: %s for task %s", pattern.Type, mockTask.Task.ID)

	switch pattern.Type {
	case FailureTaskStuck:
		mockTask.Stuck = true
		result.TasksStuck++
	case FailureTaskCrash:
		mockTask.Failed = true
		result.TasksFailed++
	case FailureWorkerTimeout:
		worker.Status = "timeout"
		result.WorkerTimeouts++
	case FailureDuplicate:
		result.DuplicatesFound++
	case FailureResourceExhaust:
		worker.Status = "resource_exhausted"
		result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s resource exhausted", worker.ID))
	case FailureNetworkPartition:
		worker.Status = "partitioned"
		result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s network partitioned", worker.ID))
	}
}
// selectWorkerForTask returns the first active worker with spare capacity that
// advertises the task's type among its capabilities, or nil if none qualifies.
//
// Fix: CurrentTasks is mutated concurrently by task goroutines under
// worker.mutex, so its length (and Status, also written by failure injection)
// must be read under the same lock to avoid a data race.
func (ts *TaskSimulator) selectWorkerForTask(task *types.Task) *MockWorker {
	for _, worker := range ts.mockWorkers {
		worker.mutex.Lock()
		eligible := worker.Status == "active" && len(worker.CurrentTasks) < worker.MaxConcurrent
		worker.mutex.Unlock()
		if !eligible {
			continue
		}
		// Check capabilities (set once at setup; assumed immutable afterwards).
		for _, capability := range worker.Capabilities {
			if capability == task.Type {
				return worker
			}
		}
	}
	return nil
}
// simulateOngoingTasks randomly creates background load: each call has a 30%
// chance of generating one new vacuum or erasure-coding task against a random
// mock volume and dispatching it to an eligible worker.
//
// Fix: guard against a missing or empty mock master — rand.Intn panics when
// its argument is 0, and ts.mockMaster is nil after cleanup().
//
// NOTE(review): tasks started here use context.Background(), so their
// goroutines are not cancelled when the scenario context expires — confirm
// whether the scenario context should be threaded through instead.
func (ts *TaskSimulator) simulateOngoingTasks(result *SimulationResult) {
	if ts.mockMaster == nil || len(ts.mockMaster.volumes) == 0 {
		return // no volumes to target; nothing to simulate
	}
	if rand.Float64() >= 0.3 { // 30% chance to create new task every tick
		return
	}

	taskType := types.TaskTypeVacuum
	if rand.Float64() < 0.5 {
		taskType = types.TaskTypeErasureCoding
	}

	task := &types.Task{
		ID:        fmt.Sprintf("auto_%d", time.Now().UnixNano()),
		Type:      taskType,
		VolumeID:  uint32(rand.Intn(len(ts.mockMaster.volumes)) + 1),
		Priority:  types.TaskPriorityNormal,
		CreatedAt: time.Now(),
	}
	result.TasksCreated++

	if worker := ts.selectWorkerForTask(task); worker != nil {
		ts.executeTaskOnWorker(context.Background(), task, worker, nil, result)
	}
}
// checkForInconsistencies scans every worker's in-flight tasks for signs of
// state drift. It currently flags only tasks that have been stuck for more
// than a minute; volume-reservation, duplicate-task and orphaned-task checks
// would be added in a more comprehensive implementation.
func (ts *TaskSimulator) checkForInconsistencies(result *SimulationResult) {
	const stuckThreshold = 60 * time.Second

	for _, worker := range ts.mockWorkers {
		worker.mutex.Lock()
		for taskID, mockTask := range worker.CurrentTasks {
			if !mockTask.Stuck {
				continue
			}
			if time.Since(mockTask.StartTime) > stuckThreshold {
				result.StateInconsistencies++
				result.Warnings = append(result.Warnings, fmt.Sprintf("Long-running stuck task detected: %s", taskID))
			}
		}
		worker.mutex.Unlock()
	}
}
// cleanup cleans up simulation resources by dropping references to the mock
// workers and master so they can be garbage collected.
//
// NOTE(review): task goroutines started with context.Background() may still
// hold references to individual workers and keep running after this — confirm
// that is acceptable for the simulator's lifetime model.
func (ts *TaskSimulator) cleanup() {
	ts.mockWorkers = nil
	ts.mockMaster = nil
}
// GetSimulationResults returns a snapshot copy of all recorded simulation
// results, keyed by scenario name. The map itself is copied so callers can
// iterate without holding the simulator's lock (the pointed-to results are
// shared).
func (ts *TaskSimulator) GetSimulationResults() map[string]*SimulationResult {
	ts.mutex.RLock()
	defer ts.mutex.RUnlock()

	snapshot := make(map[string]*SimulationResult, len(ts.results))
	for name, res := range ts.results {
		snapshot[name] = res
	}
	return snapshot
}
// CreateStandardScenarios creates a set of standard test scenarios covering
// the main failure modes the task distribution system must handle: worker
// timeouts, stuck tasks, duplicate submissions and master/admin divergence.
func (ts *TaskSimulator) CreateStandardScenarios() {
	// Scenario 1: Worker Timeout During EC
	// Worker 0 is guaranteed (probability 1.0) to time out once a task
	// reaches the 50-60% progress window.
	ts.RegisterScenario(&SimulationScenario{
		Name:        "worker_timeout_during_ec",
		Description: "Test worker timeout during erasure coding operation",
		WorkerCount: 3,
		VolumeCount: 10,
		Duration:    2 * time.Minute,
		FailurePatterns: []*FailurePattern{
			{
				Type:        FailureWorkerTimeout,
				Probability: 1.0,
				Timing: &TimingSpec{
					MinProgress: 50.0,
					MaxProgress: 60.0,
				},
			},
		},
		TestCases: []*TestCase{
			{
				Name:            "ec_timeout_test",
				VolumeID:        1,
				TaskType:        types.TaskTypeErasureCoding,
				ExpectedOutcome: "task_reassigned",
			},
		},
	})

	// Scenario 2: Stuck Vacuum Task
	// The vacuum task is forced to become stuck in the 75-80% window.
	ts.RegisterScenario(&SimulationScenario{
		Name:        "stuck_vacuum_task",
		Description: "Test stuck vacuum task detection and cleanup",
		WorkerCount: 2,
		VolumeCount: 5,
		Duration:    90 * time.Second,
		TestCases: []*TestCase{
			{
				Name:     "vacuum_stuck_test",
				VolumeID: 2,
				TaskType: types.TaskTypeVacuum,
				FailureToInject: &FailurePattern{
					Type:        FailureTaskStuck,
					Probability: 1.0,
					Timing: &TimingSpec{
						MinProgress: 75.0,
						MaxProgress: 80.0,
					},
				},
				ExpectedOutcome: "task_timeout_detected",
			},
		},
	})

	// Scenario 3: Duplicate Task Prevention
	// Two EC tasks target the same volume; the second should be flagged.
	ts.RegisterScenario(&SimulationScenario{
		Name:        "duplicate_task_prevention",
		Description: "Test duplicate task detection and prevention",
		WorkerCount: 4,
		VolumeCount: 8,
		Duration:    60 * time.Second,
		TestCases: []*TestCase{
			{
				Name:     "duplicate_ec_test_1",
				VolumeID: 3,
				TaskType: types.TaskTypeErasureCoding,
			},
			{
				Name:     "duplicate_ec_test_2", // Same volume, should be detected as duplicate
				VolumeID: 3,
				TaskType: types.TaskTypeErasureCoding,
				FailureToInject: &FailurePattern{
					Type:        FailureDuplicate,
					Probability: 1.0,
				},
				ExpectedOutcome: "duplicate_detected",
			},
		},
	})

	// Scenario 4: Master-Admin State Divergence
	// Exercises reconciliation between the master's and admin server's views.
	ts.RegisterScenario(&SimulationScenario{
		Name:        "master_admin_divergence",
		Description: "Test state reconciliation between master and admin server",
		WorkerCount: 3,
		VolumeCount: 15,
		Duration:    2 * time.Minute,
		TestCases: []*TestCase{
			{
				Name:            "state_reconciliation_test",
				VolumeID:        4,
				TaskType:        types.TaskTypeErasureCoding,
				ExpectedOutcome: "state_reconciled",
			},
		},
	})
}
// GenerateSimulationReport creates a comprehensive Markdown report of all
// recorded simulation results.
//
// Improvements: scenarios are emitted in sorted name order so the report is
// deterministic (Go map iteration order is randomized), and the text is
// assembled with a strings.Builder instead of quadratic `+=` concatenation.
func (ts *TaskSimulator) GenerateSimulationReport() string {
	ts.mutex.RLock()
	defer ts.mutex.RUnlock()

	// Sort scenario names for deterministic output.
	names := make([]string, 0, len(ts.results))
	for name := range ts.results {
		names = append(names, name)
	}
	sort.Strings(names)

	var b strings.Builder
	b.WriteString("# Task Distribution System Simulation Report\n\n")
	for _, scenarioName := range names {
		result := ts.results[scenarioName]
		fmt.Fprintf(&b, "## Scenario: %s\n", scenarioName)
		fmt.Fprintf(&b, "- **Duration**: %v\n", result.Duration)
		fmt.Fprintf(&b, "- **Success**: %v\n", result.Success)
		fmt.Fprintf(&b, "- **Tasks Created**: %d\n", result.TasksCreated)
		fmt.Fprintf(&b, "- **Tasks Completed**: %d\n", result.TasksCompleted)
		fmt.Fprintf(&b, "- **Tasks Failed**: %d\n", result.TasksFailed)
		fmt.Fprintf(&b, "- **Tasks Stuck**: %d\n", result.TasksStuck)
		fmt.Fprintf(&b, "- **Worker Timeouts**: %d\n", result.WorkerTimeouts)
		fmt.Fprintf(&b, "- **Duplicates Found**: %d\n", result.DuplicatesFound)
		fmt.Fprintf(&b, "- **State Inconsistencies**: %d\n", result.StateInconsistencies)
		if len(result.Errors) > 0 {
			b.WriteString("- **Errors**:\n")
			for _, msg := range result.Errors {
				fmt.Fprintf(&b, " - %s\n", msg)
			}
		}
		if len(result.Warnings) > 0 {
			b.WriteString("- **Warnings**:\n")
			for _, msg := range result.Warnings {
				fmt.Fprintf(&b, " - %s\n", msg)
			}
		}
		b.WriteString("\n")
	}
	return b.String()
}