package task

import (
	"context"
	"fmt"
	"math/rand"
	"sync"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// TaskSimulator provides a comprehensive simulation framework for testing the task distribution system
type TaskSimulator struct {
	adminServer *AdminServer
	mockWorkers []*MockWorker
	mockMaster  *MockMasterClient
	scenarios   map[string]*SimulationScenario
	results     map[string]*SimulationResult
	mutex       sync.RWMutex
}

// SimulationScenario defines a test scenario
type SimulationScenario struct {
	Name            string
	Description     string
	WorkerCount     int
	VolumeCount     int
	Duration        time.Duration
	FailurePatterns []*FailurePattern
	TestCases       []*TestCase
}

// FailurePattern defines how failures occur during simulation
type FailurePattern struct {
	Type        FailureType
	Probability float64     // 0.0 to 1.0, evaluated once per progress tick
	Timing      *TimingSpec // When during task execution the failure may fire
	Duration    time.Duration
	Details     string
}

// TestCase defines a single test case within a scenario
type TestCase struct {
	Name            string
	VolumeID        uint32
	TaskType        types.TaskType
	ExpectedOutcome string
	FailureToInject *FailurePattern
}

// FailureType represents different types of failures
type FailureType string

const (
	FailureWorkerTimeout    FailureType = "worker_timeout"
	FailureTaskStuck        FailureType = "task_stuck"
	FailureTaskCrash        FailureType = "task_crash"
	FailureDuplicate        FailureType = "duplicate_task"
	FailureResourceExhaust  FailureType = "resource_exhaustion"
	FailureNetworkPartition FailureType = "network_partition"
)

// TimingSpec defines when a failure occurs
type TimingSpec struct {
	MinProgress float64       // Minimum progress (percent) before the failure can occur
	MaxProgress float64       // Maximum progress (percent) after which the failure no longer fires
	Delay       time.Duration // Fixed delay before the failure becomes eligible
}
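
// exampleStuckPattern is an illustrative, hypothetical value added for
// documentation; it is not referenced by the simulator itself. It shows how
// Probability and TimingSpec compose: once a task is between 40% and 60%
// complete and at least 5 seconds old, each one-second progress tick has a
// 10% chance of marking it stuck.
var exampleStuckPattern = &FailurePattern{
	Type:        FailureTaskStuck,
	Probability: 0.1,
	Timing: &TimingSpec{
		MinProgress: 40.0,
		MaxProgress: 60.0,
		Delay:       5 * time.Second,
	},
	Details: "illustrative pattern for documentation",
}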

// SimulationResult tracks the results of a simulation
type SimulationResult struct {
	ScenarioName         string
	StartTime            time.Time
	EndTime              time.Time
	Duration             time.Duration
	TasksCreated         int
	TasksCompleted       int
	TasksFailed          int
	TasksStuck           int
	WorkerTimeouts       int
	DuplicatesFound      int
	StateInconsistencies int
	Errors               []string
	Warnings             []string
	Success              bool

	// mu guards the counters and message slices above, which worker
	// goroutines update concurrently while a scenario is running.
	mu sync.Mutex
}

// MockWorker simulates a worker with controllable behavior
type MockWorker struct {
	ID            string
	Capabilities  []types.TaskType
	MaxConcurrent int
	CurrentTasks  map[string]*MockTask
	Status        string
	FailureMode   *FailurePattern
	mutex         sync.Mutex
}

// MockTask represents a simulated task execution
type MockTask struct {
	Task      *types.Task
	StartTime time.Time
	Progress  float64
	Stuck     bool
	Failed    bool
	Completed bool
}

// MockMasterClient simulates master server interactions
type MockMasterClient struct {
	volumes       map[uint32]*VolumeInfo
	inconsistency bool // reserved for simulating master/admin state divergence; not yet toggled here
	mutex         sync.RWMutex
}

// NewTaskSimulator creates a new task simulator
func NewTaskSimulator() *TaskSimulator {
	return &TaskSimulator{
		scenarios: make(map[string]*SimulationScenario),
		results:   make(map[string]*SimulationResult),
	}
}

// RegisterScenario registers a simulation scenario
func (ts *TaskSimulator) RegisterScenario(scenario *SimulationScenario) {
	ts.mutex.Lock()
	defer ts.mutex.Unlock()

	ts.scenarios[scenario.Name] = scenario
	glog.Infof("Registered simulation scenario: %s", scenario.Name)
}

// RunScenario executes a simulation scenario
func (ts *TaskSimulator) RunScenario(scenarioName string) (*SimulationResult, error) {
	ts.mutex.RLock()
	scenario, exists := ts.scenarios[scenarioName]
	ts.mutex.RUnlock()

	if !exists {
		return nil, fmt.Errorf("scenario %s not found", scenarioName)
	}

	glog.Infof("Starting simulation scenario: %s", scenarioName)

	result := &SimulationResult{
		ScenarioName: scenarioName,
		StartTime:    time.Now(),
		Errors:       make([]string, 0),
		Warnings:     make([]string, 0),
	}

	// Set up the simulation environment
	if err := ts.setupEnvironment(scenario); err != nil {
		return nil, fmt.Errorf("failed to set up environment: %v", err)
	}

	// Execute test cases, bounded by the scenario duration
	ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration)
	defer cancel()

	ts.executeScenario(ctx, scenario, result)

	// Clean up simulation resources
	ts.cleanup()

	result.mu.Lock()
	result.EndTime = time.Now()
	result.Duration = result.EndTime.Sub(result.StartTime)
	result.Success = len(result.Errors) == 0
	success := result.Success
	result.mu.Unlock()

	ts.mutex.Lock()
	ts.results[scenarioName] = result
	ts.mutex.Unlock()

	glog.Infof("Completed simulation scenario: %s (success: %v)", scenarioName, success)
	return result, nil
}

// setupEnvironment prepares the simulation environment
func (ts *TaskSimulator) setupEnvironment(scenario *SimulationScenario) error {
	// Create the mock master client
	ts.mockMaster = &MockMasterClient{
		volumes: make(map[uint32]*VolumeInfo),
	}

	// Generate mock volumes
	for i := uint32(1); i <= uint32(scenario.VolumeCount); i++ {
		volume := &VolumeInfo{
			ID:               i,
			Size:             uint64(rand.Int63n(30 * 1024 * 1024 * 1024)), // Random size up to 30GB; Int63n avoids int overflow on 32-bit platforms
			Collection:       fmt.Sprintf("collection_%d", (i%3)+1),
			DeletedByteCount: uint64(rand.Intn(1024 * 1024 * 1024)), // Random garbage up to 1GB
			ReadOnly:         false,
			Server:           fmt.Sprintf("server_%d", (i%6)+1),
			ModifiedAtSecond: time.Now().Add(-time.Duration(rand.Intn(86400)) * time.Second).Unix(),
		}
		ts.mockMaster.volumes[i] = volume
	}

	// Create mock workers
	ts.mockWorkers = make([]*MockWorker, scenario.WorkerCount)
	for i := 0; i < scenario.WorkerCount; i++ {
		worker := &MockWorker{
			ID:            fmt.Sprintf("worker_%d", i+1),
			Capabilities:  []types.TaskType{types.TaskTypeErasureCoding, types.TaskTypeVacuum},
			MaxConcurrent: 2,
			CurrentTasks:  make(map[string]*MockTask),
			Status:        "active",
		}

		// Apply failure patterns to the first len(FailurePatterns) workers
		if i < len(scenario.FailurePatterns) {
			worker.FailureMode = scenario.FailurePatterns[i]
		}

		ts.mockWorkers[i] = worker
	}

	// Initialize the admin server (simplified for simulation)
	config := DefaultAdminConfig()
	config.ScanInterval = 10 * time.Second
	config.TaskTimeout = 30 * time.Second

	// Note: In a real implementation, this would use the actual master client.
	// For simulation, we'd need to inject our mock.

	return nil
}

// executeScenario runs the actual simulation scenario
func (ts *TaskSimulator) executeScenario(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
	// Execute each test case
	for _, testCase := range scenario.TestCases {
		ts.executeTestCase(ctx, testCase, result)
	}

	// Run continuous simulation for the remaining duration
	ts.runContinuousSimulation(ctx, scenario, result)
}

// executeTestCase runs a specific test case
func (ts *TaskSimulator) executeTestCase(ctx context.Context, testCase *TestCase, result *SimulationResult) {
	glog.V(1).Infof("Executing test case: %s", testCase.Name)

	// Create a task for the test case
	task := &types.Task{
		ID:        fmt.Sprintf("test_%s_%d", testCase.Name, time.Now().UnixNano()),
		Type:      testCase.TaskType,
		VolumeID:  testCase.VolumeID,
		Priority:  types.TaskPriorityNormal,
		CreatedAt: time.Now(),
	}

	result.mu.Lock()
	result.TasksCreated++
	result.mu.Unlock()

	// Assign the task to a worker
	worker := ts.selectWorkerForTask(task)
	if worker == nil {
		result.mu.Lock()
		result.Errors = append(result.Errors, fmt.Sprintf("No available worker for test case %s", testCase.Name))
		result.mu.Unlock()
		return
	}

	// Execute the task with potential failure injection
	ts.executeTaskOnWorker(ctx, task, worker, testCase.FailureToInject, result)
}

// runContinuousSimulation keeps generating and checking tasks until the scenario context expires
func (ts *TaskSimulator) runContinuousSimulation(ctx context.Context, scenario *SimulationScenario, result *SimulationResult) {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			ts.simulateOngoingTasks(result)
			ts.checkForInconsistencies(result)
		}
	}
}

// executeTaskOnWorker registers a task on a worker and starts its simulated execution
func (ts *TaskSimulator) executeTaskOnWorker(ctx context.Context, task *types.Task, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
	worker.mutex.Lock()
	defer worker.mutex.Unlock()

	mockTask := &MockTask{
		Task:      task,
		StartTime: time.Now(),
		Progress:  0.0,
	}

	worker.CurrentTasks[task.ID] = mockTask

	// Simulate task execution asynchronously
	go ts.simulateTaskExecution(ctx, mockTask, worker, failurePattern, result)
}

// simulateTaskExecution simulates the execution of a single task
func (ts *TaskSimulator) simulateTaskExecution(ctx context.Context, mockTask *MockTask, worker *MockWorker, failurePattern *FailurePattern, result *SimulationResult) {
	defer func() {
		worker.mutex.Lock()
		delete(worker.CurrentTasks, mockTask.Task.ID)
		worker.mutex.Unlock()
	}()

	duration := 20 * time.Second // Base task duration
	progressTicker := time.NewTicker(time.Second)
	defer progressTicker.Stop()

	startTime := time.Now()

	for {
		select {
		case <-ctx.Done():
			return
		case <-progressTicker.C:
			elapsed := time.Since(startTime)
			progress := float64(elapsed) / float64(duration) * 100.0

			if progress >= 100.0 {
				worker.mutex.Lock()
				mockTask.Completed = true
				worker.mutex.Unlock()
				result.mu.Lock()
				result.TasksCompleted++
				result.mu.Unlock()
				glog.V(2).Infof("Task %s completed successfully", mockTask.Task.ID)
				return
			}

			worker.mutex.Lock()
			mockTask.Progress = progress
			worker.mutex.Unlock()

			// Check for failure injection: the test case's pattern first, then
			// the worker's own failure mode. (FailureMode is set only during
			// setup, so reading it unlocked here is safe.)
			for _, pattern := range []*FailurePattern{failurePattern, worker.FailureMode} {
				if pattern != nil && ts.shouldInjectFailure(pattern, progress, elapsed) {
					ts.injectFailure(mockTask, worker, pattern, result)
					if pattern.Type == FailureTaskStuck {
						// A stuck task stays registered on the worker until the
						// scenario ends, so checkForInconsistencies can detect
						// it; the deferred delete only runs once the context
						// expires.
						<-ctx.Done()
					}
					return
				}
			}
		}
	}
}

// shouldInjectFailure determines if a failure should be injected
func (ts *TaskSimulator) shouldInjectFailure(pattern *FailurePattern, progress float64, elapsed time.Duration) bool {
	if pattern.Timing != nil {
		if progress < pattern.Timing.MinProgress || progress > pattern.Timing.MaxProgress {
			return false
		}
		if elapsed < pattern.Timing.Delay {
			return false
		}
	}

	return rand.Float64() < pattern.Probability
}
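
// Note: Probability is evaluated independently on each one-second progress
// tick while the TimingSpec window is open, so the effective chance of a
// failure firing over an n-tick window is 1-(1-p)^n. For example, p = 0.1
// over a 10-tick window fires with probability ≈ 0.65; use p = 1.0 for a
// deterministic failure at the first eligible tick.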

// injectFailure simulates a failure of the given type
func (ts *TaskSimulator) injectFailure(mockTask *MockTask, worker *MockWorker, pattern *FailurePattern, result *SimulationResult) {
	glog.Warningf("Injecting failure: %s for task %s", pattern.Type, mockTask.Task.ID)

	// Lock ordering: worker.mutex before result.mu, matching checkForInconsistencies
	worker.mutex.Lock()
	defer worker.mutex.Unlock()
	result.mu.Lock()
	defer result.mu.Unlock()

	switch pattern.Type {
	case FailureWorkerTimeout:
		worker.Status = "timeout"
		result.WorkerTimeouts++

	case FailureTaskStuck:
		mockTask.Stuck = true
		result.TasksStuck++

	case FailureTaskCrash:
		mockTask.Failed = true
		result.TasksFailed++

	case FailureDuplicate:
		result.DuplicatesFound++

	case FailureResourceExhaust:
		worker.Status = "resource_exhausted"
		result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s resource exhausted", worker.ID))

	case FailureNetworkPartition:
		worker.Status = "partitioned"
		result.Warnings = append(result.Warnings, fmt.Sprintf("Worker %s network partitioned", worker.ID))
	}
}

// selectWorkerForTask selects an available worker for a task
func (ts *TaskSimulator) selectWorkerForTask(task *types.Task) *MockWorker {
	for _, worker := range ts.mockWorkers {
		worker.mutex.Lock()
		available := worker.Status == "active" && len(worker.CurrentTasks) < worker.MaxConcurrent
		worker.mutex.Unlock()
		if !available {
			continue
		}
		// Check capabilities
		for _, capability := range worker.Capabilities {
			if capability == task.Type {
				return worker
			}
		}
	}
	return nil
}

// simulateOngoingTasks randomly creates new background tasks
func (ts *TaskSimulator) simulateOngoingTasks(result *SimulationResult) {
	// 30% chance to create a new task on every tick
	if rand.Float64() < 0.3 {
		taskType := types.TaskTypeVacuum
		if rand.Float64() < 0.5 {
			taskType = types.TaskTypeErasureCoding
		}

		ts.mockMaster.mutex.RLock()
		volumeCount := len(ts.mockMaster.volumes)
		ts.mockMaster.mutex.RUnlock()
		if volumeCount == 0 {
			return // avoid rand.Intn(0), which panics
		}

		task := &types.Task{
			ID:        fmt.Sprintf("auto_%d", time.Now().UnixNano()),
			Type:      taskType,
			VolumeID:  uint32(rand.Intn(volumeCount) + 1),
			Priority:  types.TaskPriorityNormal,
			CreatedAt: time.Now(),
		}

		result.mu.Lock()
		result.TasksCreated++
		result.mu.Unlock()

		worker := ts.selectWorkerForTask(task)
		if worker != nil {
			ts.executeTaskOnWorker(context.Background(), task, worker, nil, result)
		}
	}
}

// checkForInconsistencies checks for state inconsistencies
func (ts *TaskSimulator) checkForInconsistencies(result *SimulationResult) {
	// Check for volume reservation inconsistencies, duplicate tasks, and
	// orphaned tasks. This would be more comprehensive in a real
	// implementation; here we only look for long-running stuck tasks.
	for _, worker := range ts.mockWorkers {
		worker.mutex.Lock()
		for taskID, mockTask := range worker.CurrentTasks {
			if mockTask.Stuck && time.Since(mockTask.StartTime) > 60*time.Second {
				result.mu.Lock()
				result.StateInconsistencies++
				result.Warnings = append(result.Warnings, fmt.Sprintf("Long-running stuck task detected: %s", taskID))
				result.mu.Unlock()
			}
		}
		worker.mutex.Unlock()
	}
}

// cleanup cleans up simulation resources
func (ts *TaskSimulator) cleanup() {
	ts.mockWorkers = nil
	ts.mockMaster = nil
}

// GetSimulationResults returns all simulation results
func (ts *TaskSimulator) GetSimulationResults() map[string]*SimulationResult {
	ts.mutex.RLock()
	defer ts.mutex.RUnlock()

	results := make(map[string]*SimulationResult)
	for k, v := range ts.results {
		results[k] = v
	}
	return results
}

// CreateStandardScenarios creates a set of standard test scenarios
func (ts *TaskSimulator) CreateStandardScenarios() {
	// Scenario 1: Worker timeout during erasure coding
	ts.RegisterScenario(&SimulationScenario{
		Name:        "worker_timeout_during_ec",
		Description: "Test worker timeout during erasure coding operation",
		WorkerCount: 3,
		VolumeCount: 10,
		Duration:    2 * time.Minute,
		FailurePatterns: []*FailurePattern{
			{
				Type:        FailureWorkerTimeout,
				Probability: 1.0,
				Timing: &TimingSpec{
					MinProgress: 50.0,
					MaxProgress: 60.0,
				},
			},
		},
		TestCases: []*TestCase{
			{
				Name:            "ec_timeout_test",
				VolumeID:        1,
				TaskType:        types.TaskTypeErasureCoding,
				ExpectedOutcome: "task_reassigned",
			},
		},
	})

	// Scenario 2: Stuck vacuum task
	ts.RegisterScenario(&SimulationScenario{
		Name:        "stuck_vacuum_task",
		Description: "Test stuck vacuum task detection and cleanup",
		WorkerCount: 2,
		VolumeCount: 5,
		Duration:    90 * time.Second,
		TestCases: []*TestCase{
			{
				Name:     "vacuum_stuck_test",
				VolumeID: 2,
				TaskType: types.TaskTypeVacuum,
				FailureToInject: &FailurePattern{
					Type:        FailureTaskStuck,
					Probability: 1.0,
					Timing: &TimingSpec{
						MinProgress: 75.0,
						MaxProgress: 80.0,
					},
				},
				ExpectedOutcome: "task_timeout_detected",
			},
		},
	})

	// Scenario 3: Duplicate task prevention
	ts.RegisterScenario(&SimulationScenario{
		Name:        "duplicate_task_prevention",
		Description: "Test duplicate task detection and prevention",
		WorkerCount: 4,
		VolumeCount: 8,
		Duration:    60 * time.Second,
		TestCases: []*TestCase{
			{
				Name:     "duplicate_ec_test_1",
				VolumeID: 3,
				TaskType: types.TaskTypeErasureCoding,
			},
			{
				Name:     "duplicate_ec_test_2", // Same volume, should be detected as a duplicate
				VolumeID: 3,
				TaskType: types.TaskTypeErasureCoding,
				FailureToInject: &FailurePattern{
					Type:        FailureDuplicate,
					Probability: 1.0,
				},
				ExpectedOutcome: "duplicate_detected",
			},
		},
	})

	// Scenario 4: Master-admin state divergence
	ts.RegisterScenario(&SimulationScenario{
		Name:        "master_admin_divergence",
		Description: "Test state reconciliation between master and admin server",
		WorkerCount: 3,
		VolumeCount: 15,
		Duration:    2 * time.Minute,
		TestCases: []*TestCase{
			{
				Name:            "state_reconciliation_test",
				VolumeID:        4,
				TaskType:        types.TaskTypeErasureCoding,
				ExpectedOutcome: "state_reconciled",
			},
		},
	})
}

// GenerateSimulationReport creates a comprehensive report of simulation results
func (ts *TaskSimulator) GenerateSimulationReport() string {
	ts.mutex.RLock()
	defer ts.mutex.RUnlock()

	report := "# Task Distribution System Simulation Report\n\n"

	for scenarioName, result := range ts.results {
		report += fmt.Sprintf("## Scenario: %s\n", scenarioName)
		report += fmt.Sprintf("- **Duration**: %v\n", result.Duration)
		report += fmt.Sprintf("- **Success**: %v\n", result.Success)
		report += fmt.Sprintf("- **Tasks Created**: %d\n", result.TasksCreated)
		report += fmt.Sprintf("- **Tasks Completed**: %d\n", result.TasksCompleted)
		report += fmt.Sprintf("- **Tasks Failed**: %d\n", result.TasksFailed)
		report += fmt.Sprintf("- **Tasks Stuck**: %d\n", result.TasksStuck)
		report += fmt.Sprintf("- **Worker Timeouts**: %d\n", result.WorkerTimeouts)
		report += fmt.Sprintf("- **Duplicates Found**: %d\n", result.DuplicatesFound)
		report += fmt.Sprintf("- **State Inconsistencies**: %d\n", result.StateInconsistencies)

		if len(result.Errors) > 0 {
			report += "- **Errors**:\n"
			for _, err := range result.Errors {
				report += fmt.Sprintf("  - %s\n", err)
			}
		}

		if len(result.Warnings) > 0 {
			report += "- **Warnings**:\n"
			for _, warning := range result.Warnings {
				report += fmt.Sprintf("  - %s\n", warning)
			}
		}

		report += "\n"
	}

	return report
}
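
// runStandardScenariosExample is an illustrative usage sketch rather than
// part of the original simulator (the function name is hypothetical): it
// registers the standard scenarios, runs each one, and logs the combined
// report. Scenario names are copied out first so the simulator's lock is not
// held across RunScenario calls.
func runStandardScenariosExample() {
	ts := NewTaskSimulator()
	ts.CreateStandardScenarios()

	ts.mutex.RLock()
	names := make([]string, 0, len(ts.scenarios))
	for name := range ts.scenarios {
		names = append(names, name)
	}
	ts.mutex.RUnlock()

	for _, name := range names {
		if _, err := ts.RunScenario(name); err != nil {
			glog.Errorf("scenario %s: %v", name, err)
		}
	}

	glog.Info(ts.GenerateSimulationReport())
}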