package topology

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/sequence"
	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
	"github.com/seaweedfs/seaweedfs/weed/storage/types"
)

// TestRaceConditionStress simulates the original issue scenario:
// high concurrent writes causing capacity misjudgment.
func TestRaceConditionStress(t *testing.T) {
	// Create a cluster similar to the issue description:
	// 3 volume servers, 200GB each, 5GB volume limit = 40 volumes max per server
	const (
		numServers          = 3
		volumeLimitMB       = 5000                                      // 5GB in MB
		storagePerServerGB  = 200                                       // 200GB per server
		maxVolumesPerServer = storagePerServerGB * 1024 / volumeLimitMB // 200*1024/5000 = 40
		concurrentRequests  = 50                                        // high concurrency, as in the issue
	)

	// Create the test topology
	topo := NewTopology("weedfs", sequence.NewMemorySequencer(), uint64(volumeLimitMB)*1024*1024, 5, false)

	dc := NewDataCenter("dc1")
	topo.LinkChildNode(dc)
	rack := NewRack("rack1")
	dc.LinkChildNode(rack)

	// Create 3 volume servers with realistic capacity
	servers := make([]*DataNode, numServers)
	for i := 0; i < numServers; i++ {
		dn := NewDataNode(fmt.Sprintf("server%d", i+1))
		rack.LinkChildNode(dn)

		// Set up a disk with capacity for 40 volumes
		disk := NewDisk(types.HardDriveType.String())
		disk.diskUsages.getOrCreateDisk(types.HardDriveType).maxVolumeCount = maxVolumesPerServer
		dn.LinkChildNode(disk)

		servers[i] = dn
	}

	vg := NewDefaultVolumeGrowth()
	rp, _ := super_block.NewReplicaPlacementFromString("000") // single replica, as in the issue

	option := &VolumeGrowOption{
		Collection:       "test-bucket-large", // same collection name as the issue
		ReplicaPlacement: rp,
		DiskType:         types.HardDriveType,
	}

	// Track results
	var successfulAllocations int64
	var failedAllocations int64
	var totalVolumesCreated int64

	var wg sync.WaitGroup

	// Launch concurrent volume creation requests
	startTime := time.Now()
	for i := 0; i < concurrentRequests; i++ {
		wg.Add(1)
		go func(requestId int) {
			defer wg.Done()

			// This is the critical part: multiple goroutines trying to allocate
			// simultaneously. Note that this shadows the outer `servers` slice.
			servers, reservation, err := vg.findEmptySlotsForOneVolume(topo, option, true)

			if err != nil {
				atomic.AddInt64(&failedAllocations, 1)
				t.Logf("Request %d failed: %v", requestId, err)
				return
			}

			// Simulate volume creation delay (as in a real scenario)
			time.Sleep(time.Millisecond * 50)

			// Simulate successful volume creation
			for _, server := range servers {
				disk := server.children[NodeId(types.HardDriveType.String())].(*Disk)
				deltaDiskUsage := &DiskUsageCounts{
					volumeCount: 1,
				}
				disk.UpAdjustDiskUsageDelta(types.HardDriveType, deltaDiskUsage)
				atomic.AddInt64(&totalVolumesCreated, 1)
			}

			// Release reservations (simulates successful registration)
			reservation.releaseAllReservations()
			atomic.AddInt64(&successfulAllocations, 1)
		}(i)
	}

	wg.Wait()
	duration := time.Since(startTime)

	// Report results
	t.Logf("Test completed in %v", duration)
	t.Logf("Successful allocations: %d", successfulAllocations)
	t.Logf("Failed allocations: %d", failedAllocations)
	t.Logf("Total volumes created: %d", totalVolumesCreated)

	// Check that capacity limits are respected
	totalCapacityUsed := int64(0)
	for i, server := range servers {
		disk := server.children[NodeId(types.HardDriveType.String())].(*Disk)
		volumeCount := disk.diskUsages.getOrCreateDisk(types.HardDriveType).volumeCount
		totalCapacityUsed += volumeCount

		t.Logf("Server %d: %d volumes (max: %d)", i+1, volumeCount, maxVolumesPerServer)

		// Critical check: no server should exceed its capacity
		if volumeCount > maxVolumesPerServer {
			t.Errorf("RACE CONDITION DETECTED: Server %d exceeded capacity: %d > %d",
				i+1, volumeCount, maxVolumesPerServer)
		}
	}

	// Verify that the totals make sense
	if totalVolumesCreated != totalCapacityUsed {
		t.Errorf("Volume count mismatch: created=%d, actual=%d", totalVolumesCreated, totalCapacityUsed)
	}

	// The total should never exceed the cluster capacity (120 volumes for 3 servers × 40 each)
	maxClusterCapacity := int64(numServers * maxVolumesPerServer)
	if totalCapacityUsed > maxClusterCapacity {
		t.Errorf("RACE CONDITION DETECTED: Cluster capacity exceeded: %d > %d",
			totalCapacityUsed, maxClusterCapacity)
	}

	// With reservations, allocation should be fully accounted for:
	// successful + failed requests should equal concurrentRequests
	if successfulAllocations+failedAllocations != concurrentRequests {
		t.Errorf("Request count mismatch: success=%d + failed=%d != total=%d",
			successfulAllocations, failedAllocations, concurrentRequests)
	}

	t.Logf("Race condition test passed: Capacity limits respected with %d concurrent requests",
		concurrentRequests)
}

// TestCapacityJudgmentAccuracy verifies that the capacity calculation is accurate
// under various load conditions.
func TestCapacityJudgmentAccuracy(t *testing.T) {
	// Create a single server with a known capacity
	topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 5*1024*1024*1024, 5, false)

	dc := NewDataCenter("dc1")
	topo.LinkChildNode(dc)
	rack := NewRack("rack1")
	dc.LinkChildNode(rack)

	dn := NewDataNode("server1")
	rack.LinkChildNode(dn)

	// Server with capacity for exactly 10 volumes
	disk := NewDisk(types.HardDriveType.String())
	diskUsage := disk.diskUsages.getOrCreateDisk(types.HardDriveType)
	diskUsage.maxVolumeCount = 10
	dn.LinkChildNode(disk)

	// Also set the max volume count at the DataNode level (this normally gets propagated up)
	dn.diskUsages.getOrCreateDisk(types.HardDriveType).maxVolumeCount = 10

	vg := NewDefaultVolumeGrowth()
	rp, _ := super_block.NewReplicaPlacementFromString("000")

	option := &VolumeGrowOption{
		Collection:       "test",
		ReplicaPlacement: rp,
		DiskType:         types.HardDriveType,
	}

	// Test accurate capacity reporting at each step
	for i := 0; i < 10; i++ {
		// Check available space before reservation
		availableBefore := dn.AvailableSpaceFor(option)
		availableForReservation := dn.AvailableSpaceForReservation(option)

		expectedAvailable := int64(10 - i)
		if availableBefore != expectedAvailable {
			t.Errorf("Step %d: Expected %d available, got %d", i, expectedAvailable, availableBefore)
		}

		if availableForReservation != expectedAvailable {
			t.Errorf("Step %d: Expected %d available for reservation, got %d", i, expectedAvailable, availableForReservation)
		}

		// Try to reserve and allocate
		_, reservation, err := vg.findEmptySlotsForOneVolume(topo, option, true)
		if err != nil {
			t.Fatalf("Step %d: Unexpected reservation failure: %v", i, err)
		}

		// Check that the space available for reservation decreased
		availableAfterReservation := dn.AvailableSpaceForReservation(option)
		if availableAfterReservation != expectedAvailable-1 {
			t.Errorf("Step %d: Expected %d available after reservation, got %d",
				i, expectedAvailable-1, availableAfterReservation)
		}

		// Simulate successful volume creation by properly updating the disk usage hierarchy
		disk := dn.children[NodeId(types.HardDriveType.String())].(*Disk)

		// Create a volume usage delta to simulate volume creation
		deltaDiskUsage := &DiskUsageCounts{
			volumeCount: 1,
		}

		// Properly propagate the usage up the hierarchy
		disk.UpAdjustDiskUsageDelta(types.HardDriveType, deltaDiskUsage)

		// Debug: check the volume count after the update
		diskUsageOnNode := dn.diskUsages.getOrCreateDisk(types.HardDriveType)
		currentVolumeCount := atomic.LoadInt64(&diskUsageOnNode.volumeCount)
		t.Logf("Step %d: Volume count after update: %d", i, currentVolumeCount)

		// Release the reservation
		reservation.releaseAllReservations()

		// Verify the final state
		availableAfter := dn.AvailableSpaceFor(option)
		expectedAfter := int64(10 - i - 1)
		if availableAfter != expectedAfter {
			t.Errorf("Step %d: Expected %d available after creation, got %d",
				i, expectedAfter, availableAfter)
			// More debugging
			diskUsageOnNode := dn.diskUsages.getOrCreateDisk(types.HardDriveType)
			maxVolumes := atomic.LoadInt64(&diskUsageOnNode.maxVolumeCount)
			remoteVolumes := atomic.LoadInt64(&diskUsageOnNode.remoteVolumeCount)
			actualVolumeCount := atomic.LoadInt64(&diskUsageOnNode.volumeCount)
			t.Logf("Debug Step %d: max=%d, volume=%d, remote=%d", i, maxVolumes, actualVolumeCount, remoteVolumes)
		}
	}

	// At this point, no further reservations should succeed
	_, _, err := vg.findEmptySlotsForOneVolume(topo, option, true)
	if err == nil {
		t.Error("Expected reservation to fail when at capacity")
	}

	t.Logf("Capacity judgment accuracy test passed")
}

// TestReservationSystemPerformance measures the performance impact of reservations.
func TestReservationSystemPerformance(t *testing.T) {
	// Create topology
	topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)

	dc := NewDataCenter("dc1")
	topo.LinkChildNode(dc)
	rack := NewRack("rack1")
	dc.LinkChildNode(rack)

	dn := NewDataNode("server1")
	rack.LinkChildNode(dn)

	disk := NewDisk(types.HardDriveType.String())
	disk.diskUsages.getOrCreateDisk(types.HardDriveType).maxVolumeCount = 1000
	dn.LinkChildNode(disk)

	vg := NewDefaultVolumeGrowth()
	rp, _ := super_block.NewReplicaPlacementFromString("000")

	option := &VolumeGrowOption{
		Collection:       "test",
		ReplicaPlacement: rp,
		DiskType:         types.HardDriveType,
	}

	// Benchmark reservation operations
	const iterations = 1000

	startTime := time.Now()
	for i := 0; i < iterations; i++ {
		_, reservation, err := vg.findEmptySlotsForOneVolume(topo, option, true)
		if err != nil {
			t.Fatalf("Iteration %d failed: %v", i, err)
		}
		reservation.releaseAllReservations()

		// Simulate volume creation
		diskUsage := dn.diskUsages.getOrCreateDisk(types.HardDriveType)
		atomic.AddInt64(&diskUsage.volumeCount, 1)
	}
	duration := time.Since(startTime)

	avgDuration := duration / iterations
	t.Logf("Performance: %d reservations in %v (avg: %v per reservation)",
		iterations, duration, avgDuration)

	// Performance should be reasonable (less than 1ms per reservation on average)
	if avgDuration > time.Millisecond {
		t.Errorf("Reservation system performance concern: %v per reservation", avgDuration)
	} else {
		t.Logf("Performance test passed: %v per reservation", avgDuration)
	}
}
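
// Note: these tests mutate shared topology state from multiple goroutines, so
// they are most useful when run under the race detector. Assuming the standard
// SeaweedFS repository layout, an invocation would look something like:
//
//	go test -race -run 'TestRaceConditionStress|TestCapacityJudgmentAccuracy|TestReservationSystemPerformance' ./weed/topology/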