From e60b2117c3bf36f9a5ff4a1cf1b9216124546f8b Mon Sep 17 00:00:00 2001
From: Chris Lu
Date: Fri, 11 Sep 2020 00:29:25 -0700
Subject: [PATCH] shell: volume balance follows replica placement

---
 weed/shell/command_volume_balance.go         |  63 +++++++--
 weed/shell/command_volume_balance_test.go    | 139 +++++++++++++++++++
 weed/shell/command_volume_fix_replication.go |  29 ++--
 3 files changed, 207 insertions(+), 24 deletions(-)
 create mode 100644 weed/shell/command_volume_balance_test.go

diff --git a/weed/shell/command_volume_balance.go b/weed/shell/command_volume_balance.go
index 005236806..4f96c3e08 100644
--- a/weed/shell/command_volume_balance.go
+++ b/weed/shell/command_volume_balance.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"flag"
 	"fmt"
+	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
 	"io"
 	"os"
 	"sort"
@@ -83,6 +84,7 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
 	}
 
 	typeToNodes := collectVolumeServersByType(resp.TopologyInfo, *dc)
+	volumeReplicas, _ := collectVolumeReplicaLocations(resp)
 
 	for maxVolumeCount, volumeServers := range typeToNodes {
 		if len(volumeServers) < 2 {
@@ -95,16 +97,16 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
 			return err
 		}
 		for _, c := range collections {
-			if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, c, *applyBalancing); err != nil {
+			if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, c, *applyBalancing); err != nil {
 				return err
 			}
 		}
 	} else if *collection == "ALL_COLLECTIONS" {
-		if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, "ALL_COLLECTIONS", *applyBalancing); err != nil {
+		if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, "ALL_COLLECTIONS", *applyBalancing); err != nil {
 			return err
 		}
 	} else {
-		if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, *collection, *applyBalancing); err != nil {
+		if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, *collection, *applyBalancing); err != nil {
 			return err
 		}
 	}
@@ -113,7 +115,7 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
 	return nil
 }
 
-func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit uint64, collection string, applyBalancing bool) error {
+func balanceVolumeServers(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, volumeSizeLimit uint64, collection string, applyBalancing bool) error {
 
 	// balance writable volumes
 	for _, n := range nodes {
@@ -126,7 +128,7 @@ func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit
 			return !v.ReadOnly && v.Size < volumeSizeLimit
 		})
 	}
-	if err := balanceSelectedVolume(commandEnv, nodes, sortWritableVolumes, applyBalancing); err != nil {
+	if err := balanceSelectedVolume(commandEnv, volumeReplicas, nodes, sortWritableVolumes, applyBalancing); err != nil {
 		return err
 	}
 
@@ -141,7 +143,7 @@ func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit
 			return v.ReadOnly || v.Size >= volumeSizeLimit
 		})
 	}
-	if err := balanceSelectedVolume(commandEnv, nodes, sortReadOnlyVolumes, applyBalancing); err != nil {
+	if err := balanceSelectedVolume(commandEnv, volumeReplicas, nodes, sortReadOnlyVolumes, applyBalancing); err != nil {
 		return err
 	}
 
@@ -186,7 +188,7 @@ func sortReadOnlyVolumes(volumes []*master_pb.VolumeInformationMessage) {
 	})
 }
 
-func balanceSelectedVolume(commandEnv *CommandEnv, nodes []*Node, sortCandidatesFn func(volumes []*master_pb.VolumeInformationMessage), applyBalancing bool) (err error) {
+func balanceSelectedVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, sortCandidatesFn func(volumes []*master_pb.VolumeInformationMessage), applyBalancing bool) (err error) {
 	selectedVolumeCount := 0
 	for _, dn := range nodes {
 		selectedVolumeCount += len(dn.selectedVolumes)
@@ -216,7 +218,7 @@ func balanceSelectedVolume(commandEnv *CommandEnv, nodes []*Node, sortCandidates
 			// no more volume servers with empty slots
 			break
 		}
-		hasMoved, err = attemptToMoveOneVolume(commandEnv, fullNode, candidateVolumes, emptyNode, applyBalancing)
+		hasMoved, err = attemptToMoveOneVolume(commandEnv, volumeReplicas, fullNode, candidateVolumes, emptyNode, applyBalancing)
 		if err != nil {
 			return
 		}
@@ -229,13 +231,12 @@ func balanceSelectedVolume(commandEnv *CommandEnv, nodes []*Node, sortCandidates
 	return nil
 }
 
-func attemptToMoveOneVolume(commandEnv *CommandEnv, fullNode *Node, candidateVolumes []*master_pb.VolumeInformationMessage, emptyNode *Node, applyBalancing bool) (hasMoved bool, err error) {
+func attemptToMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolumes []*master_pb.VolumeInformationMessage, emptyNode *Node, applyBalancing bool) (hasMoved bool, err error) {
 
 	for _, v := range candidateVolumes {
 		if v.ReplicaPlacement > 0 {
-			if fullNode.dc != emptyNode.dc && fullNode.rack != emptyNode.rack {
-				// TODO this logic is too simple, but should work most of the time
-				// Need a correct algorithm to handle all different cases
+			replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(v.ReplicaPlacement))
+			if !isGoodMove(replicaPlacement, volumeReplicas[v.Id], fullNode, emptyNode) {
 				continue
 			}
 		}
@@ -273,3 +274,41 @@ func (node *Node) selectVolumes(fn func(v *master_pb.VolumeInformationMessage) b
 		}
 	}
 }
+
+func isGoodMove(placement *super_block.ReplicaPlacement, existingReplicas []*VolumeReplica, sourceNode, targetNode *Node) bool {
+	for _, replica := range existingReplicas {
+		if replica.location.dataNode.Id == targetNode.info.Id &&
+			replica.location.rack == targetNode.rack &&
+			replica.location.dc == targetNode.dc {
+			// never move to existing nodes
+			return false
+		}
+	}
+	dcs, racks := make(map[string]bool), make(map[string]int)
+	for _, replica := range existingReplicas {
+		if replica.location.dataNode.Id != sourceNode.info.Id {
+			dcs[replica.location.DataCenter()] = true
+			racks[fmt.Sprintf("%s %s", replica.location.DataCenter(), replica.location.Rack())]++
+		}
+	}
+
+	dcs[targetNode.dc] = true
+	racks[fmt.Sprintf("%s %s", targetNode.dc, targetNode.rack)]++
+
+	if len(dcs) > placement.DiffDataCenterCount+1 {
+		return false
+	}
+
+	if len(racks) > placement.DiffRackCount+1 {
+		return false
+	}
+
+	for _, sameRackCount := range racks {
+		if sameRackCount > placement.SameRackCount+1 {
+			return false
+		}
+	}
+
+	return true
+
+}
diff --git a/weed/shell/command_volume_balance_test.go b/weed/shell/command_volume_balance_test.go
new file mode 100644
index 000000000..7919d60c0
--- /dev/null
+++ b/weed/shell/command_volume_balance_test.go
@@ -0,0 +1,139 @@
+package shell
+
+import (
+	"testing"
+
+	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
+	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
+)
+
+type testMoveCase struct {
+	name           string
+	replication    string
+	replicas       []*VolumeReplica
+	sourceLocation location
+	targetLocation location
+	expected       bool
+}
+
+func TestIsGoodMove(t *testing.T) {
+
+	var tests = []testMoveCase{
+		{
+			name:        "test move to the same node",
+			replication: "001",
+			replicas: []*VolumeReplica{
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+				},
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+				},
+			},
+			sourceLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+			targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+			expected:       false,
+		},
+
+		{
+			name:        "test move to the same rack, but existing node",
+			replication: "001",
+			replicas: []*VolumeReplica{
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+				},
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+				},
+			},
+			sourceLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+			targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+			expected:       false,
+		},
+
+		{
+			name:        "test move to the same rack, a new node",
+			replication: "001",
+			replicas: []*VolumeReplica{
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+				},
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+				},
+			},
+			sourceLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+			targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn3"}},
+			expected:       true,
+		},
+
+		{
+			name:        "test 010 move all to the same rack",
+			replication: "010",
+			replicas: []*VolumeReplica{
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+				},
+				{
+					location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+				},
+			},
+			sourceLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+			targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn3"}},
+			expected:       false,
+		},
+
+		{
+			name:        "test 010 move to spread racks",
+			replication: "010",
+			replicas: []*VolumeReplica{
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+				},
+				{
+					location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+				},
+			},
+			sourceLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+			targetLocation: location{"dc1", "r3", &master_pb.DataNodeInfo{Id: "dn3"}},
+			expected:       true,
+		},
+
+		{
+			name:        "test 010 move to the same rack, a new node",
+			replication: "010",
+			replicas: []*VolumeReplica{
+				{
+					location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+				},
+				{
+					location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+				},
+			},
+			sourceLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+			targetLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn3"}},
+			expected:       true,
+		},
+
+	}
+
+	for _, tt := range tests {
+		replicaPlacement, _ := super_block.NewReplicaPlacementFromString(tt.replication)
+		println("replication:", tt.replication, "expected", tt.expected, "name:", tt.name)
+		sourceNode := &Node{
+			info: tt.sourceLocation.dataNode,
+			dc:   tt.sourceLocation.dc,
+			rack: tt.sourceLocation.rack,
+		}
+		targetNode := &Node{
+			info: tt.targetLocation.dataNode,
+			dc:   tt.targetLocation.dc,
+			rack: tt.targetLocation.rack,
+		}
+		if isGoodMove(replicaPlacement, tt.replicas, sourceNode, targetNode) != tt.expected {
+			t.Errorf("%s: expect %v move from %v to %v, replication:%v",
+				tt.name, tt.expected, tt.sourceLocation, tt.targetLocation, tt.replication)
+		}
+	}
+
+}
diff --git a/weed/shell/command_volume_fix_replication.go b/weed/shell/command_volume_fix_replication.go
index 061a58891..b32ccaaab 100644
--- a/weed/shell/command_volume_fix_replication.go
+++ b/weed/shell/command_volume_fix_replication.go
@@ -66,18 +66,7 @@ func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv,
 
 	// find all volumes that needs replication
 	// collect all data nodes
-	volumeReplicas := make(map[uint32][]*VolumeReplica)
-	var allLocations []location
-	eachDataNode(resp.TopologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
-		loc := newLocation(dc, string(rack), dn)
-		for _, v := range dn.VolumeInfos {
-			volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
-				location: &loc,
-				info:     v,
-			})
-		}
-		allLocations = append(allLocations, loc)
-	})
+	volumeReplicas, allLocations := collectVolumeReplicaLocations(resp)
 
 	if len(allLocations) == 0 {
 		return fmt.Errorf("no data nodes at all")
@@ -111,6 +100,22 @@ func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv,
 
 }
 
+func collectVolumeReplicaLocations(resp *master_pb.VolumeListResponse) (map[uint32][]*VolumeReplica, []location) {
+	volumeReplicas := make(map[uint32][]*VolumeReplica)
+	var allLocations []location
+	eachDataNode(resp.TopologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
+		loc := newLocation(dc, string(rack), dn)
+		for _, v := range dn.VolumeInfos {
+			volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
+				location: &loc,
+				info:     v,
+			})
+		}
+		allLocations = append(allLocations, loc)
+	})
+	return volumeReplicas, allLocations
+}
+
 func (c *commandVolumeFixReplication) fixOverReplicatedVolumes(commandEnv *CommandEnv, writer io.Writer, takeAction bool, overReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location) error {
 	for _, vid := range overReplicatedVolumeIds {
 		replicas := volumeReplicas[vid]
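
A note on reading the new check: a replication string "xyz" parses (via
super_block.NewReplicaPlacementFromString) into DiffDataCenterCount=x,
DiffRackCount=y, SameRackCount=z, so "010" means one extra copy on a
different rack within the same data center. isGoodMove simulates the move by
tallying the data centers and dc-qualified "dc rack" pairs the replicas would
occupy afterwards, then rejects the move if any tally exceeds what the
placement allows; the +1 in each comparison accounts for the primary copy,
since the placement counts only the extra replicas. The test below is a
minimal sketch of one "010" scenario, not part of the patch: it assumes it is
compiled inside the weed/shell package so it can reach the unexported
location, Node, and VolumeReplica types, and the TestIsGoodMoveSketch name is
hypothetical.

package shell

import (
	"testing"

	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
)

// TestIsGoodMoveSketch (hypothetical) walks one "010" scenario:
// two replicas that must sit on different racks of one data center.
func TestIsGoodMoveSketch(t *testing.T) {
	placement, _ := super_block.NewReplicaPlacementFromString("010")

	// Current state: one replica on rack r1, one on rack r2, both in dc1.
	replicas := []*VolumeReplica{
		{location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}}},
		{location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}}},
	}
	source := &Node{info: &master_pb.DataNodeInfo{Id: "dn2"}, dc: "dc1", rack: "r2"}

	// Moving dn2's copy onto rack r1 would leave two replicas on one rack;
	// SameRackCount is 0 for "010", so at most one replica per rack.
	collapse := &Node{info: &master_pb.DataNodeInfo{Id: "dn3"}, dc: "dc1", rack: "r1"}
	if isGoodMove(placement, replicas, source, collapse) {
		t.Error("010: collapsing both replicas onto rack r1 should be rejected")
	}

	// Moving it to a third rack keeps one replica per rack and still only
	// two racks in total (DiffRackCount=1 allows two), so the move is fine.
	spread := &Node{info: &master_pb.DataNodeInfo{Id: "dn3"}, dc: "dc1", rack: "r3"}
	if !isGoodMove(placement, replicas, source, spread) {
		t.Error("010: spreading onto a new rack r3 should be allowed")
	}
}

Because both rack tallies use the dc-qualified key, racks that happen to share
a name across data centers are counted separately rather than conflated.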