diff --git a/weed/shell/command_ec_balance.go b/weed/shell/command_ec_balance.go index 1ddb6a490..bb280b7d9 100644 --- a/weed/shell/command_ec_balance.go +++ b/weed/shell/command_ec_balance.go @@ -28,7 +28,7 @@ func (c *commandEcBalance) Help() string { Algorithm: - For each type of volume server (different max volume count limit){ + func EcBalance() { for each collection: balanceEcVolumes(collectionName) for each rack: diff --git a/weed/shell/command_ec_common.go b/weed/shell/command_ec_common.go index c6c7a1260..a808335eb 100644 --- a/weed/shell/command_ec_common.go +++ b/weed/shell/command_ec_common.go @@ -173,6 +173,16 @@ type EcNode struct { freeEcSlot int } +func (ecNode *EcNode) localShardIdCount(vid uint32) int { + for _, ecShardInfo := range ecNode.info.EcShardInfos { + if vid == ecShardInfo.Id { + shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits) + return shardBits.ShardIdCount() + } + } + return 0 +} + type EcRack struct { ecNodes map[EcNodeId]*EcNode freeEcSlot int @@ -191,7 +201,15 @@ func collectEcNodes(commandEnv *CommandEnv, selectedDataCenter string) (ecNodes } // find out all volume servers with one slot left. - eachDataNode(resp.TopologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) { + ecNodes, totalFreeEcSlots = collectEcVolumeServersByDc(resp.TopologyInfo, selectedDataCenter) + + sortEcNodesByFreeslotsDecending(ecNodes) + + return +} + +func collectEcVolumeServersByDc(topo *master_pb.TopologyInfo, selectedDataCenter string) (ecNodes []*EcNode, totalFreeEcSlots int) { + eachDataNode(topo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) { if selectedDataCenter != "" && selectedDataCenter != dc { return } @@ -205,9 +223,6 @@ func collectEcNodes(commandEnv *CommandEnv, selectedDataCenter string) (ecNodes }) totalFreeEcSlots += freeEcSlots }) - - sortEcNodesByFreeslotsDecending(ecNodes) - return } diff --git a/weed/shell/command_volume_balance.go b/weed/shell/command_volume_balance.go index 624821431..53222ca29 100644 --- a/weed/shell/command_volume_balance.go +++ b/weed/shell/command_volume_balance.go @@ -244,32 +244,43 @@ func balanceSelectedVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]* func attemptToMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolumes []*master_pb.VolumeInformationMessage, emptyNode *Node, applyBalancing bool) (hasMoved bool, err error) { for _, v := range candidateVolumes { - if v.ReplicaPlacement > 0 { - replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(v.ReplicaPlacement)) - if !isGoodMove(replicaPlacement, volumeReplicas[v.Id], fullNode, emptyNode) { - continue - } + hasMoved, err = maybeMoveOneVolume(commandEnv, volumeReplicas, fullNode, v, emptyNode, applyBalancing) + if err != nil { + return } - if _, found := emptyNode.selectedVolumes[v.Id]; !found { - if err = moveVolume(commandEnv, v, fullNode, emptyNode, applyBalancing); err == nil { - adjustAfterMove(v, volumeReplicas, fullNode, emptyNode) - hasMoved = true - break - } else { - return - } + if hasMoved { + break + } + } + return +} + +func maybeMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolume *master_pb.VolumeInformationMessage, emptyNode *Node, applyChange bool) (hasMoved bool, err error) { + + if candidateVolume.ReplicaPlacement > 0 { + replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(candidateVolume.ReplicaPlacement)) + if !isGoodMove(replicaPlacement, volumeReplicas[candidateVolume.Id], fullNode, emptyNode) { + return false, nil + } + } + if _, found := emptyNode.selectedVolumes[candidateVolume.Id]; !found { + if err = moveVolume(commandEnv, candidateVolume, fullNode, emptyNode, applyChange); err == nil { + adjustAfterMove(candidateVolume, volumeReplicas, fullNode, emptyNode) + return true, nil + } else { + return } } return } -func moveVolume(commandEnv *CommandEnv, v *master_pb.VolumeInformationMessage, fullNode *Node, emptyNode *Node, applyBalancing bool) error { +func moveVolume(commandEnv *CommandEnv, v *master_pb.VolumeInformationMessage, fullNode *Node, emptyNode *Node, applyChange bool) error { collectionPrefix := v.Collection + "_" if v.Collection == "" { collectionPrefix = "" } fmt.Fprintf(os.Stdout, "moving volume %s%d %s => %s\n", collectionPrefix, v.Id, fullNode.info.Id, emptyNode.info.Id) - if applyBalancing { + if applyChange { return LiveMoveVolume(commandEnv.option.GrpcDialOption, needle.VolumeId(v.Id), fullNode.info.Id, emptyNode.info.Id, 5*time.Second) } return nil @@ -315,7 +326,9 @@ func isGoodMove(placement *super_block.ReplicaPlacement, existingReplicas []*Vol func adjustAfterMove(v *master_pb.VolumeInformationMessage, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, emptyNode *Node) { delete(fullNode.selectedVolumes, v.Id) - emptyNode.selectedVolumes[v.Id] = v + if emptyNode.selectedVolumes != nil { + emptyNode.selectedVolumes[v.Id] = v + } existingReplicas := volumeReplicas[v.Id] for _, replica := range existingReplicas { if replica.location.dataNode.Id == fullNode.info.Id && diff --git a/weed/shell/command_volume_server_evacuate.go b/weed/shell/command_volume_server_evacuate.go new file mode 100644 index 000000000..d27c66fa2 --- /dev/null +++ b/weed/shell/command_volume_server_evacuate.go @@ -0,0 +1,192 @@ +package shell + +import ( + "context" + "flag" + "fmt" + "github.com/chrislusf/seaweedfs/weed/pb/master_pb" + "github.com/chrislusf/seaweedfs/weed/storage/erasure_coding" + "github.com/chrislusf/seaweedfs/weed/storage/needle" + "io" + "sort" +) + +func init() { + Commands = append(Commands, &commandVolumeServerEvacuate{}) +} + +type commandVolumeServerEvacuate struct { +} + +func (c *commandVolumeServerEvacuate) Name() string { + return "volumeServer.evacuate" +} + +func (c *commandVolumeServerEvacuate) Help() string { + return `move out all data on a volume server + + volumeServer.evacuate -node + + This command moves all data away from the volume server. + The volumes on the volume servers will be redistributed. + + Usually this is used to prepare to shutdown or upgrade the volume server. + +` +} + +func (c *commandVolumeServerEvacuate) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) { + + if err = commandEnv.confirmIsLocked(); err != nil { + return + } + + vsEvacuateCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError) + volumeServer := vsEvacuateCommand.String("node", "", ": of the volume server") + applyChange := vsEvacuateCommand.Bool("force", false, "actually apply the changes") + if err = vsEvacuateCommand.Parse(args); err != nil { + return nil + } + + if *volumeServer == "" { + return fmt.Errorf("need to specify volume server by -node=:") + } + + return volumeServerEvacuate(commandEnv, *volumeServer, *applyChange, writer) + +} + +func volumeServerEvacuate(commandEnv *CommandEnv, volumeServer string, applyChange bool, writer io.Writer) (err error) { + // 1. confirm the volume server is part of the cluster + // 2. collect all other volume servers, sort by empty slots + // 3. move to any other volume server as long as it satisfy the replication requirements + + // list all the volumes + var resp *master_pb.VolumeListResponse + err = commandEnv.MasterClient.WithClient(func(client master_pb.SeaweedClient) error { + resp, err = client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + return err + }) + if err != nil { + return err + } + + if err := evacuateNormalVolumes(commandEnv, resp, volumeServer, applyChange); err != nil { + return err + } + + if err := evacuateEcVolumes(commandEnv, resp, volumeServer, applyChange); err != nil { + return err + } + + return nil +} + +func evacuateNormalVolumes(commandEnv *CommandEnv, resp *master_pb.VolumeListResponse, volumeServer string, applyChange bool) error { + // find this volume server + volumeServers := collectVolumeServersByDc(resp.TopologyInfo, "") + thisNode, otherNodes := nodesOtherThan(volumeServers, volumeServer) + if thisNode == nil { + return fmt.Errorf("%s is not found in this cluster", volumeServer) + } + + // move away normal volumes + volumeReplicas, _ := collectVolumeReplicaLocations(resp) + for _, vol := range thisNode.info.VolumeInfos { + hasMoved, err := moveAwayOneNormalVolume(commandEnv, volumeReplicas, vol, thisNode, otherNodes, applyChange) + if err != nil { + return fmt.Errorf("move away volume %d from %s: %v", vol.Id, volumeServer, err) + } + if !hasMoved { + return fmt.Errorf("failed to move volume %d from %s", vol.Id, volumeServer) + } + } + return nil +} + +func evacuateEcVolumes(commandEnv *CommandEnv, resp *master_pb.VolumeListResponse, volumeServer string, applyChange bool) error { + // find this ec volume server + ecNodes, _ := collectEcVolumeServersByDc(resp.TopologyInfo, "") + thisNode, otherNodes := ecNodesOtherThan(ecNodes, volumeServer) + if thisNode == nil { + return fmt.Errorf("%s is not found in this cluster", volumeServer) + } + + // move away ec volumes + for _, ecShardInfo := range thisNode.info.EcShardInfos { + hasMoved, err := moveAwayOneEcVolume(commandEnv, ecShardInfo, thisNode, otherNodes, applyChange) + if err != nil { + return fmt.Errorf("move away volume %d from %s: %v", ecShardInfo.Id, volumeServer, err) + } + if !hasMoved { + return fmt.Errorf("failed to move ec volume %d from %s", ecShardInfo.Id, volumeServer) + } + } + return nil +} + +func moveAwayOneEcVolume(commandEnv *CommandEnv, ecShardInfo *master_pb.VolumeEcShardInformationMessage, thisNode *EcNode, otherNodes []*EcNode, applyChange bool) (hasMoved bool, err error) { + + for _, shardId := range erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIds() { + + sort.Slice(otherNodes, func(i, j int) bool { + return otherNodes[i].localShardIdCount(ecShardInfo.Id) < otherNodes[j].localShardIdCount(ecShardInfo.Id) + }) + + for i := 0; i < len(otherNodes); i++ { + emptyNode := otherNodes[i] + err = moveMountedShardToEcNode(commandEnv, thisNode, ecShardInfo.Collection, needle.VolumeId(ecShardInfo.Id), shardId, emptyNode, applyChange) + if err != nil { + return + } else { + hasMoved = true + break + } + } + if !hasMoved { + return + } + } + + return +} + +func moveAwayOneNormalVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, vol *master_pb.VolumeInformationMessage, thisNode *Node, otherNodes []*Node, applyChange bool) (hasMoved bool, err error) { + sort.Slice(otherNodes, func(i, j int) bool { + return otherNodes[i].localVolumeRatio() < otherNodes[j].localVolumeRatio() + }) + + for i := 0; i < len(otherNodes); i++ { + emptyNode := otherNodes[i] + hasMoved, err = maybeMoveOneVolume(commandEnv, volumeReplicas, thisNode, vol, emptyNode, applyChange) + if err != nil { + return + } + if hasMoved { + break + } + } + return +} + +func nodesOtherThan(volumeServers []*Node, thisServer string) (thisNode *Node, otherNodes []*Node) { + for _, node := range volumeServers { + if node.info.Id == thisServer { + thisNode = node + continue + } + otherNodes = append(otherNodes, node) + } + return +} + +func ecNodesOtherThan(volumeServers []*EcNode, thisServer string) (thisNode *EcNode, otherNodes []*EcNode) { + for _, node := range volumeServers { + if node.info.Id == thisServer { + thisNode = node + continue + } + otherNodes = append(otherNodes, node) + } + return +}