2 changed files with 499 additions and 0 deletions
@ -0,0 +1,412 @@ |
|||
package shell |
|||
|
|||
import ( |
|||
"container/heap" |
|||
"context" |
|||
"flag" |
|||
"fmt" |
|||
"io" |
|||
"os" |
|||
"sync" |
|||
"time" |
|||
|
|||
"github.com/seaweedfs/seaweedfs/weed/operation" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/backend" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/needle" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/super_block" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/types" |
|||
"google.golang.org/grpc" |
|||
) |
|||
|
|||
const mergeIdleTimeoutSeconds = 1 |
|||
|
|||
func init() { |
|||
Commands = append(Commands, &commandVolumeMerge{}) |
|||
} |
|||
|
|||
type commandVolumeMerge struct{} |
|||
|
|||
func (c *commandVolumeMerge) Name() string { |
|||
return "volume.merge" |
|||
} |
|||
|
|||
func (c *commandVolumeMerge) Help() string { |
|||
return `merge replicas for a volume id in timestamp order into a fresh copy |
|||
|
|||
volume.merge -volumeId <volume id> |
|||
|
|||
This command: |
|||
1) marks the volume readonly on replicas (if not already) |
|||
2) allocates a temporary copy on a third location |
|||
3) merges replicas in append timestamp order, skipping duplicates |
|||
4) replaces the original replicas with the merged volume |
|||
5) restores writable state if it was writable before |
|||
` |
|||
} |
|||
|
|||
func (c *commandVolumeMerge) HasTag(CommandTag) bool { |
|||
return false |
|||
} |
|||
|
|||
func (c *commandVolumeMerge) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) { |
|||
mergeCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError) |
|||
volumeIdInt := mergeCommand.Int("volumeId", 0, "the volume id") |
|||
targetNodeStr := mergeCommand.String("target", "", "optional target volume server <host>:<port> for temporary merge output") |
|||
noLock := mergeCommand.Bool("noLock", false, "do not lock the admin shell at one's own risk") |
|||
if err = mergeCommand.Parse(args); err != nil { |
|||
return nil |
|||
} |
|||
|
|||
if *volumeIdInt == 0 { |
|||
return fmt.Errorf("volumeId is required") |
|||
} |
|||
|
|||
if *noLock { |
|||
commandEnv.noLock = true |
|||
} else if err = commandEnv.confirmIsLocked(args); err != nil { |
|||
return err |
|||
} |
|||
|
|||
volumeId := needle.VolumeId(*volumeIdInt) |
|||
|
|||
topologyInfo, _, err := collectTopologyInfo(commandEnv, 0) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
volumeReplicas, allLocations := collectVolumeReplicaLocations(topologyInfo) |
|||
replicas := volumeReplicas[uint32(volumeId)] |
|||
if len(replicas) < 2 { |
|||
return fmt.Errorf("volume %d has %d replica(s); merge requires at least two", volumeId, len(replicas)) |
|||
} |
|||
|
|||
volumeInfo := replicas[0].info |
|||
replicaPlacement, err := super_block.NewReplicaPlacementFromByte(byte(volumeInfo.ReplicaPlacement)) |
|||
if err != nil { |
|||
return fmt.Errorf("parse replica placement for volume %d: %w", volumeId, err) |
|||
} |
|||
|
|||
var targetServer pb.ServerAddress |
|||
if *targetNodeStr != "" { |
|||
targetServer = pb.ServerAddress(*targetNodeStr) |
|||
if isReplicaServer(targetServer, replicas) { |
|||
return fmt.Errorf("target %s already hosts volume %d", *targetNodeStr, volumeId) |
|||
} |
|||
if err = allocateMergeVolume(commandEnv.option.GrpcDialOption, targetServer, volumeInfo, replicaPlacement); err != nil { |
|||
return err |
|||
} |
|||
} else { |
|||
targetServer, err = allocateMergeVolumeOnThirdLocation(commandEnv.option.GrpcDialOption, allLocations, replicas, volumeInfo, replicaPlacement) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
} |
|||
|
|||
cleanupTarget := true |
|||
defer func() { |
|||
if !cleanupTarget { |
|||
return |
|||
} |
|||
_ = deleteVolume(commandEnv.option.GrpcDialOption, volumeId, targetServer, false) |
|||
}() |
|||
|
|||
shouldRestoreWritable, err := ensureVolumeReadonly(commandEnv, replicas) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
if shouldRestoreWritable { |
|||
defer func() { |
|||
_ = markReplicasWritable(commandEnv.option.GrpcDialOption, replicas, true, false) |
|||
}() |
|||
} |
|||
|
|||
sources := make([]needleStream, 0, len(replicas)) |
|||
for _, replica := range replicas { |
|||
server := pb.NewServerAddressFromDataNode(replica.location.dataNode) |
|||
sources = append(sources, startTailNeedleStream(commandEnv.option.GrpcDialOption, volumeId, server)) |
|||
} |
|||
|
|||
mergeErr := operation.WithVolumeServerClient(false, targetServer, commandEnv.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
version := needle.Version(volumeInfo.Version) |
|||
if version == 0 { |
|||
version = needle.GetCurrentVersion() |
|||
} |
|||
seen := make(map[types.NeedleId]needleSeen) |
|||
return mergeNeedleStreams(sources, func(streamIndex int, n *needle.Needle) error { |
|||
ts := needleTimestamp(n) |
|||
if prev, ok := seen[n.Id]; ok && prev.timestamp == ts && prev.streamIndex != streamIndex { |
|||
return nil |
|||
} |
|||
seen[n.Id] = needleSeen{timestamp: ts, streamIndex: streamIndex} |
|||
blob, size, err := needleBlobFromNeedle(n, version) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
_, err = client.WriteNeedleBlob(context.Background(), &volume_server_pb.WriteNeedleBlobRequest{ |
|||
VolumeId: uint32(volumeId), |
|||
NeedleId: uint64(n.Id), |
|||
Size: int32(size), |
|||
NeedleBlob: blob, |
|||
}) |
|||
return err |
|||
}) |
|||
}) |
|||
if mergeErr != nil { |
|||
return mergeErr |
|||
} |
|||
|
|||
for _, replica := range replicas { |
|||
sourceServer := pb.NewServerAddressFromDataNode(replica.location.dataNode) |
|||
if _, err = copyVolume(commandEnv.option.GrpcDialOption, writer, volumeId, targetServer, sourceServer, "", 0); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
|
|||
if err = deleteVolume(commandEnv.option.GrpcDialOption, volumeId, targetServer, false); err != nil { |
|||
return err |
|||
} |
|||
cleanupTarget = false |
|||
|
|||
fmt.Fprintf(writer, "merged volume %d from %d replicas via %s\n", volumeId, len(replicas), targetServer) |
|||
return nil |
|||
} |
|||
|
|||
type needleStream interface { |
|||
Next() (*needle.Needle, bool) |
|||
Err() error |
|||
} |
|||
|
|||
type tailNeedleStream struct { |
|||
ch <-chan *needle.Needle |
|||
errMu sync.Mutex |
|||
err error |
|||
} |
|||
|
|||
func (s *tailNeedleStream) Next() (*needle.Needle, bool) { |
|||
n, ok := <-s.ch |
|||
return n, ok |
|||
} |
|||
|
|||
func (s *tailNeedleStream) Err() error { |
|||
s.errMu.Lock() |
|||
defer s.errMu.Unlock() |
|||
return s.err |
|||
} |
|||
|
|||
func (s *tailNeedleStream) setErr(err error) { |
|||
s.errMu.Lock() |
|||
s.err = err |
|||
s.errMu.Unlock() |
|||
} |
|||
|
|||
func startTailNeedleStream(grpcDialOption grpc.DialOption, volumeId needle.VolumeId, server pb.ServerAddress) *tailNeedleStream { |
|||
ch := make(chan *needle.Needle, 32) |
|||
stream := &tailNeedleStream{ch: ch} |
|||
go func() { |
|||
err := operation.TailVolumeFromSource(server, grpcDialOption, volumeId, 0, mergeIdleTimeoutSeconds, func(n *needle.Needle) error { |
|||
ch <- n |
|||
return nil |
|||
}) |
|||
close(ch) |
|||
stream.setErr(err) |
|||
}() |
|||
return stream |
|||
} |
|||
|
|||
type needleMergeItem struct { |
|||
streamIndex int |
|||
needle *needle.Needle |
|||
timestamp uint64 |
|||
} |
|||
|
|||
type needleMergeHeap []needleMergeItem |
|||
|
|||
func (h needleMergeHeap) Len() int { return len(h) } |
|||
func (h needleMergeHeap) Less(i, j int) bool { |
|||
if h[i].timestamp == h[j].timestamp { |
|||
return h[i].needle.Id < h[j].needle.Id |
|||
} |
|||
return h[i].timestamp < h[j].timestamp |
|||
} |
|||
func (h needleMergeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } |
|||
func (h *needleMergeHeap) Push(x any) { |
|||
*h = append(*h, x.(needleMergeItem)) |
|||
} |
|||
func (h *needleMergeHeap) Pop() any { |
|||
old := *h |
|||
n := len(old) |
|||
item := old[n-1] |
|||
*h = old[:n-1] |
|||
return item |
|||
} |
|||
|
|||
func mergeNeedleStreams(streams []needleStream, consume func(int, *needle.Needle) error) error { |
|||
h := &needleMergeHeap{} |
|||
heap.Init(h) |
|||
|
|||
for i, stream := range streams { |
|||
if n, ok := stream.Next(); ok { |
|||
heap.Push(h, needleMergeItem{streamIndex: i, needle: n, timestamp: needleTimestamp(n)}) |
|||
} |
|||
} |
|||
|
|||
for h.Len() > 0 { |
|||
item := heap.Pop(h).(needleMergeItem) |
|||
if err := consume(item.streamIndex, item.needle); err != nil { |
|||
return err |
|||
} |
|||
if n, ok := streams[item.streamIndex].Next(); ok { |
|||
heap.Push(h, needleMergeItem{streamIndex: item.streamIndex, needle: n, timestamp: needleTimestamp(n)}) |
|||
} |
|||
} |
|||
|
|||
for _, stream := range streams { |
|||
if err := stream.Err(); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
func needleTimestamp(n *needle.Needle) uint64 { |
|||
if n.AppendAtNs != 0 { |
|||
return n.AppendAtNs |
|||
} |
|||
if n.LastModified != 0 { |
|||
return uint64(time.Unix(int64(n.LastModified), 0).UnixNano()) |
|||
} |
|||
return 0 |
|||
} |
|||
|
|||
type needleSeen struct { |
|||
timestamp uint64 |
|||
streamIndex int |
|||
} |
|||
|
|||
func needleBlobFromNeedle(n *needle.Needle, version needle.Version) ([]byte, types.Size, error) { |
|||
file, err := os.CreateTemp("", "weed-needle-*.dat") |
|||
if err != nil { |
|||
return nil, 0, err |
|||
} |
|||
defer func() { |
|||
_ = file.Close() |
|||
_ = os.Remove(file.Name()) |
|||
}() |
|||
|
|||
diskFile := backend.NewDiskFile(file) |
|||
defer diskFile.Close() |
|||
|
|||
_, size, actualSize, err := n.Append(diskFile, version) |
|||
if err != nil { |
|||
return nil, 0, err |
|||
} |
|||
|
|||
buf := make([]byte, actualSize) |
|||
read, err := diskFile.ReadAt(buf, 0) |
|||
if err != nil && err != io.EOF { |
|||
return nil, 0, err |
|||
} |
|||
return buf[:read], size, nil |
|||
} |
|||
|
|||
func allocateMergeVolumeOnThirdLocation(grpcDialOption grpc.DialOption, allLocations []location, replicas []*VolumeReplica, info *master_pb.VolumeInformationMessage, replicaPlacement *super_block.ReplicaPlacement) (pb.ServerAddress, error) { |
|||
replicaNodes := map[string]struct{}{} |
|||
for _, replica := range replicas { |
|||
replicaNodes[replica.location.dataNode.Id] = struct{}{} |
|||
} |
|||
|
|||
for _, loc := range allLocations { |
|||
if _, exists := replicaNodes[loc.dataNode.Id]; exists { |
|||
continue |
|||
} |
|||
if !locationHasDiskType(loc, info.DiskType) { |
|||
continue |
|||
} |
|||
server := pb.NewServerAddressFromDataNode(loc.dataNode) |
|||
if err := allocateMergeVolume(grpcDialOption, server, info, replicaPlacement); err != nil { |
|||
continue |
|||
} |
|||
return server, nil |
|||
} |
|||
|
|||
return "", fmt.Errorf("no third location available to merge volume %d", info.Id) |
|||
} |
|||
|
|||
func allocateMergeVolume(grpcDialOption grpc.DialOption, server pb.ServerAddress, info *master_pb.VolumeInformationMessage, replicaPlacement *super_block.ReplicaPlacement) error { |
|||
return operation.WithVolumeServerClient(false, server, grpcDialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
_, err := client.AllocateVolume(context.Background(), &volume_server_pb.AllocateVolumeRequest{ |
|||
VolumeId: info.Id, |
|||
Collection: info.Collection, |
|||
Preallocate: 0, |
|||
Replication: replicaPlacement.String(), |
|||
Ttl: needle.LoadTTLFromUint32(info.Ttl).String(), |
|||
DiskType: info.DiskType, |
|||
Version: info.Version, |
|||
}) |
|||
return err |
|||
}) |
|||
} |
|||
|
|||
func ensureVolumeReadonly(commandEnv *CommandEnv, replicas []*VolumeReplica) (bool, error) { |
|||
shouldRestoreWritable := false |
|||
for _, replica := range replicas { |
|||
server := pb.NewServerAddressFromDataNode(replica.location.dataNode) |
|||
err := operation.WithVolumeServerClient(false, server, commandEnv.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
resp, err := client.VolumeStatus(context.Background(), &volume_server_pb.VolumeStatusRequest{VolumeId: replica.info.Id}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
if !resp.IsReadOnly { |
|||
shouldRestoreWritable = true |
|||
} |
|||
return nil |
|||
}) |
|||
if err != nil { |
|||
return false, err |
|||
} |
|||
} |
|||
if shouldRestoreWritable { |
|||
if err := markReplicasWritable(commandEnv.option.GrpcDialOption, replicas, false, false); err != nil { |
|||
return false, err |
|||
} |
|||
} |
|||
return shouldRestoreWritable, nil |
|||
} |
|||
|
|||
func isReplicaServer(target pb.ServerAddress, replicas []*VolumeReplica) bool { |
|||
for _, replica := range replicas { |
|||
if pb.NewServerAddressFromDataNode(replica.location.dataNode) == target { |
|||
return true |
|||
} |
|||
} |
|||
return false |
|||
} |
|||
|
|||
func locationHasDiskType(loc location, diskType string) bool { |
|||
for _, diskInfo := range loc.dataNode.DiskInfos { |
|||
if diskInfo.Type == diskType { |
|||
return true |
|||
} |
|||
} |
|||
return false |
|||
} |
|||
|
|||
func markReplicasWritable(grpcDialOption grpc.DialOption, replicas []*VolumeReplica, writable bool, persist bool) error { |
|||
for _, replica := range replicas { |
|||
server := pb.NewServerAddressFromDataNode(replica.location.dataNode) |
|||
err := operation.WithVolumeServerClient(false, server, grpcDialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
if writable { |
|||
_, err := client.VolumeMarkWritable(context.Background(), &volume_server_pb.VolumeMarkWritableRequest{VolumeId: replica.info.Id}) |
|||
return err |
|||
} |
|||
_, err := client.VolumeMarkReadonly(context.Background(), &volume_server_pb.VolumeMarkReadonlyRequest{VolumeId: replica.info.Id, Persist: persist}) |
|||
return err |
|||
}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
} |
|||
return nil |
|||
} |
|||
@ -0,0 +1,87 @@ |
|||
package shell |
|||
|
|||
import ( |
|||
"reflect" |
|||
"testing" |
|||
|
|||
"github.com/seaweedfs/seaweedfs/weed/storage/needle" |
|||
) |
|||
|
|||
type sliceNeedleStream struct { |
|||
needles []*needle.Needle |
|||
index int |
|||
} |
|||
|
|||
func (s *sliceNeedleStream) Next() (*needle.Needle, bool) { |
|||
if s.index >= len(s.needles) { |
|||
return nil, false |
|||
} |
|||
n := s.needles[s.index] |
|||
s.index++ |
|||
return n, true |
|||
} |
|||
|
|||
func (s *sliceNeedleStream) Err() error { |
|||
return nil |
|||
} |
|||
|
|||
func TestMergeNeedleStreamsOrdersByTimestamp(t *testing.T) { |
|||
streamA := &sliceNeedleStream{needles: []*needle.Needle{ |
|||
{Id: 1, AppendAtNs: 10_000_000_100}, |
|||
{Id: 2, AppendAtNs: 10_000_000_400}, |
|||
}} |
|||
streamB := &sliceNeedleStream{needles: []*needle.Needle{ |
|||
{Id: 3, AppendAtNs: 10_000_000_200}, |
|||
{Id: 4, AppendAtNs: 10_000_000_300}, |
|||
}} |
|||
streamC := &sliceNeedleStream{needles: []*needle.Needle{ |
|||
{Id: 5, LastModified: 1}, |
|||
}} |
|||
|
|||
var got []uint64 |
|||
err := mergeNeedleStreams([]needleStream{streamA, streamB, streamC}, func(_ int, n *needle.Needle) error { |
|||
got = append(got, uint64(n.Id)) |
|||
return nil |
|||
}) |
|||
if err != nil { |
|||
t.Fatalf("mergeNeedleStreams error: %v", err) |
|||
} |
|||
|
|||
want := []uint64{5, 1, 3, 4, 2} |
|||
if !reflect.DeepEqual(got, want) { |
|||
t.Fatalf("unexpected merge order: got %v want %v", got, want) |
|||
} |
|||
} |
|||
|
|||
func TestMergeNeedleStreamsSkipsCrossStreamDuplicates(t *testing.T) { |
|||
streamA := &sliceNeedleStream{needles: []*needle.Needle{ |
|||
{Id: 10, AppendAtNs: 10_000_000_100}, |
|||
{Id: 10, AppendAtNs: 10_000_000_300}, |
|||
}} |
|||
streamB := &sliceNeedleStream{needles: []*needle.Needle{ |
|||
{Id: 10, AppendAtNs: 10_000_000_100}, |
|||
{Id: 11, AppendAtNs: 10_000_000_200}, |
|||
}} |
|||
|
|||
type seenNeedle struct { |
|||
id uint64 |
|||
ts uint64 |
|||
} |
|||
var got []seenNeedle |
|||
err := mergeNeedleStreams([]needleStream{streamA, streamB}, func(_ int, n *needle.Needle) error { |
|||
got = append(got, seenNeedle{id: uint64(n.Id), ts: needleTimestamp(n)}) |
|||
return nil |
|||
}) |
|||
if err != nil { |
|||
t.Fatalf("mergeNeedleStreams error: %v", err) |
|||
} |
|||
|
|||
want := []seenNeedle{ |
|||
{id: 10, ts: 10_000_000_100}, |
|||
{id: 11, ts: 10_000_000_200}, |
|||
{id: 10, ts: 10_000_000_300}, |
|||
} |
|||
if !reflect.DeepEqual(got, want) { |
|||
t.Fatalf("unexpected merge output: got %v want %v", got, want) |
|||
} |
|||
} |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue