package shell

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"math"
	"sync"
	"time"

	"google.golang.org/grpc"

	"github.com/seaweedfs/seaweedfs/weed/operation"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
	"github.com/seaweedfs/seaweedfs/weed/wdclient"
)

// VolumeReplicaStatus represents the status of a volume replica.
type VolumeReplicaStatus struct {
	Location         wdclient.Location
	FileCount        uint64
	FileDeletedCount uint64
	VolumeSize       uint64
	IsReadOnly       bool
	Error            error
}

// getVolumeReplicaStatus retrieves the current status of a volume replica.
func getVolumeReplicaStatus(grpcDialOption grpc.DialOption, vid needle.VolumeId, location wdclient.Location) VolumeReplicaStatus {
	status := VolumeReplicaStatus{
		Location: location,
	}
	err := operation.WithVolumeServerClient(false, location.ServerAddress(), grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		resp, reqErr := volumeServerClient.VolumeStatus(context.Background(), &volume_server_pb.VolumeStatusRequest{
			VolumeId: uint32(vid),
		})
		if reqErr != nil {
			return reqErr
		}
		if resp != nil {
			status.FileCount = resp.FileCount
			status.FileDeletedCount = resp.FileDeletedCount
			status.VolumeSize = resp.VolumeSize
			status.IsReadOnly = resp.IsReadOnly
		}
		return nil
	})
	status.Error = err
	return status
}

// getVolumeReplicaStatuses retrieves the status of all replicas of a volume in parallel.
func getVolumeReplicaStatuses(grpcDialOption grpc.DialOption, vid needle.VolumeId, locations []wdclient.Location) []VolumeReplicaStatus {
	statuses := make([]VolumeReplicaStatus, len(locations))
	var wg sync.WaitGroup
	for i, location := range locations {
		wg.Add(1)
		go func(i int, location wdclient.Location) {
			defer wg.Done()
			statuses[i] = getVolumeReplicaStatus(grpcDialOption, vid, location)
		}(i, location)
	}
	wg.Wait()
	return statuses
}
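
// Usage sketch (illustrative only, not called from this file): fetching the
// statuses of a volume's replicas and reporting any that failed. The vid,
// locations, and writer values are assumed to come from the calling command.
//
//	statuses := getVolumeReplicaStatuses(grpcDialOption, vid, locations)
//	for _, s := range statuses {
//		if s.Error != nil {
//			fmt.Fprintf(writer, "replica %s: %v\n", s.Location.Url, s.Error)
//			continue
//		}
//		fmt.Fprintf(writer, "replica %s: %d files\n", s.Location.Url, s.FileCount)
//	}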

// replicaUnionBuilder builds a union replica by copying missing entries from other replicas.
type replicaUnionBuilder struct {
	grpcDialOption grpc.DialOption
	writer         io.Writer
	vid            needle.VolumeId
	collection     string
}

// buildUnionReplica finds the largest replica and copies missing entries from other replicas into it.
// If excludeFromSelection is non-empty, that server won't be selected as the target but will still
// be used as a source for missing entries.
// Returns the location of the union replica (the one that now has all entries).
func (rub *replicaUnionBuilder) buildUnionReplica(locations []wdclient.Location, excludeFromSelection string) (wdclient.Location, int, error) {
	if len(locations) == 0 {
		return wdclient.Location{}, 0, fmt.Errorf("no replicas available")
	}
	if len(locations) == 1 {
		if locations[0].Url == excludeFromSelection {
			return wdclient.Location{}, 0, fmt.Errorf("only replica is excluded")
		}
		return locations[0], 0, nil
	}

	// Step 1: Find the largest replica (highest file count) that's not excluded.
	statuses := getVolumeReplicaStatuses(rub.grpcDialOption, rub.vid, locations)
	bestIdx := -1
	var bestFileCount uint64
	for i, s := range statuses {
		if s.Error == nil && locations[i].Url != excludeFromSelection {
			if bestIdx == -1 || s.FileCount > bestFileCount {
				bestIdx = i
				bestFileCount = s.FileCount
			}
		}
	}
	if bestIdx == -1 {
		return wdclient.Location{}, 0, fmt.Errorf("could not find a valid replica (all excluded or errored)")
	}
	bestLocation := locations[bestIdx]
	fmt.Fprintf(rub.writer, "volume %d: selected %s as best replica (file count: %d)\n",
		rub.vid, bestLocation.Url, bestFileCount)

	// Step 2: Read the index database from the best replica.
	bestDB := needle_map.NewMemDb()
	if bestDB == nil {
		return wdclient.Location{}, 0, fmt.Errorf("failed to allocate in-memory needle DB")
	}
	defer bestDB.Close()
	if err := rub.readIndexDatabase(bestDB, bestLocation.ServerAddress()); err != nil {
		return wdclient.Location{}, 0, fmt.Errorf("read index from best replica %s: %w", bestLocation.Url, err)
	}

	// Step 3: For each other replica (including the excluded one), find entries missing from best and copy them.
	totalSynced := 0
	cutoffFromAtNs := uint64(time.Now().UnixNano())
	for i, loc := range locations {
		if i == bestIdx {
			continue
		}
		if statuses[i].Error != nil {
			fmt.Fprintf(rub.writer, " skipping %s: %v\n", loc.Url, statuses[i].Error)
			continue
		}
		// Read this replica's index.
		otherDB := needle_map.NewMemDb()
		if otherDB == nil {
			fmt.Fprintf(rub.writer, " skipping %s: failed to allocate DB\n", loc.Url)
			continue
		}
		if err := rub.readIndexDatabase(otherDB, loc.ServerAddress()); err != nil {
			otherDB.Close()
			fmt.Fprintf(rub.writer, " skipping %s: %v\n", loc.Url, err)
			continue
		}
		// Find entries present in this replica but missing from best.
		var missingNeedles []needle_map.NeedleValue
		visitErr := otherDB.AscendingVisit(func(nv needle_map.NeedleValue) error {
			if nv.Size.IsDeleted() {
				return nil
			}
			if _, found := bestDB.Get(nv.Key); !found {
				// Skip entries written after the sync started, to avoid copying in-flight writes.
				if needleMeta, err := readNeedleMeta(rub.grpcDialOption, loc.ServerAddress(), uint32(rub.vid), nv); err == nil {
					if needleMeta.AppendAtNs > cutoffFromAtNs {
						return nil
					}
				}
				missingNeedles = append(missingNeedles, nv)
			}
			return nil
		})
		otherDB.Close()
		if visitErr != nil {
			fmt.Fprintf(rub.writer, " skipping %s: visit index: %v\n", loc.Url, visitErr)
			continue
		}
		if len(missingNeedles) == 0 {
			continue
		}
		// Copy missing entries from this replica to the best replica.
		syncedFromThis := 0
		for _, nv := range missingNeedles {
			needleBlob, err := rub.readNeedleBlob(loc.ServerAddress(), nv)
			if err != nil {
				fmt.Fprintf(rub.writer, " warning: read needle %d from %s: %v\n", nv.Key, loc.Url, err)
				continue
			}
			if err := rub.writeNeedleBlob(bestLocation.ServerAddress(), nv, needleBlob); err != nil {
				fmt.Fprintf(rub.writer, " warning: write needle %d to %s: %v\n", nv.Key, bestLocation.Url, err)
				continue
			}
			// Also add to bestDB so we don't copy duplicates from other replicas.
			bestDB.Set(nv.Key, nv.Offset, nv.Size)
			syncedFromThis++
		}
		if syncedFromThis > 0 {
			fmt.Fprintf(rub.writer, " copied %d entries from %s to %s\n",
				syncedFromThis, loc.Url, bestLocation.Url)
			totalSynced += syncedFromThis
		}
	}
	return bestLocation, totalSynced, nil
}
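
// Usage sketch (illustrative only): building a union replica directly. The
// grpcDialOption, writer, vid, collection, and locations values are assumed to
// be supplied by the calling shell command; the empty string means no server
// is excluded from selection.
//
//	builder := &replicaUnionBuilder{
//		grpcDialOption: grpcDialOption,
//		writer:         writer,
//		vid:            vid,
//		collection:     collection,
//	}
//	unionLoc, synced, err := builder.buildUnionReplica(locations, "")
//	if err != nil {
//		return err
//	}
//	fmt.Fprintf(writer, "union replica at %s (+%d entries)\n", unionLoc.Url, synced)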

// readIndexDatabase streams a replica's .idx file from the volume server and loads it into db.
func (rub *replicaUnionBuilder) readIndexDatabase(db *needle_map.MemDb, server pb.ServerAddress) error {
	var buf bytes.Buffer
	err := operation.WithVolumeServerClient(true, server, rub.grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		copyFileClient, err := volumeServerClient.CopyFile(context.Background(), &volume_server_pb.CopyFileRequest{
			VolumeId:                 uint32(rub.vid),
			Ext:                      ".idx",
			CompactionRevision:       math.MaxUint32,
			StopOffset:               math.MaxInt64,
			Collection:               rub.collection,
			IsEcVolume:               false,
			IgnoreSourceFileNotFound: false,
		})
		if err != nil {
			return fmt.Errorf("start copy: %w", err)
		}
		// Stream the index file contents into the buffer.
		for {
			resp, recvErr := copyFileClient.Recv()
			if recvErr == io.EOF {
				break
			}
			if recvErr != nil {
				return fmt.Errorf("receive: %w", recvErr)
			}
			buf.Write(resp.FileContent)
		}
		return nil
	})
	if err != nil {
		return err
	}
	return db.LoadFilterFromReaderAt(bytes.NewReader(buf.Bytes()), true, false)
}

// readNeedleBlob reads one needle's raw blob from the given volume server.
func (rub *replicaUnionBuilder) readNeedleBlob(server pb.ServerAddress, nv needle_map.NeedleValue) ([]byte, error) {
	var needleBlob []byte
	err := operation.WithVolumeServerClient(false, server, rub.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
		resp, err := client.ReadNeedleBlob(context.Background(), &volume_server_pb.ReadNeedleBlobRequest{
			VolumeId: uint32(rub.vid),
			Offset:   nv.Offset.ToActualOffset(),
			Size:     int32(nv.Size),
		})
		if err != nil {
			return err
		}
		needleBlob = resp.NeedleBlob
		return nil
	})
	return needleBlob, err
}

// writeNeedleBlob writes one needle's raw blob to the volume on the given server.
func (rub *replicaUnionBuilder) writeNeedleBlob(server pb.ServerAddress, nv needle_map.NeedleValue, needleBlob []byte) error {
	return operation.WithVolumeServerClient(false, server, rub.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
		_, err := client.WriteNeedleBlob(context.Background(), &volume_server_pb.WriteNeedleBlobRequest{
			VolumeId:   uint32(rub.vid),
			NeedleId:   uint64(nv.Key),
			Size:       int32(nv.Size),
			NeedleBlob: needleBlob,
		})
		return err
	})
}

// syncAndSelectBestReplica finds the largest replica, copies missing entries from other replicas
// into it to create a union, then returns this union replica for the operation.
// If excludeFromSelection is non-empty, that server won't be selected but will still contribute entries.
//
// The process:
//  1. Find the replica with the highest file count (the "best" one), excluding excludeFromSelection
//  2. For each other replica, find entries missing from best and copy them to best
//  3. Return the best replica, which now contains the union of all entries
func syncAndSelectBestReplica(grpcDialOption grpc.DialOption, vid needle.VolumeId, collection string, locations []wdclient.Location, excludeFromSelection string, writer io.Writer) (wdclient.Location, error) {
	if len(locations) == 0 {
		return wdclient.Location{}, fmt.Errorf("no replicas available for volume %d", vid)
	}

	// Filter out the excluded server when deciding which replica may be selected.
	var checkLocations []wdclient.Location
	for _, loc := range locations {
		if loc.Url != excludeFromSelection {
			checkLocations = append(checkLocations, loc)
		}
	}
	if len(checkLocations) == 0 {
		return wdclient.Location{}, fmt.Errorf("no replicas available for volume %d after exclusion", vid)
	}
	if len(checkLocations) == 1 && len(locations) == 1 {
		return checkLocations[0], nil
	}

	// Check whether the replicas are already consistent; if so, the sync can be skipped.
	// All reachable replicas, including the excluded one, participate in this check.
	statuses := getVolumeReplicaStatuses(grpcDialOption, vid, locations)
	var validStatuses []VolumeReplicaStatus
	for _, s := range statuses {
		if s.Error == nil {
			validStatuses = append(validStatuses, s)
		}
	}
	if len(validStatuses) > 1 {
		allSame := true
		for _, s := range validStatuses[1:] {
			if s.FileCount != validStatuses[0].FileCount {
				allSame = false
				break
			}
		}
		if allSame {
			// All replicas are consistent; return the first non-excluded one.
			for _, s := range validStatuses {
				if s.Location.Url != excludeFromSelection {
					fmt.Fprintf(writer, "volume %d: all %d replicas are consistent (file count: %d)\n",
						vid, len(validStatuses), s.FileCount)
					return s.Location, nil
				}
			}
		}
	}

	// Replicas are inconsistent; build the union on the best replica.
	fmt.Fprintf(writer, "volume %d: replicas are inconsistent, building union...\n", vid)
	builder := &replicaUnionBuilder{
		grpcDialOption: grpcDialOption,
		writer:         writer,
		vid:            vid,
		collection:     collection,
	}
	unionLocation, totalSynced, err := builder.buildUnionReplica(locations, excludeFromSelection)
	if err != nil {
		return wdclient.Location{}, fmt.Errorf("failed to build union replica: %w", err)
	}
	if totalSynced > 0 {
		fmt.Fprintf(writer, "volume %d: added %d entries to union replica %s\n", vid, totalSynced, unionLocation.Url)
	}
	return unionLocation, nil
}
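
// Usage sketch (illustrative only): how a destructive maintenance command might
// obtain a safe source replica before deleting or moving the others. The vid,
// collection, locations, and writer values are assumed to come from the calling
// command's context; the empty string means no server is excluded from selection.
//
//	unionLoc, err := syncAndSelectBestReplica(grpcDialOption, vid, collection, locations, "", writer)
//	if err != nil {
//		return fmt.Errorf("sync volume %d: %w", vid, err)
//	}
//	// Operate on unionLoc.ServerAddress(), which now holds the union of all entries.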