fix: enable RetryForever for active-active cluster sync to prevent out-of-sync (#7840)

Fixes #7230

When a cluster goes down during file replication, the chunk upload process
would fail after a limited number of retries. Once the remote cluster came
back online, those failed uploads were never retried, leaving the clusters
out-of-sync.

This change enables the RetryForever flag in the UploadOption when
replicating chunks between filers. This ensures that upload operations
will keep retrying indefinitely, and once the remote cluster comes back
online, the pending uploads will automatically succeed.

Users no longer need to manually run fs.meta.save and fs.meta.load as
a workaround for out-of-sync clusters.
Chris Lu, committed via GitHub
commit cc2edfaf68
1 changed file, 6 changed lines:
weed/replication/sink/filersink/fetch_write.go
@@ -2,12 +2,13 @@ package filersink
 import (
 	"fmt"
-	"github.com/schollz/progressbar/v3"
-	"github.com/seaweedfs/seaweedfs/weed/util"
 	"os"
 	"path/filepath"
 	"sync"
+	"github.com/schollz/progressbar/v3"
+	"github.com/seaweedfs/seaweedfs/weed/util"
+
 	"google.golang.org/grpc"
 	"github.com/seaweedfs/seaweedfs/weed/glog"
@@ -114,6 +115,7 @@ func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string)
 		IsInputCompressed: "gzip" == header.Get("Content-Encoding"),
 		MimeType:          header.Get("Content-Type"),
 		PairMap:           nil,
+		RetryForever:      true,
 	},
 	func(host, fileId string) string {
 		fileUrl := fmt.Sprintf("http://%s/%s", host, fileId)
