Browse Source

s3: further improve filer consistency retry logic for CI environment

- Increase retry attempts from 5 to 8 for both updateLatestVersionInDirectory
  and getLatestObjectVersion functions
- Increase base delay from 50ms to 100ms with exponential backoff up to 6.4s
  (seven sleeps between the eight attempts, roughly 12.7s in total)
- Add specific retry logic for 'no Extended metadata' race condition where
  .versions directory exists but metadata is not yet written
- Add detailed timing logs to track retry delays and total wait times
- Addresses persistent CI failures where even 5 attempts with a 400ms max delay
  were insufficient for filer store consistency in the GitHub Actions environment
pull/7231/head
chrislu 2 months ago
parent
commit
70d53ce91a
  1. 9
      weed/s3api/s3api_object_handlers_put.go
  2. 51
      weed/s3api/s3api_object_versioning.go

9
weed/s3api/s3api_object_handlers_put.go

@ -643,7 +643,7 @@ func (s3a *S3ApiServer) updateLatestVersionInDirectory(bucket, object, versionId
// Get the current .versions directory entry with retry logic for filer consistency
var versionsEntry *filer_pb.Entry
var err error
maxRetries := 5
maxRetries := 8
for attempt := 1; attempt <= maxRetries; attempt++ {
versionsEntry, err = s3a.getEntry(bucketDir, versionsObjectPath)
if err == nil {
@ -653,15 +653,16 @@ func (s3a *S3ApiServer) updateLatestVersionInDirectory(bucket, object, versionId
glog.V(0).Infof("CI-DEBUG: updateLatestVersionInDirectory: attempt %d/%d failed to get .versions entry for %s/%s: %v", attempt, maxRetries, bucket, object, err)
if attempt < maxRetries {
// Exponential backoff: 50ms, 100ms, 200ms, 400ms
delay := time.Millisecond * time.Duration(50 * (1 << (attempt - 1)))
// Exponential backoff with higher base: 100ms, 200ms, 400ms, 800ms, 1600ms, 3200ms, 6400ms
delay := time.Millisecond * time.Duration(100 * (1 << (attempt - 1)))
glog.V(0).Infof("CI-DEBUG: updateLatestVersionInDirectory: sleeping %v before retry %d", delay, attempt+1)
time.Sleep(delay)
}
}
if err != nil {
glog.Errorf("updateLatestVersionInDirectory: failed to get .versions directory for %s/%s after %d attempts: %v", bucket, object, maxRetries, err)
glog.V(0).Infof("CI-DEBUG: updateLatestVersionInDirectory: FAILED to get .versions entry for %s/%s after %d attempts: %v", bucket, object, maxRetries, err)
glog.V(0).Infof("CI-DEBUG: updateLatestVersionInDirectory: FAILED to get .versions entry for %s/%s after %d attempts (total delay ~%dms): %v", bucket, object, maxRetries, (100*(1<<maxRetries-1)-100), err)
return fmt.Errorf("failed to get .versions directory after %d attempts: %w", maxRetries, err)
}

51
weed/s3api/s3api_object_versioning.go

@ -785,7 +785,7 @@ func (s3a *S3ApiServer) getLatestObjectVersion(bucket, object string) (*filer_pb
// Get the .versions directory entry to read latest version metadata with retry logic for filer consistency
var versionsEntry *filer_pb.Entry
var err error
maxRetries := 5
maxRetries := 8
for attempt := 1; attempt <= maxRetries; attempt++ {
versionsEntry, err = s3a.getEntry(bucketDir, versionsObjectPath)
if err == nil {
@ -795,8 +795,9 @@ func (s3a *S3ApiServer) getLatestObjectVersion(bucket, object string) (*filer_pb
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: attempt %d/%d failed to get .versions directory for %s/%s: %v", attempt, maxRetries, bucket, object, err)
if attempt < maxRetries {
// Exponential backoff: 50ms, 100ms, 200ms, 400ms
delay := time.Millisecond * time.Duration(50 * (1 << (attempt - 1)))
// Exponential backoff with higher base: 100ms, 200ms, 400ms, 800ms, 1600ms, 3200ms, 6400ms
delay := time.Millisecond * time.Duration(100 * (1 << (attempt - 1)))
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: sleeping %v before retry %d", delay, attempt+1)
time.Sleep(delay)
}
}
@ -806,7 +807,7 @@ func (s3a *S3ApiServer) getLatestObjectVersion(bucket, object string) (*filer_pb
// before versioning was enabled on the bucket. Fall back to checking for a
// regular (non-versioned) object file.
glog.V(1).Infof("getLatestObjectVersion: no .versions directory for %s%s after %d attempts (error: %v), checking for pre-versioning object", bucket, object, maxRetries, err)
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: no .versions directory for %s/%s after %d attempts (error: %v), falling back to pre-versioning", bucket, object, maxRetries, err)
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: no .versions directory for %s/%s after %d attempts (total delay ~%dms, error: %v), falling back to pre-versioning", bucket, object, maxRetries, (100*(1<<maxRetries-1)-100), err)
regularEntry, regularErr := s3a.getEntry(bucketDir, object)
if regularErr != nil {
@ -818,20 +819,42 @@ func (s3a *S3ApiServer) getLatestObjectVersion(bucket, object string) (*filer_pb
return regularEntry, nil
}
// Check if directory has latest version metadata
// Check if directory has latest version metadata - retry if missing due to race condition
if versionsEntry.Extended == nil {
// No metadata means all versioned objects have been deleted.
// Fall back to checking for a pre-versioning object.
glog.V(2).Infof("getLatestObjectVersion: no Extended metadata in .versions directory for %s%s, checking for pre-versioning object", bucket, object)
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: .versions directory exists but NO Extended metadata for %s/%s - possible race condition", bucket, object)
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: .versions directory exists but NO Extended metadata for %s/%s - retrying to handle race condition", bucket, object)
// Retry a few times to handle the race condition where directory exists but metadata is not yet written
metadataRetries := 3
for metaAttempt := 1; metaAttempt <= metadataRetries; metaAttempt++ {
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: metadata retry %d/%d for %s/%s", metaAttempt, metadataRetries, bucket, object)
// Small delay and re-read the directory
time.Sleep(time.Millisecond * 100)
versionsEntry, err = s3a.getEntry(bucketDir, versionsObjectPath)
if err != nil {
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: metadata retry %d failed to re-read .versions for %s/%s: %v", metaAttempt, bucket, object, err)
break
}
regularEntry, regularErr := s3a.getEntry(bucketDir, object)
if regularErr != nil {
return nil, fmt.Errorf("no version metadata in .versions directory and no regular object found for %s%s", bucket, object)
if versionsEntry.Extended != nil {
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: metadata retry %d SUCCESS - found Extended metadata for %s/%s", metaAttempt, bucket, object)
break
}
}
glog.V(2).Infof("getLatestObjectVersion: found pre-versioning object for %s%s (no Extended metadata case)", bucket, object)
return regularEntry, nil
// If still no metadata after retries, fall back to pre-versioning object
if versionsEntry.Extended == nil {
glog.V(2).Infof("getLatestObjectVersion: no Extended metadata in .versions directory for %s%s after retries, checking for pre-versioning object", bucket, object)
glog.V(0).Infof("CI-DEBUG: getLatestObjectVersion: NO Extended metadata for %s/%s after %d retries - falling back to pre-versioning", bucket, object, metadataRetries)
regularEntry, regularErr := s3a.getEntry(bucketDir, object)
if regularErr != nil {
return nil, fmt.Errorf("no version metadata in .versions directory and no regular object found for %s%s", bucket, object)
}
glog.V(2).Infof("getLatestObjectVersion: found pre-versioning object for %s%s (no Extended metadata case)", bucket, object)
return regularEntry, nil
}
}
latestVersionIdBytes, hasLatestVersionId := versionsEntry.Extended[s3_constants.ExtLatestVersionIdKey]

Loading…
Cancel
Save