Switch empty-folder cleanup to bucket policy (#8292)
* Fix Spark _temporary cleanup and add issue #8285 regression test
* Generalize empty folder cleanup for Spark temp artifacts
* Revert synchronous folder pruning and add cleanup diagnostics
* Add actionable empty-folder cleanup diagnostics
* Fix Spark temp marker cleanup in async folder cleaner
* Fix Spark temp cleanup with implicit directory markers
* Keep explicit directory markers non-implicit
* logging
* more logs
* Switch empty-folder cleanup to bucket policy
* Seaweed-X-Amz-Allow-Empty-Folders
* less logs
* go vet
* less logs
* refactoring
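One of the squashed commits introduces a `Seaweed-X-Amz-Allow-Empty-Folders` extension header (registered in weed/s3api/s3_constants/extend_key.go in this change set). Below is a minimal sketch of how a client could send such a vendor header with aws-sdk-go-v2; which S3 operation the server actually reads it from is not visible in this excerpt, so the CreateBucket target and the "true" value are assumptions for illustration only.

package example

import (
	"context"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	smithyhttp "github.com/aws/smithy-go/transport/http"
)

// withHeader returns an s3.Options mutator that injects one custom HTTP
// header into every request sent through the client's middleware stack.
func withHeader(key, value string) func(*s3.Options) {
	return func(o *s3.Options) {
		o.APIOptions = append(o.APIOptions, smithyhttp.SetHeaderValue(key, value))
	}
}

// createBucketAllowingEmptyFolders is hypothetical: it assumes the header is
// honored on CreateBucket to opt the new bucket out of empty-folder cleanup.
func createBucketAllowingEmptyFolders(ctx context.Context, client *s3.Client, bucket string) error {
	_, err := client.CreateBucket(ctx, &s3.CreateBucketInput{
		Bucket: aws.String(bucket),
	}, withHeader("Seaweed-X-Amz-Allow-Empty-Folders", "true"))
	return err
}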
committed by GitHub
16 changed files with 798 additions and 157 deletions
  2   test/s3/spark/issue_8234_repro_test.go
180   test/s3/spark/issue_8285_repro_test.go
 32   test/s3/spark/setup_test.go
  7   weed/command/filer_remote_gateway_buckets.go
 68   weed/command/mount_std.go
 52   weed/command/mount_std_test.go
 30   weed/filer/empty_folder_cleanup/cleanup_queue.go
120   weed/filer/empty_folder_cleanup/cleanup_queue_test.go
169   weed/filer/empty_folder_cleanup/empty_folder_cleaner.go
171   weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go
  7   weed/filer/filer.go
  2   weed/s3api/s3_constants/extend_key.go
 17   weed/s3api/s3api_object_handlers_delete.go
  2   weed/s3api/s3api_object_handlers_put.go
 33   weed/util/buckets.go
 63   weed/util/buckets_test.go
test/s3/spark/issue_8285_repro_test.go
@@ -0,0 +1,180 @@
package spark

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"testing"
	"time"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/credentials"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
)

func TestSparkS3TemporaryDirectoryCleanupIssue8285Regression(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping Spark integration test in short mode")
	}

	env := setupSparkIssue8234Env(t)

	script := `
import pyspark.sql.functions as F

target = "s3a://test/issue-8285/output"

spark.conf.set("spark.hadoop.fs.s3a.committer.name", "directory")
spark.conf.set("spark.hadoop.fs.s3a.committer.magic.enabled", "false")
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.abort.pending.uploads", "true")
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "append")
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp")
spark.conf.set("spark.hadoop.fs.s3a.directory.marker.retention", "keep")

df = spark.range(0, 200).repartition(12).withColumn("value", F.col("id") * 2)
df.write.format("parquet").mode("overwrite").save(target)
count = spark.read.parquet(target).count()
print("WRITE_COUNT=" + str(count))
`

	code, output := runSparkPyScript(t, env.sparkContainer, script, env.s3Port)
	if code != 0 {
		t.Fatalf("Spark script exited with code %d; output:\n%s", code, output)
	}
	if !strings.Contains(output, "WRITE_COUNT=200") {
		t.Fatalf("expected write/read success marker in output, got:\n%s", output)
	}

	keys := listObjectKeysByPrefix(t, env, "test", "issue-8285/")
	var temporaryKeys []string
	for _, key := range keys {
		if hasTemporaryPathSegment(key) {
			temporaryKeys = append(temporaryKeys, key)
		}
	}

	if len(temporaryKeys) > 0 {
		t.Fatalf("issue #8285 regression detected: found lingering _temporary artifacts: %v\nall keys: %v", temporaryKeys, keys)
	}

	temporaryCandidates := []string{
		"issue-8285/output/_temporary/",
		"issue-8285/output/_temporary/0/",
		"issue-8285/output/_temporary/0/_temporary/",
	}
	lingering := waitForObjectsToDisappear(t, env, "test", temporaryCandidates, 35*time.Second)
	if len(lingering) > 0 {
		t.Fatalf("issue #8285 regression detected: lingering temporary directories: %v", lingering)
	}
}

func listObjectKeysByPrefix(t *testing.T, env *TestEnvironment, bucketName, prefix string) []string {
	t.Helper()
	client := newS3Client(env)

	pager := s3.NewListObjectsV2Paginator(client, &s3.ListObjectsV2Input{
		Bucket: aws.String(bucketName),
		Prefix: aws.String(prefix),
	})

	var keys []string
	for pager.HasMorePages() {
		page, err := pager.NextPage(context.Background())
		if err != nil {
			t.Fatalf("failed listing objects for prefix %q: %v", prefix, err)
		}
		for _, object := range page.Contents {
			keys = append(keys, aws.ToString(object.Key))
		}
	}

	return keys
}

func headObjectInfo(t *testing.T, env *TestEnvironment, bucketName, key string) (bool, string, error) {
	t.Helper()

	client := newS3Client(env)
	output, err := client.HeadObject(context.Background(), &s3.HeadObjectInput{
		Bucket: aws.String(bucketName),
		Key:    aws.String(key),
	})
	if err == nil {
		return true, aws.ToString(output.ContentType), nil
	}

	var notFound *s3types.NotFound
	if strings.Contains(err.Error(), "NotFound") || strings.Contains(err.Error(), "NoSuchKey") || errors.As(err, &notFound) {
		return false, "", nil
	}
	return false, "", err
}

func waitForObjectsToDisappear(t *testing.T, env *TestEnvironment, bucketName string, keys []string, timeout time.Duration) []string {
	t.Helper()

	deadline := time.Now().Add(timeout)
	pending := make(map[string]struct{}, len(keys))
	details := make(map[string]string, len(keys))
	for _, key := range keys {
		pending[key] = struct{}{}
	}

	for len(pending) > 0 && time.Now().Before(deadline) {
		for key := range pending {
			exists, contentType, err := headObjectInfo(t, env, bucketName, key)
			if err != nil {
				details[key] = fmt.Sprintf("%s (head_error=%v)", key, err)
				continue
			}
			if !exists {
				delete(pending, key)
				delete(details, key)
				continue
			}
			details[key] = fmt.Sprintf("%s (exists=true, contentType=%q)", key, contentType)
		}
		if len(pending) > 0 {
			time.Sleep(2 * time.Second)
		}
	}

	if len(pending) == 0 {
		return nil
	}

	var lingering []string
	for _, key := range keys {
		if _, ok := pending[key]; !ok {
			continue
		}
		if detail, hasDetail := details[key]; hasDetail {
			lingering = append(lingering, detail)
		} else {
			lingering = append(lingering, key)
		}
	}
	return lingering
}

func newS3Client(env *TestEnvironment) *s3.Client {
	cfg := aws.Config{
		Region:       "us-east-1",
		Credentials:  aws.NewCredentialsCache(credentials.NewStaticCredentialsProvider(env.accessKey, env.secretKey, "")),
		BaseEndpoint: aws.String(fmt.Sprintf("http://localhost:%d", env.s3Port)),
	}
	return s3.NewFromConfig(cfg, func(o *s3.Options) {
		o.UsePathStyle = true
	})
}

func hasTemporaryPathSegment(key string) bool {
	for _, segment := range strings.Split(strings.TrimSuffix(key, "/"), "/") {
		if segment == "_temporary" {
			return true
		}
	}
	return false
}
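The regression check above runs in two stages: an immediate listing of everything under issue-8285/ must contain no _temporary path segment, and waitForObjectsToDisappear then polls the known committer marker directories for up to 35 seconds, since the commit messages above describe the empty-folder cleaner as asynchronous.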
weed/command/mount_std_test.go
@@ -0,0 +1,52 @@
//go:build linux || darwin || freebsd
// +build linux darwin freebsd

package command

import "testing"

func Test_bucketPathForMountRoot(t *testing.T) {
	tests := []struct {
		name      string
		mountRoot string
		expected  string
		ok        bool
	}{
		{
			name:      "bucket root mount",
			mountRoot: "/buckets/test",
			expected:  "/buckets/test",
			ok:        true,
		},
		{
			name:      "bucket root with trailing slash",
			mountRoot: "/buckets/test/",
			expected:  "/buckets/test",
			ok:        true,
		},
		{
			name:      "subdirectory mount",
			mountRoot: "/buckets/test/data",
			expected:  "",
			ok:        false,
		},
		{
			name:      "non-bucket mount",
			mountRoot: "/data/test",
			expected:  "",
			ok:        false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			gotPath, gotOK := bucketPathForMountRoot(tt.mountRoot, "/buckets")
			if gotOK != tt.ok {
				t.Fatalf("expected ok=%v, got %v", tt.ok, gotOK)
			}
			if gotPath != tt.expected {
				t.Fatalf("expected path %q, got %q", tt.expected, gotPath)
			}
		})
	}
}
weed/util/buckets.go
@@ -0,0 +1,33 @@
package util

import "strings"

// ExtractBucketPath returns the bucket path under basePath that contains target.
// If requireChild is true, the target must include additional segments beyond the bucket itself.
func ExtractBucketPath(basePath, target string, requireChild bool) (string, bool) {
	cleanBase := strings.TrimSuffix(basePath, "/")
	if cleanBase == "" {
		return "", false
	}

	prefix := cleanBase + "/"
	if !strings.HasPrefix(target, prefix) {
		return "", false
	}

	rest := strings.TrimPrefix(target, prefix)
	if rest == "" {
		return "", false
	}

	bucketName, _, found := strings.Cut(rest, "/")
	if bucketName == "" {
		return "", false
	}

	if requireChild && !found {
		return "", false
	}

	return prefix + bucketName, true
}
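A note on the requireChild mechanics: strings.Cut reports found=false when rest contains no further "/", so ExtractBucketPath("/buckets", "/buckets/test", true) fails while ExtractBucketPath("/buckets", "/buckets/test/sub", true) returns "/buckets/test". The table-driven test below exercises both cases.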
weed/util/buckets_test.go
@@ -0,0 +1,63 @@
package util

import "testing"

func TestExtractBucketPath(t *testing.T) {
	for _, tt := range []struct {
		name         string
		base         string
		target       string
		requireChild bool
		expected     string
		ok           bool
	}{
		{
			name:         "child paths return bucket",
			base:         "/buckets",
			target:       "/buckets/test/folder/file",
			requireChild: true,
			expected:     "/buckets/test",
			ok:           true,
		},
		{
			name:         "bucket root without child fails when required",
			base:         "/buckets",
			target:       "/buckets/test",
			requireChild: true,
			ok:           false,
		},
		{
			name:         "bucket root allowed when not required",
			base:         "/buckets",
			target:       "/buckets/test",
			requireChild: false,
			expected:     "/buckets/test",
			ok:           true,
		},
		{
			name:         "path outside buckets fails",
			base:         "/buckets",
			target:       "/data/test/folder",
			requireChild: true,
			ok:           false,
		},
		{
			name:         "trailing slash on base is normalized",
			base:         "/buckets/",
			target:       "/buckets/test/sub",
			requireChild: true,
			expected:     "/buckets/test",
			ok:           true,
		},
	} {
		t.Run(tt.name, func(t *testing.T) {
			got, ok := ExtractBucketPath(tt.base, tt.target, tt.requireChild)
			if ok != tt.ok {
				t.Fatalf("expected ok=%v, got %v", tt.ok, ok)
			}
			if got != tt.expected {
				t.Fatalf("expected path %q, got %q", tt.expected, got)
			}
		})
	}
}