5 changed files with 353 additions and 40 deletions
-
180 test/s3/spark/issue_8285_repro_test.go
-
25 weed/filer/empty_folder_cleanup/empty_folder_cleaner.go
-
23 weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go
-
106 weed/s3api/s3api_object_handlers_delete.go
-
59 weed/s3api/s3api_object_handlers_put.go
@ -0,0 +1,180 @@ |
|||||
|
package spark |
||||
|
|
||||
|
import ( |
||||
|
"context" |
||||
|
"errors" |
||||
|
"fmt" |
||||
|
"strings" |
||||
|
"testing" |
||||
|
"time" |
||||
|
|
||||
|
"github.com/aws/aws-sdk-go-v2/aws" |
||||
|
"github.com/aws/aws-sdk-go-v2/credentials" |
||||
|
"github.com/aws/aws-sdk-go-v2/service/s3" |
||||
|
s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" |
||||
|
) |
||||
|
|
||||
|
func TestSparkS3TemporaryDirectoryCleanupIssue8285Regression(t *testing.T) { |
||||
|
if testing.Short() { |
||||
|
t.Skip("Skipping Spark integration test in short mode") |
||||
|
} |
||||
|
|
||||
|
env := setupSparkIssue8234Env(t) |
||||
|
|
||||
|
script := ` |
||||
|
import pyspark.sql.functions as F |
||||
|
|
||||
|
target = "s3a://test/issue-8285/output" |
||||
|
|
||||
|
spark.conf.set("spark.hadoop.fs.s3a.committer.name", "directory") |
||||
|
spark.conf.set("spark.hadoop.fs.s3a.committer.magic.enabled", "false") |
||||
|
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.abort.pending.uploads", "true") |
||||
|
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "append") |
||||
|
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp") |
||||
|
spark.conf.set("spark.hadoop.fs.s3a.directory.marker.retention", "keep") |
||||
|
|
||||
|
df = spark.range(0, 200).repartition(12).withColumn("value", F.col("id") * 2) |
||||
|
df.write.format("parquet").mode("overwrite").save(target) |
||||
|
count = spark.read.parquet(target).count() |
||||
|
print("WRITE_COUNT=" + str(count)) |
||||
|
` |
||||
|
|
||||
|
code, output := runSparkPyScript(t, env.sparkContainer, script, env.s3Port) |
||||
|
if code != 0 { |
||||
|
t.Fatalf("Spark script exited with code %d; output:\n%s", code, output) |
||||
|
} |
||||
|
if !strings.Contains(output, "WRITE_COUNT=200") { |
||||
|
t.Fatalf("expected write/read success marker in output, got:\n%s", output) |
||||
|
} |
||||
|
|
||||
|
keys := listObjectKeysByPrefix(t, env, "test", "issue-8285/") |
||||
|
var temporaryKeys []string |
||||
|
for _, key := range keys { |
||||
|
if hasTemporaryPathSegment(key) { |
||||
|
temporaryKeys = append(temporaryKeys, key) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if len(temporaryKeys) > 0 { |
||||
|
t.Fatalf("issue #8285 regression detected: found lingering _temporary artifacts: %v\nall keys: %v", temporaryKeys, keys) |
||||
|
} |
||||
|
|
||||
|
temporaryCandidates := []string{ |
||||
|
"issue-8285/output/_temporary/", |
||||
|
"issue-8285/output/_temporary/0/", |
||||
|
"issue-8285/output/_temporary/0/_temporary/", |
||||
|
} |
||||
|
lingering := waitForObjectsToDisappear(t, env, "test", temporaryCandidates, 35*time.Second) |
||||
|
if len(lingering) > 0 { |
||||
|
t.Fatalf("issue #8285 regression detected: lingering temporary directories: %v", lingering) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func listObjectKeysByPrefix(t *testing.T, env *TestEnvironment, bucketName, prefix string) []string { |
||||
|
t.Helper() |
||||
|
client := newS3Client(env) |
||||
|
|
||||
|
pager := s3.NewListObjectsV2Paginator(client, &s3.ListObjectsV2Input{ |
||||
|
Bucket: aws.String(bucketName), |
||||
|
Prefix: aws.String(prefix), |
||||
|
}) |
||||
|
|
||||
|
var keys []string |
||||
|
for pager.HasMorePages() { |
||||
|
page, err := pager.NextPage(context.Background()) |
||||
|
if err != nil { |
||||
|
t.Fatalf("failed listing objects for prefix %q: %v", prefix, err) |
||||
|
} |
||||
|
for _, object := range page.Contents { |
||||
|
keys = append(keys, aws.ToString(object.Key)) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return keys |
||||
|
} |
||||
|
|
||||
|
func headObjectInfo(t *testing.T, env *TestEnvironment, bucketName, key string) (bool, string, error) { |
||||
|
t.Helper() |
||||
|
|
||||
|
client := newS3Client(env) |
||||
|
output, err := client.HeadObject(context.Background(), &s3.HeadObjectInput{ |
||||
|
Bucket: aws.String(bucketName), |
||||
|
Key: aws.String(key), |
||||
|
}) |
||||
|
if err == nil { |
||||
|
return true, aws.ToString(output.ContentType), nil |
||||
|
} |
||||
|
|
||||
|
var notFound *s3types.NotFound |
||||
|
if strings.Contains(err.Error(), "NotFound") || strings.Contains(err.Error(), "NoSuchKey") || errors.As(err, ¬Found) { |
||||
|
return false, "", nil |
||||
|
} |
||||
|
return false, "", err |
||||
|
} |
||||
|
|
||||
|
func waitForObjectsToDisappear(t *testing.T, env *TestEnvironment, bucketName string, keys []string, timeout time.Duration) []string { |
||||
|
t.Helper() |
||||
|
|
||||
|
deadline := time.Now().Add(timeout) |
||||
|
pending := make(map[string]struct{}, len(keys)) |
||||
|
details := make(map[string]string, len(keys)) |
||||
|
for _, key := range keys { |
||||
|
pending[key] = struct{}{} |
||||
|
} |
||||
|
|
||||
|
for len(pending) > 0 && time.Now().Before(deadline) { |
||||
|
for key := range pending { |
||||
|
exists, contentType, err := headObjectInfo(t, env, bucketName, key) |
||||
|
if err != nil { |
||||
|
details[key] = fmt.Sprintf("%s (head_error=%v)", key, err) |
||||
|
continue |
||||
|
} |
||||
|
if !exists { |
||||
|
delete(pending, key) |
||||
|
delete(details, key) |
||||
|
continue |
||||
|
} |
||||
|
details[key] = fmt.Sprintf("%s (exists=true, contentType=%q)", key, contentType) |
||||
|
} |
||||
|
if len(pending) > 0 { |
||||
|
time.Sleep(2 * time.Second) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if len(pending) == 0 { |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
var lingering []string |
||||
|
for _, key := range keys { |
||||
|
if _, ok := pending[key]; !ok { |
||||
|
continue |
||||
|
} |
||||
|
if detail, hasDetail := details[key]; hasDetail { |
||||
|
lingering = append(lingering, detail) |
||||
|
} else { |
||||
|
lingering = append(lingering, key) |
||||
|
} |
||||
|
} |
||||
|
return lingering |
||||
|
} |
||||
|
|
||||
|
func newS3Client(env *TestEnvironment) *s3.Client { |
||||
|
cfg := aws.Config{ |
||||
|
Region: "us-east-1", |
||||
|
Credentials: aws.NewCredentialsCache(credentials.NewStaticCredentialsProvider(env.accessKey, env.secretKey, "")), |
||||
|
BaseEndpoint: aws.String(fmt.Sprintf("http://localhost:%d", env.s3Port)), |
||||
|
} |
||||
|
return s3.NewFromConfig(cfg, func(o *s3.Options) { |
||||
|
o.UsePathStyle = true |
||||
|
}) |
||||
|
} |
||||
|
|
||||
|
// hasTemporaryPathSegment reports whether any "/"-separated segment of key is
// exactly "_temporary". A single trailing "/" is ignored. Segments that merely
// contain the word (e.g. "_temporary_x") do not match.
func hasTemporaryPathSegment(key string) bool {
	rest := strings.TrimSuffix(key, "/")
	for {
		segment, tail, more := strings.Cut(rest, "/")
		if segment == "_temporary" {
			return true
		}
		if !more {
			// No separator left: segment was the final one.
			return false
		}
		rest = tail
	}
}
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue