diff --git a/.github/workflows/java_integration_tests.yml b/.github/workflows/java_integration_tests.yml
index c9efe580c..64c2cabbb 100644
--- a/.github/workflows/java_integration_tests.yml
+++ b/.github/workflows/java_integration_tests.yml
@@ -53,6 +53,7 @@ jobs:
           mkdir -p "$WEED_DATA_DIR"

           # Start SeaweedFS with optimized settings for CI
+          # Include S3 API for s3copier integration tests
           weed server -dir="$WEED_DATA_DIR" \
             -master.raftHashicorp \
             -master.electionTimeout=1s \
@@ -61,6 +62,9 @@
             -volume.preStopSeconds=1 \
             -master.peers=none \
             -filer -filer.maxMB=64 \
+            -s3 -s3.port=8333 \
+            -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" \
+            -s3.allowDeleteBucketNotEmpty=true \
             -master.port=9333 \
             -volume.port=8080 \
             -filer.port=8888 \
@@ -105,6 +109,16 @@
             sleep 2
           done

+          # Wait for S3 API
+          for i in {1..30}; do
+            if curl -s http://localhost:8333/ > /dev/null 2>&1; then
+              echo "✓ S3 API is ready"
+              break
+            fi
+            echo "Waiting for S3 API... ($i/30)"
+            sleep 2
+          done
+
           echo "✓ All SeaweedFS components are ready!"

           # Display cluster status
@@ -133,6 +147,20 @@
         run: |
           mvn test -Dtest=SeaweedFileSystemConfigTest -Dmaven.javadoc.skip=true -Dgpg.skip=true

+      - name: Run S3 ETag Validation Tests (Issue #7768)
+        working-directory: other/java/s3copier
+        env:
+          S3_ENDPOINT: http://127.0.0.1:8333
+          S3_ACCESS_KEY: some_access_key1
+          S3_SECRET_KEY: some_secret_key1
+        run: |
+          echo "Running S3 ETag validation tests against $S3_ENDPOINT"
+          mvn test -Dtest=ETagValidationTest \
+            -DS3_ENDPOINT=$S3_ENDPOINT \
+            -DS3_ACCESS_KEY=$S3_ACCESS_KEY \
+            -DS3_SECRET_KEY=$S3_SECRET_KEY \
+            -Dmaven.javadoc.skip=true -Dgpg.skip=true
+
       - name: Display logs on failure
         if: failure()
         run: |
diff --git a/other/java/s3copier/README.md b/other/java/s3copier/README.md
new file mode 100644
index 000000000..010ccd102
--- /dev/null
+++ b/other/java/s3copier/README.md
@@ -0,0 +1,110 @@
+# SeaweedFS S3 Java SDK Compatibility Tests
+
+This project contains Java-based integration tests for SeaweedFS S3 API compatibility.
+
+## Overview
+
+Tests are provided for both AWS SDK v1 and v2 to ensure compatibility with the SDK versions commonly used in production.
+
+## SDK Versions
+
+| SDK | Version | Notes |
+|-----|---------|-------|
+| AWS SDK v1 for Java | 1.12.600 | Legacy SDK, less strict ETag validation |
+| AWS SDK v2 for Java | 2.20.127 | Modern SDK with strict checksum validation |
+
+## Running Tests
+
+### Prerequisites
+
+1. SeaweedFS running with S3 API enabled:
+   ```bash
+   weed server -s3
+   ```
+
+2. Java 11+ and Maven
+
+### Run All Tests
+
+```bash
+mvn test
+```
+
+### Run Specific Tests
+
+```bash
+# Run only ETag validation tests (AWS SDK v2)
+mvn test -Dtest=ETagValidationTest
+
+# Run with custom endpoint
+mvn test -Dtest=ETagValidationTest -DS3_ENDPOINT=http://localhost:8333
+```
+
+### Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `S3_ENDPOINT` | `http://127.0.0.1:8333` | S3 API endpoint URL |
+| `S3_ACCESS_KEY` | `some_access_key1` | Access key ID |
+| `S3_SECRET_KEY` | `some_secret_key1` | Secret access key |
+| `S3_REGION` | `us-east-1` | AWS region |
+
+## Test Coverage
+
+### ETagValidationTest (AWS SDK v2)
+
+Tests for [GitHub Issue #7768](https://github.com/seaweedfs/seaweedfs/issues/7768) - ETag format validation.
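+
+The failure these tests guard against is easiest to see in a plain `PutObject` call. A minimal sketch with AWS SDK v2 (bucket, key, and `data` are placeholders):
+
+```java
+// SDK v2 hex-decodes the returned ETag as part of its checksum validation;
+// before the fix, a composite "<md5>-<count>" ETag from a >8MB upload made
+// this call throw "Invalid base 16 character: '-'".
+PutObjectResponse resp = s3Client.putObject(
+        PutObjectRequest.builder().bucket("my-bucket").key("large.bin").build(),
+        RequestBody.fromBytes(data));
+String md5Hex = resp.eTag().replace("\"", ""); // expected: pure 32-char MD5 hex
+```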
+
+| Test | Description |
+|------|-------------|
+| `testSmallFilePutObject` | Verify small files return pure MD5 ETag |
+| `testLargeFilePutObject_Issue7768` | **Critical**: Verify large files (>8MB) return pure MD5 ETag |
+| `testExtraLargeFilePutObject` | Verify very large files (>24MB) return pure MD5 ETag |
+| `testMultipartUploadETag` | Verify multipart uploads return composite ETag |
+| `testETagConsistency` | Verify ETag consistency across PUT/HEAD/GET |
+| `testMultipleLargeFileUploads` | Stress test multiple large uploads |
+
+### Background: Issue #7768
+
+AWS SDK v2 for Java includes checksum validation that decodes the ETag as hexadecimal. When SeaweedFS returned composite ETags (`<md5>-<partCount>`) for regular `PutObject` with internally auto-chunked files, the SDK failed with:
+
+```
+java.lang.IllegalArgumentException: Invalid base 16 character: '-'
+```
+
+**Per AWS S3 specification:**
+- `PutObject`: ETag is always a pure MD5 hex string (32 chars)
+- `CompleteMultipartUpload`: ETag is composite format (`<md5>-<partCount>`)
+
+The fix ensures SeaweedFS follows this specification.
+
+## Project Structure
+
+```
+src/
+├── main/java/com/seaweedfs/s3/
+│   ├── PutObject.java                 # Example PutObject with SDK v1
+│   └── HighLevelMultipartUpload.java
+└── test/java/com/seaweedfs/s3/
+    ├── PutObjectTest.java             # Basic SDK v1 test
+    └── ETagValidationTest.java        # Comprehensive SDK v2 ETag tests
+```
+
+## Validated SDK Versions
+
+This Java test project validates:
+
+- ✅ AWS SDK v2 for Java 2.20.127+
+- ✅ AWS SDK v1 for Java 1.12.600+
+
+Go SDK validation is performed by separate test suites:
+- See [Go ETag Tests](/test/s3/etag/) for AWS SDK v2 for Go tests
+- See [test/s3/SDK_COMPATIBILITY.md](/test/s3/SDK_COMPATIBILITY.md) for the full SDK compatibility matrix
+
+## Related
+
+- [GitHub Issue #7768](https://github.com/seaweedfs/seaweedfs/issues/7768)
+- [AWS S3 ETag Documentation](https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html)
+- [Go ETag Tests](/test/s3/etag/)
+- [SDK Compatibility Matrix](/test/s3/SDK_COMPATIBILITY.md)
diff --git a/other/java/s3copier/pom.xml b/other/java/s3copier/pom.xml
index 0050c70da..734289ee9 100644
--- a/other/java/s3copier/pom.xml
+++ b/other/java/s3copier/pom.xml
@@ -6,18 +6,29 @@
   <packaging>jar</packaging>
   <version>1.0-SNAPSHOT</version>
   <properties>
-    <maven.compiler.source>18</maven.compiler.source>
-    <maven.compiler.target>18</maven.compiler.target>
+    <maven.compiler.source>11</maven.compiler.source>
+    <maven.compiler.target>11</maven.compiler.target>
+    <aws.sdk.v1.version>1.12.600</aws.sdk.v1.version>
+    <aws.sdk.v2.version>2.20.127</aws.sdk.v2.version>
   </properties>
   <name>copier</name>
   <url>http://maven.apache.org</url>
   <dependencyManagement>
     <dependencies>
       <dependency>
         <groupId>com.amazonaws</groupId>
         <artifactId>aws-java-sdk-bom</artifactId>
-        <version>1.11.327</version>
+        <version>${aws.sdk.v1.version}</version>
+        <type>pom</type>
+        <scope>import</scope>
+      </dependency>
+      <dependency>
+        <groupId>software.amazon.awssdk</groupId>
+        <artifactId>bom</artifactId>
+        <version>${aws.sdk.v2.version}</version>
         <type>pom</type>
         <scope>import</scope>
       </dependency>
@@ -25,15 +36,50 @@
     </dependencies>
   </dependencyManagement>
   <dependencies>
     <dependency>
       <groupId>com.amazonaws</groupId>
       <artifactId>aws-java-sdk-s3</artifactId>
     </dependency>
+    <dependency>
+      <groupId>software.amazon.awssdk</groupId>
+      <artifactId>s3</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>software.amazon.awssdk</groupId>
+      <artifactId>s3-transfer-manager</artifactId>
+    </dependency>
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
-      <version>4.13.1</version>
+      <version>4.13.2</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter</artifactId>
+      <version>5.10.0</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.assertj</groupId>
+      <artifactId>assertj-core</artifactId>
+      <version>3.24.2</version>
       <scope>test</scope>
     </dependency>
   </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <version>3.2.2</version>
+      </plugin>
+    </plugins>
+  </build>
 </project>
diff --git a/other/java/s3copier/src/test/java/com/seaweedfs/s3/ETagValidationTest.java b/other/java/s3copier/src/test/java/com/seaweedfs/s3/ETagValidationTest.java
new file mode 100644
index 000000000..7e0009b19
--- /dev/null
+++ b/other/java/s3copier/src/test/java/com/seaweedfs/s3/ETagValidationTest.java
@@ -0,0 +1,439 @@
+package com.seaweedfs.s3;
+
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable;
+import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
+import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.model.*;
+
+import java.net.URI;
+import java.security.MessageDigest;
+import java.security.SecureRandom;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+import java.util.regex.Pattern;
+
+import static org.assertj.core.api.Assertions.*;
+
+/**
+ * AWS SDK v2 Integration Tests for S3 ETag Format Validation.
+ *
+ * These tests verify that SeaweedFS returns correct ETag formats that are
+ * compatible with AWS SDK v2's checksum validation.
+ *
+ * Background (GitHub Issue #7768):
+ * AWS SDK v2 for Java validates ETags as hexadecimal MD5 hashes for PutObject
+ * responses. If the ETag contains non-hex characters (like '-' in composite
+ * format), the SDK fails with "Invalid base 16 character: '-'".
+ *
+ * Per AWS S3 specification:
+ * - Regular PutObject: ETag is always a pure MD5 hex string (32 chars)
+ * - CompleteMultipartUpload: ETag is composite format "<md5>-<partCount>"
+ *
+ * To run these tests:
+ *   mvn test -Dtest=ETagValidationTest -DS3_ENDPOINT=http://localhost:8333
+ *
+ * Or set environment variable:
+ *   export S3_ENDPOINT=http://localhost:8333
+ *   mvn test -Dtest=ETagValidationTest
+ */
+@TestInstance(TestInstance.Lifecycle.PER_CLASS)
+@DisplayName("S3 ETag Format Validation Tests (AWS SDK v2)")
+class ETagValidationTest {
+
+    // Configuration - can be overridden via system properties or environment variables
+    private static final String DEFAULT_ENDPOINT = "http://127.0.0.1:8333";
+    private static final String DEFAULT_ACCESS_KEY = "some_access_key1";
+    private static final String DEFAULT_SECRET_KEY = "some_secret_key1";
+    private static final String DEFAULT_REGION = "us-east-1";
+
+    // Auto-chunking threshold in SeaweedFS (must match s3api_object_handlers_put.go)
+    private static final int AUTO_CHUNK_SIZE = 8 * 1024 * 1024; // 8MB
+
+    // Test sizes
+    private static final int SMALL_FILE_SIZE = 1024; // 1KB
+    private static final int LARGE_FILE_SIZE = 10 * 1024 * 1024; // 10MB (triggers auto-chunking)
+    private static final int XL_FILE_SIZE = 25 * 1024 * 1024; // 25MB (multiple chunks)
+    private static final int MULTIPART_PART_SIZE = 5 * 1024 * 1024; // 5MB per part
+
+    // ETag format patterns
+    private static final Pattern PURE_MD5_PATTERN = Pattern.compile("^\"?[a-f0-9]{32}\"?$");
+    private static final Pattern COMPOSITE_PATTERN = Pattern.compile("^\"?[a-f0-9]{32}-\\d+\"?$");
+
+    private S3Client s3Client;
+    private String testBucketName;
+    private final SecureRandom random = new SecureRandom();
+
+    @BeforeAll
+    void setUp() {
+        String endpoint = getConfig("S3_ENDPOINT", DEFAULT_ENDPOINT);
+        String accessKey = getConfig("S3_ACCESS_KEY", DEFAULT_ACCESS_KEY);
+        String secretKey = getConfig("S3_SECRET_KEY", DEFAULT_SECRET_KEY);
+        String region = getConfig("S3_REGION", DEFAULT_REGION);
+
+        System.out.println("Connecting to S3 endpoint: " + endpoint);
+
+        s3Client = S3Client.builder()
+                .endpointOverride(URI.create(endpoint))
+                .credentialsProvider(StaticCredentialsProvider.create(
+                        AwsBasicCredentials.create(accessKey, secretKey)))
+                .region(Region.of(region))
+                .forcePathStyle(true) // Required for SeaweedFS
+                .build();
+
+        // Create test bucket
+        testBucketName = "test-etag-" + UUID.randomUUID().toString().substring(0, 8);
+        s3Client.createBucket(CreateBucketRequest.builder()
+                .bucket(testBucketName)
+                .build());
+
+        System.out.println("Created test bucket: " + testBucketName);
+    }
+
+    @AfterAll
+    void tearDown() {
+        if (s3Client != null && testBucketName != null) {
+            try 
{ + // Delete all objects with pagination + String continuationToken = null; + do { + ListObjectsV2Response listResp = s3Client.listObjectsV2( + ListObjectsV2Request.builder() + .bucket(testBucketName) + .continuationToken(continuationToken) + .build()); + for (S3Object obj : listResp.contents()) { + s3Client.deleteObject(DeleteObjectRequest.builder() + .bucket(testBucketName) + .key(obj.key()) + .build()); + } + continuationToken = listResp.nextContinuationToken(); + } while (continuationToken != null); + + // Abort any multipart uploads + ListMultipartUploadsResponse mpResp = s3Client.listMultipartUploads( + ListMultipartUploadsRequest.builder().bucket(testBucketName).build()); + for (MultipartUpload upload : mpResp.uploads()) { + s3Client.abortMultipartUpload(AbortMultipartUploadRequest.builder() + .bucket(testBucketName) + .key(upload.key()) + .uploadId(upload.uploadId()) + .build()); + } + + // Delete bucket + s3Client.deleteBucket(DeleteBucketRequest.builder() + .bucket(testBucketName) + .build()); + + System.out.println("Cleaned up test bucket: " + testBucketName); + } catch (Exception e) { + System.err.println("Warning: Failed to cleanup test bucket: " + e.getMessage()); + } + s3Client.close(); + } + } + + @Test + @DisplayName("Small file PutObject should return pure MD5 ETag") + void testSmallFilePutObject() throws Exception { + byte[] testData = generateRandomData(SMALL_FILE_SIZE); + String expectedMD5 = calculateMD5Hex(testData); + String objectKey = "small-file-" + UUID.randomUUID() + ".bin"; + + PutObjectResponse response = s3Client.putObject( + PutObjectRequest.builder() + .bucket(testBucketName) + .key(objectKey) + .build(), + RequestBody.fromBytes(testData)); + + String etag = response.eTag(); + System.out.println("Small file ETag: " + etag + " (expected MD5: " + expectedMD5 + ")"); + + assertThat(etag) + .describedAs("Small file ETag should be pure MD5") + .matches(PURE_MD5_PATTERN); + assertThat(cleanETag(etag)) + .describedAs("ETag should match calculated MD5") + .isEqualTo(expectedMD5); + assertThat(etag) + .describedAs("ETag should not contain hyphen") + .doesNotContain("-"); + } + + /** + * Critical test for GitHub Issue #7768. + * + * This test uploads a file larger than the auto-chunking threshold (8MB), + * which triggers SeaweedFS to split the file into multiple internal chunks. + * + * Previously, this caused SeaweedFS to return a composite ETag like + * "d41d8cd98f00b204e9800998ecf8427e-2", which AWS SDK v2 rejected because + * it validates the ETag as hexadecimal and '-' is not a valid hex character. + * + * The fix ensures that regular PutObject always returns a pure MD5 ETag, + * regardless of internal chunking. 
+     */
+    @Test
+    @DisplayName("Large file PutObject (>8MB) should return pure MD5 ETag - Issue #7768")
+    void testLargeFilePutObject_Issue7768() throws Exception {
+        byte[] testData = generateRandomData(LARGE_FILE_SIZE);
+        String expectedMD5 = calculateMD5Hex(testData);
+        String objectKey = "large-file-" + UUID.randomUUID() + ".bin";
+
+        System.out.println("Uploading large file (" + LARGE_FILE_SIZE + " bytes, " +
+                "> " + AUTO_CHUNK_SIZE + " byte auto-chunk threshold)...");
+
+        // This is where Issue #7768 would manifest - SDK v2 validates ETag
+        PutObjectResponse response = s3Client.putObject(
+                PutObjectRequest.builder()
+                        .bucket(testBucketName)
+                        .key(objectKey)
+                        .build(),
+                RequestBody.fromBytes(testData));
+
+        String etag = response.eTag();
+        int expectedChunks = (LARGE_FILE_SIZE / AUTO_CHUNK_SIZE) + 1;
+        System.out.println("Large file ETag: " + etag +
+                " (expected MD5: " + expectedMD5 + ", internal chunks: ~" + expectedChunks + ")");
+
+        // These assertions would fail before the fix
+        assertThat(etag)
+                .describedAs("Large file PutObject ETag MUST be pure MD5 (not composite)")
+                .matches(PURE_MD5_PATTERN);
+        assertThat(etag)
+                .describedAs("Large file ETag should NOT be composite format")
+                .doesNotMatch(COMPOSITE_PATTERN);
+        assertThat(etag)
+                .describedAs("ETag should not contain hyphen for regular PutObject")
+                .doesNotContain("-");
+        assertThat(cleanETag(etag))
+                .describedAs("ETag should match calculated MD5")
+                .isEqualTo(expectedMD5);
+
+        // Verify hex decoding works (this is what fails in Issue #7768)
+        assertThatCode(() -> hexToBytes(cleanETag(etag)))
+                .describedAs("ETag should be valid hexadecimal (AWS SDK v2 validation)")
+                .doesNotThrowAnyException();
+    }
+
+    @Test
+    @DisplayName("Extra large file PutObject (>24MB) should return pure MD5 ETag")
+    void testExtraLargeFilePutObject() throws Exception {
+        byte[] testData = generateRandomData(XL_FILE_SIZE);
+        String expectedMD5 = calculateMD5Hex(testData);
+        String objectKey = "xl-file-" + UUID.randomUUID() + ".bin";
+
+        int expectedChunks = (XL_FILE_SIZE / AUTO_CHUNK_SIZE) + 1;
+        System.out.println("Uploading XL file (" + XL_FILE_SIZE + " bytes, ~" +
+                expectedChunks + " internal chunks)...");
+
+        PutObjectResponse response = s3Client.putObject(
+                PutObjectRequest.builder()
+                        .bucket(testBucketName)
+                        .key(objectKey)
+                        .build(),
+                RequestBody.fromBytes(testData));
+
+        String etag = response.eTag();
+        System.out.println("XL file ETag: " + etag);
+
+        assertThat(etag)
+                .describedAs("XL file PutObject ETag MUST be pure MD5")
+                .matches(PURE_MD5_PATTERN);
+        assertThat(cleanETag(etag))
+                .describedAs("ETag should match calculated MD5")
+                .isEqualTo(expectedMD5);
+    }
+
+    @Test
+    @DisplayName("Multipart upload should return composite ETag")
+    void testMultipartUploadETag() throws Exception {
+        int totalSize = 15 * 1024 * 1024; // 15MB = 3 parts
+        byte[] testData = generateRandomData(totalSize);
+        String objectKey = "multipart-file-" + UUID.randomUUID() + ".bin";
+
+        System.out.println("Performing multipart upload (" + totalSize + " bytes)...");
+
+        // Initiate multipart upload
+        CreateMultipartUploadResponse createResp = s3Client.createMultipartUpload(
+                CreateMultipartUploadRequest.builder()
+                        .bucket(testBucketName)
+                        .key(objectKey)
+                        .build());
+        String uploadId = createResp.uploadId();
+
+        List<CompletedPart> completedParts = new ArrayList<>();
+        int partNumber = 1;
+
+        // Upload parts
+        for (int offset = 0; offset < totalSize; offset += MULTIPART_PART_SIZE) {
+            int end = Math.min(offset + MULTIPART_PART_SIZE, totalSize);
+            byte[] partData 
= new byte[end - offset]; + System.arraycopy(testData, offset, partData, 0, partData.length); + + UploadPartResponse uploadResp = s3Client.uploadPart( + UploadPartRequest.builder() + .bucket(testBucketName) + .key(objectKey) + .uploadId(uploadId) + .partNumber(partNumber) + .build(), + RequestBody.fromBytes(partData)); + + completedParts.add(CompletedPart.builder() + .partNumber(partNumber) + .eTag(uploadResp.eTag()) + .build()); + partNumber++; + } + + // Complete multipart upload + CompleteMultipartUploadResponse completeResp = s3Client.completeMultipartUpload( + CompleteMultipartUploadRequest.builder() + .bucket(testBucketName) + .key(objectKey) + .uploadId(uploadId) + .multipartUpload(CompletedMultipartUpload.builder() + .parts(completedParts) + .build()) + .build()); + + String etag = completeResp.eTag(); + System.out.println("Multipart upload ETag: " + etag + " (" + completedParts.size() + " parts)"); + + // Multipart uploads SHOULD have composite ETag + assertThat(etag) + .describedAs("Multipart upload ETag SHOULD be composite format") + .matches(COMPOSITE_PATTERN); + assertThat(etag) + .describedAs("Multipart ETag should contain hyphen") + .contains("-"); + + // Verify part count in ETag + String[] parts = cleanETag(etag).split("-"); + assertThat(parts).hasSize(2); + assertThat(parts[1]) + .describedAs("Part count in ETag should match uploaded parts") + .isEqualTo(String.valueOf(completedParts.size())); + } + + @Test + @DisplayName("ETag should be consistent across PUT, HEAD, and GET") + void testETagConsistency() throws Exception { + byte[] testData = generateRandomData(LARGE_FILE_SIZE); + String objectKey = "consistency-test-" + UUID.randomUUID() + ".bin"; + + // PUT + PutObjectResponse putResp = s3Client.putObject( + PutObjectRequest.builder() + .bucket(testBucketName) + .key(objectKey) + .build(), + RequestBody.fromBytes(testData)); + String putETag = putResp.eTag(); + + // HEAD + HeadObjectResponse headResp = s3Client.headObject( + HeadObjectRequest.builder() + .bucket(testBucketName) + .key(objectKey) + .build()); + String headETag = headResp.eTag(); + + // GET + GetObjectResponse getResp = s3Client.getObject( + GetObjectRequest.builder() + .bucket(testBucketName) + .key(objectKey) + .build()) + .response(); + String getETag = getResp.eTag(); + + System.out.println("PUT ETag: " + putETag + ", HEAD ETag: " + headETag + ", GET ETag: " + getETag); + + assertThat(putETag).isEqualTo(headETag); + assertThat(putETag).isEqualTo(getETag); + } + + @Test + @DisplayName("Multiple large file uploads should all return pure MD5 ETags") + void testMultipleLargeFileUploads() throws Exception { + int numFiles = 3; + + for (int i = 0; i < numFiles; i++) { + byte[] testData = generateRandomData(LARGE_FILE_SIZE); + String expectedMD5 = calculateMD5Hex(testData); + String objectKey = "multi-large-" + i + "-" + UUID.randomUUID() + ".bin"; + + PutObjectResponse response = s3Client.putObject( + PutObjectRequest.builder() + .bucket(testBucketName) + .key(objectKey) + .build(), + RequestBody.fromBytes(testData)); + + String etag = response.eTag(); + System.out.println("File " + i + " ETag: " + etag); + + assertThat(etag) + .describedAs("File " + i + " ETag should be pure MD5") + .matches(PURE_MD5_PATTERN); + assertThat(cleanETag(etag)) + .describedAs("File " + i + " ETag should match MD5") + .isEqualTo(expectedMD5); + + // Validate hex decoding + assertThatCode(() -> hexToBytes(cleanETag(etag))) + .doesNotThrowAnyException(); + } + } + + // Helper methods + + private String getConfig(String key, String 
defaultValue) {
+        String value = System.getProperty(key);
+        if (value == null) {
+            value = System.getenv(key);
+        }
+        return value != null ? value : defaultValue;
+    }
+
+    private byte[] generateRandomData(int size) {
+        byte[] data = new byte[size];
+        random.nextBytes(data);
+        return data;
+    }
+
+    private String calculateMD5Hex(byte[] data) throws Exception {
+        MessageDigest md = MessageDigest.getInstance("MD5");
+        byte[] digest = md.digest(data);
+        StringBuilder sb = new StringBuilder();
+        for (byte b : digest) {
+            sb.append(String.format("%02x", b));
+        }
+        return sb.toString();
+    }
+
+    private String cleanETag(String etag) {
+        if (etag == null) return null;
+        return etag.replace("\"", "");
+    }
+
+    private byte[] hexToBytes(String hex) {
+        int len = hex.length();
+        byte[] data = new byte[len / 2];
+        for (int i = 0; i < len; i += 2) {
+            data[i / 2] = (byte) ((Character.digit(hex.charAt(i), 16) << 4)
+                    + Character.digit(hex.charAt(i + 1), 16));
+        }
+        return data;
+    }
+}
diff --git a/test/s3/SDK_COMPATIBILITY.md b/test/s3/SDK_COMPATIBILITY.md
new file mode 100644
index 000000000..11cd3bff9
--- /dev/null
+++ b/test/s3/SDK_COMPATIBILITY.md
@@ -0,0 +1,126 @@
+# S3 SDK Compatibility Testing
+
+This document describes the SDK versions tested against the SeaweedFS S3 API and known compatibility considerations.
+
+## Validated SDK Versions
+
+### Go SDKs
+
+| SDK | Version | Test Location | Status |
+|-----|---------|---------------|--------|
+| AWS SDK v2 for Go | 1.20+ | `test/s3/etag/`, `test/s3/copying/` | ✅ Tested |
+| AWS SDK v1 for Go | 1.x | `test/s3/basic/` | ✅ Tested |
+
+### Java SDKs
+
+| SDK | Version | Test Location | Status |
+|-----|---------|---------------|--------|
+| AWS SDK v2 for Java | 2.20.127+ | `other/java/s3copier/` | ✅ Tested |
+| AWS SDK v1 for Java | 1.12.600+ | `other/java/s3copier/` | ✅ Tested |
+
+### Python SDKs
+
+| SDK | Version | Test Location | Status |
+|-----|---------|---------------|--------|
+| boto3 | 1.x | `test/s3/parquet/` | ✅ Tested |
+| PyArrow S3 | 14+ | `test/s3/parquet/` | ✅ Tested |
+
+## SDK-Specific Considerations
+
+### AWS SDK v2 for Java - ETag Validation
+
+**Issue**: [GitHub #7768](https://github.com/seaweedfs/seaweedfs/issues/7768)
+
+AWS SDK v2 for Java includes strict ETag validation in `ChecksumsEnabledValidator.validatePutObjectChecksum`. It decodes the ETag as a hexadecimal MD5 hash using `Base16Codec.decode()`.
+
+**Impact**: If the ETag contains non-hexadecimal characters (like `-` in composite format), the SDK fails with:
+```text
+java.lang.IllegalArgumentException: Invalid base 16 character: '-'
+```
+
+**Resolution**: SeaweedFS now correctly returns:
+- **PutObject**: Pure MD5 hex ETag (32 characters) regardless of internal chunking
+- **CompleteMultipartUpload**: Composite ETag (`<md5>-<partCount>`)
+
+**Test Coverage**: `test/s3/etag/` and `other/java/s3copier/ETagValidationTest.java`
+
+### AWS SDK v1 vs v2 Differences
+
+| Feature | SDK v1 | SDK v2 |
+|---------|--------|--------|
+| ETag hex validation | No | Yes (strict) |
+| Checksum validation | Basic | Enhanced |
+| Async support | Limited | Full |
+| Default retry behavior | Lenient | Stricter |
+
+### Large File Handling
+
+SeaweedFS auto-chunks files larger than **8MB** for efficient storage. This is transparent to clients, but previously affected the ETag format. The current implementation ensures:
+
+1. Regular `PutObject` (any size): Returns pure MD5 ETag
+2. 
Multipart upload: Returns composite ETag per AWS S3 specification + +## Test Categories by File Size + +| Category | Size | Chunks | ETag Format | +|----------|------|--------|-------------| +| Small | < 256KB | 1 (inline) | Pure MD5 | +| Medium | 256KB - 8MB | 1 | Pure MD5 | +| Large | 8MB - 24MB | 2-3 | Pure MD5 | +| Extra Large | > 24MB | 4+ | Pure MD5 | +| Multipart | N/A | Per part | Composite | + +## Running SDK Compatibility Tests + +### Go Tests + +```bash +# Run all ETag tests +cd test/s3/etag && make test + +# Run large file tests only +cd test/s3/etag && make test-large +``` + +### Java Tests + +```bash +# Run all Java SDK tests +cd other/java/s3copier && mvn test + +# Run only ETag validation tests +cd other/java/s3copier && mvn test -Dtest=ETagValidationTest +``` + +### Python Tests + +```bash +# Run PyArrow S3 tests +cd test/s3/parquet && make test +``` + +## Adding New SDK Tests + +When adding tests for new SDKs, ensure: + +1. **Large file tests (>8MB)**: Critical for verifying ETag format with auto-chunking +2. **Multipart upload tests**: Verify composite ETag format +3. **Checksum validation**: Test SDK-specific checksum validation if applicable +4. **Document SDK version**: Add to this compatibility matrix + +## Known Issues and Workarounds + +### Issue: Older SDK Versions + +Some very old SDK versions (e.g., AWS SDK v1 for Java < 1.11.x) may have different behavior. Testing with the versions listed above is recommended. + +### Issue: Custom Checksum Algorithms + +AWS SDK v2 supports SHA-256 and CRC32 checksums in addition to MD5. SeaweedFS currently returns MD5-based ETags. For checksums other than MD5, use the `x-amz-checksum-*` headers. + +## References + +- [AWS S3 ETag Documentation](https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html) +- [AWS SDK v2 Migration Guide](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/migration.html) +- [GitHub Issue #7768](https://github.com/seaweedfs/seaweedfs/issues/7768) + diff --git a/test/s3/etag/Makefile b/test/s3/etag/Makefile new file mode 100644 index 000000000..dc463e405 --- /dev/null +++ b/test/s3/etag/Makefile @@ -0,0 +1,50 @@ +# ETag Format Integration Tests +# +# These tests verify S3 ETag format compatibility, particularly for large files +# that trigger SeaweedFS auto-chunking. This addresses GitHub Issue #7768. +# +# Prerequisites: +# - SeaweedFS running with S3 API enabled on port 8333 +# - Go 1.21+ +# +# Usage: +# make test - Run all tests +# make test-large - Run only large file tests +# make test-verbose - Run with verbose output +# make clean - Clean test artifacts + +.PHONY: all test test-large test-verbose test-quick clean help + +# Default S3 endpoint +S3_ENDPOINT ?= http://127.0.0.1:8333 + +all: test + +test: + @echo "Running ETag format tests against $(S3_ENDPOINT)..." + S3_ENDPOINT=$(S3_ENDPOINT) go test -v -timeout 5m ./... + +test-large: + @echo "Running large file ETag tests..." + S3_ENDPOINT=$(S3_ENDPOINT) go test -v -timeout 5m -run "LargeFile|ExtraLarge" ./... + +test-verbose: + S3_ENDPOINT=$(S3_ENDPOINT) go test -v -timeout 5m -count=1 ./... + +test-quick: + @echo "Running quick ETag tests (small files only)..." + S3_ENDPOINT=$(S3_ENDPOINT) go test -v -timeout 1m -run "SmallFile|Consistency" ./... 
+
+clean:
+	go clean -testcache
+
+help:
+	@echo "ETag Format Integration Tests"
+	@echo "Targets:"
+	@echo "  test          Run all ETag format tests"
+	@echo "  test-large    Run only large file tests (>8MB)"
+	@echo "  test-quick    Run quick tests (small files only)"
+	@echo "  test-verbose  Run with verbose output"
+	@echo "  clean         Clean test cache"
+	@echo "Environment Variables:"
+	@echo "  S3_ENDPOINT   S3 endpoint URL (default: http://127.0.0.1:8333)"
diff --git a/test/s3/etag/README.md b/test/s3/etag/README.md
new file mode 100644
index 000000000..215a43c0a
--- /dev/null
+++ b/test/s3/etag/README.md
@@ -0,0 +1,92 @@
+# S3 ETag Format Integration Tests
+
+This test suite verifies that SeaweedFS returns correct ETag formats for S3 operations, ensuring compatibility with AWS S3 SDKs.
+
+## Background
+
+**GitHub Issue #7768**: AWS SDK v2 for Java was failing with `Invalid base 16 character: '-'` when performing `PutObject` on large files.
+
+### Root Cause
+
+SeaweedFS internally auto-chunks large files (>8MB) for efficient storage. Previously, when a regular `PutObject` request resulted in multiple internal chunks, SeaweedFS returned a composite ETag format (`<md5>-<partCount>`) instead of a pure MD5 hash.
+
+### AWS S3 Specification
+
+| Operation | ETag Format | Example |
+|-----------|-------------|---------|
+| PutObject (any size) | Pure MD5 hex (32 chars) | `d41d8cd98f00b204e9800998ecf8427e` |
+| CompleteMultipartUpload | Composite (`<md5>-<partCount>`) | `d41d8cd98f00b204e9800998ecf8427e-3` |
+
+AWS SDK v2 for Java validates `PutObject` ETags as hexadecimal, which fails when the ETag contains a hyphen.
+
+## Test Coverage
+
+| Test | File Size | Purpose |
+|------|-----------|---------|
+| `TestPutObjectETagFormat_SmallFile` | 1KB | Verify single-chunk uploads return pure MD5 |
+| `TestPutObjectETagFormat_LargeFile` | 10MB | **Critical**: Verify auto-chunked uploads return pure MD5 |
+| `TestPutObjectETagFormat_ExtraLargeFile` | 25MB | Verify multi-chunk auto-chunked uploads return pure MD5 |
+| `TestMultipartUploadETagFormat` | 15MB | Verify multipart uploads correctly return composite ETag |
+| `TestPutObjectETagConsistency` | Various | Verify ETag consistency across PUT/HEAD/GET |
+| `TestETagHexValidation` | 10MB | Simulate AWS SDK v2 hex validation |
+| `TestMultipleLargeFileUploads` | 10MB x5 | Stress test multiple large uploads |
+
+## Prerequisites
+
+1. SeaweedFS running with S3 API enabled:
+   ```bash
+   weed server -s3
+   ```
+
+2. Go 1.21 or later
+
+3. AWS SDK v2 for Go (installed via go modules)
+
+## Running Tests
+
+```bash
+# Run all tests
+make test
+
+# Run only large file tests (the critical ones for issue #7768)
+make test-large
+
+# Run quick tests (small files only)
+make test-quick
+
+# Run with verbose output
+make test-verbose
+```
+
+## Configuration
+
+By default, tests connect to `http://127.0.0.1:8333`. To use a different endpoint:
+
+```bash
+S3_ENDPOINT=http://localhost:8333 make test
+```
+
+Or modify `defaultConfig` in `s3_etag_test.go`.
+
+## SDK Compatibility
+
+These tests use **AWS SDK v2 for Go**, which has the same ETag validation behavior as AWS SDK v2 for Java.
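+
+The core check mirrors what the Java SDK does internally: hex-decode the ETag returned by `PutObject`. A minimal sketch (client setup omitted; bucket, key, and `data` are placeholders):
+
+```go
+out, err := client.PutObject(ctx, &s3.PutObjectInput{
+	Bucket: aws.String("my-bucket"),
+	Key:    aws.String("large.bin"),
+	Body:   bytes.NewReader(data), // >8MB so SeaweedFS auto-chunks internally
+})
+if err != nil {
+	log.Fatal(err)
+}
+// Strip quotes, then hex-decode - a composite "<md5>-<count>" ETag fails here,
+// just as it does inside the Java SDK's Base16 decoding.
+etag := strings.Trim(aws.ToString(out.ETag), `"`)
+if _, err := hex.DecodeString(etag); err != nil {
+	log.Fatalf("ETag %q is not pure hex: %v", etag, err)
+}
+```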
+The tests include:
+
+- ETag format validation (pure MD5 vs composite)
+- Hex decoding validation (simulates `Base16Codec.decode`)
+- Content integrity verification
+
+## Validated SDK Versions
+
+| SDK | Version | Status |
+|-----|---------|--------|
+| AWS SDK v2 for Go | 1.20+ | ✅ Tested |
+| AWS SDK v2 for Java | 2.20+ | ✅ Compatible (issue #7768 fixed) |
+| AWS SDK v1 for Go | 1.x | ✅ Compatible (less strict validation) |
+| AWS SDK v1 for Java | 1.x | ✅ Compatible (less strict validation) |
+
+## Related
+
+- [GitHub Issue #7768](https://github.com/seaweedfs/seaweedfs/issues/7768)
+- [AWS S3 ETag Documentation](https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html)
diff --git a/test/s3/etag/s3_etag_test.go b/test/s3/etag/s3_etag_test.go
new file mode 100644
index 000000000..1b87eaf12
--- /dev/null
+++ b/test/s3/etag/s3_etag_test.go
@@ -0,0 +1,543 @@
+// Package etag_test provides integration tests for S3 ETag format validation.
+//
+// These tests verify that SeaweedFS returns correct ETag formats for different
+// upload scenarios, ensuring compatibility with AWS S3 SDKs that validate ETags.
+//
+// Background (GitHub Issue #7768):
+// AWS SDK v2 for Java validates ETags as hexadecimal MD5 hashes for PutObject
+// responses. SeaweedFS was incorrectly returning composite ETags ("<md5>-<partCount>")
+// for regular PutObject when files were internally auto-chunked (>8MB), causing
+// the SDK to fail with "Invalid base 16 character: '-'".
+//
+// Per AWS S3 specification:
+//   - Regular PutObject: ETag is always a pure MD5 hex string (32 chars)
+//   - Multipart Upload (CompleteMultipartUpload): ETag is "<md5>-<partCount>"
+//
+// These tests ensure this behavior is maintained.
+package etag_test
+
+import (
+	"bytes"
+	"context"
+	"crypto/md5"
+	"crypto/rand"
+	"encoding/hex"
+	"fmt"
+	"io"
+	mathrand "math/rand"
+	"regexp"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/config"
+	"github.com/aws/aws-sdk-go-v2/credentials"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	"github.com/aws/aws-sdk-go-v2/service/s3/types"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// S3TestConfig holds configuration for S3 tests
+type S3TestConfig struct {
+	Endpoint     string
+	AccessKey    string
+	SecretKey    string
+	Region       string
+	BucketPrefix string
+}
+
+// Default test configuration
+var defaultConfig = &S3TestConfig{
+	Endpoint:     "http://127.0.0.1:8333",
+	AccessKey:    "some_access_key1",
+	SecretKey:    "some_secret_key1",
+	Region:       "us-east-1",
+	BucketPrefix: "test-etag-",
+}
+
+// Constants for auto-chunking thresholds (must match s3api_object_handlers_put.go)
+const (
+	// SeaweedFS auto-chunks files larger than 8MB
+	autoChunkSize = 8 * 1024 * 1024
+
+	// Test sizes
+	smallFileSize  = 1 * 1024         // 1KB - single chunk
+	mediumFileSize = 256 * 1024       // 256KB - single chunk (at threshold)
+	largeFileSize  = 10 * 1024 * 1024 // 10MB - triggers auto-chunking (2 chunks)
+	xlFileSize     = 25 * 1024 * 1024 // 25MB - triggers auto-chunking (4 chunks)
+	multipartSize  = 5 * 1024 * 1024  // 5MB per part for multipart uploads
+)
+
+// ETag format patterns
+var (
+	// Pure MD5 ETag: 32 hex characters (with or without quotes)
+	pureMD5Pattern = regexp.MustCompile(`^"?[a-f0-9]{32}"?$`)
+
+	// Composite ETag for multipart: 32 hex chars, hyphen, part count (with or without quotes)
+	compositePattern = regexp.MustCompile(`^"?[a-f0-9]{32}-\d+"?$`)
+)
+
+func init() {
+	mathrand.Seed(time.Now().UnixNano())
+}
+
+// getS3Client creates an AWS S3 v2 client for 
testing +func getS3Client(t *testing.T) *s3.Client { + cfg, err := config.LoadDefaultConfig(context.TODO(), + config.WithRegion(defaultConfig.Region), + config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + defaultConfig.AccessKey, + defaultConfig.SecretKey, + "", + )), + config.WithEndpointResolverWithOptions(aws.EndpointResolverWithOptionsFunc( + func(service, region string, options ...interface{}) (aws.Endpoint, error) { + return aws.Endpoint{ + URL: defaultConfig.Endpoint, + SigningRegion: defaultConfig.Region, + HostnameImmutable: true, + }, nil + })), + ) + require.NoError(t, err) + + return s3.NewFromConfig(cfg, func(o *s3.Options) { + o.UsePathStyle = true + }) +} + +// getNewBucketName generates a unique bucket name +func getNewBucketName() string { + timestamp := time.Now().UnixNano() + randomSuffix := mathrand.Intn(100000) + return fmt.Sprintf("%s%d-%d", defaultConfig.BucketPrefix, timestamp, randomSuffix) +} + +// generateRandomData generates random test data of specified size +func generateRandomData(size int) []byte { + data := make([]byte, size) + if _, err := rand.Read(data); err != nil { + panic(fmt.Sprintf("failed to generate random test data: %v", err)) + } + return data +} + +// calculateMD5 calculates the MD5 hash of data and returns hex string +func calculateMD5(data []byte) string { + hash := md5.Sum(data) + return hex.EncodeToString(hash[:]) +} + +// cleanETag removes quotes from ETag +func cleanETag(etag string) string { + return strings.Trim(etag, `"`) +} + +// isPureMD5ETag checks if ETag is a pure MD5 hex string (no composite format) +func isPureMD5ETag(etag string) bool { + return pureMD5Pattern.MatchString(etag) +} + +// isCompositeETag checks if ETag is in composite format (md5-count) +func isCompositeETag(etag string) bool { + return compositePattern.MatchString(etag) +} + +// createTestBucket creates a new bucket for testing +func createTestBucket(ctx context.Context, client *s3.Client, bucketName string) error { + _, err := client.CreateBucket(ctx, &s3.CreateBucketInput{ + Bucket: aws.String(bucketName), + }) + return err +} + +// cleanupTestBucket deletes all objects and the bucket +func cleanupTestBucket(ctx context.Context, client *s3.Client, bucketName string) { + // Delete all objects + paginator := s3.NewListObjectsV2Paginator(client, &s3.ListObjectsV2Input{ + Bucket: aws.String(bucketName), + }) + + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + if err != nil { + break + } + for _, obj := range page.Contents { + client.DeleteObject(ctx, &s3.DeleteObjectInput{ + Bucket: aws.String(bucketName), + Key: obj.Key, + }) + } + } + + // Abort any in-progress multipart uploads + mpPaginator := s3.NewListMultipartUploadsPaginator(client, &s3.ListMultipartUploadsInput{ + Bucket: aws.String(bucketName), + }) + for mpPaginator.HasMorePages() { + page, err := mpPaginator.NextPage(ctx) + if err != nil { + break + } + for _, upload := range page.Uploads { + client.AbortMultipartUpload(ctx, &s3.AbortMultipartUploadInput{ + Bucket: aws.String(bucketName), + Key: upload.Key, + UploadId: upload.UploadId, + }) + } + } + + // Delete bucket + client.DeleteBucket(ctx, &s3.DeleteBucketInput{ + Bucket: aws.String(bucketName), + }) +} + +// TestPutObjectETagFormat_SmallFile verifies ETag format for small files (single chunk) +func TestPutObjectETagFormat_SmallFile(t *testing.T) { + ctx := context.Background() + client := getS3Client(t) + + bucketName := getNewBucketName() + err := createTestBucket(ctx, client, bucketName) + 
require.NoError(t, err, "Failed to create test bucket") + defer cleanupTestBucket(ctx, client, bucketName) + + testData := generateRandomData(smallFileSize) + expectedMD5 := calculateMD5(testData) + objectKey := "small-file.bin" + + // Upload small file + putResp, err := client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: bytes.NewReader(testData), + }) + require.NoError(t, err, "Failed to upload small file") + + // Verify ETag format + etag := aws.ToString(putResp.ETag) + t.Logf("Small file (%d bytes) ETag: %s", smallFileSize, etag) + + assert.True(t, isPureMD5ETag(etag), + "Small file ETag should be pure MD5, got: %s", etag) + assert.False(t, isCompositeETag(etag), + "Small file ETag should NOT be composite format, got: %s", etag) + assert.Equal(t, expectedMD5, cleanETag(etag), + "ETag should match calculated MD5") +} + +// TestPutObjectETagFormat_LargeFile verifies ETag format for large files that trigger auto-chunking +// This is the critical test for GitHub Issue #7768 +func TestPutObjectETagFormat_LargeFile(t *testing.T) { + ctx := context.Background() + client := getS3Client(t) + + bucketName := getNewBucketName() + err := createTestBucket(ctx, client, bucketName) + require.NoError(t, err, "Failed to create test bucket") + defer cleanupTestBucket(ctx, client, bucketName) + + testData := generateRandomData(largeFileSize) + expectedMD5 := calculateMD5(testData) + objectKey := "large-file.bin" + + t.Logf("Uploading large file (%d bytes, > %d byte auto-chunk threshold)...", + largeFileSize, autoChunkSize) + + // Upload large file (triggers auto-chunking internally) + putResp, err := client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: bytes.NewReader(testData), + }) + require.NoError(t, err, "Failed to upload large file") + + // Verify ETag format - MUST be pure MD5, NOT composite + etag := aws.ToString(putResp.ETag) + t.Logf("Large file (%d bytes, ~%d internal chunks) ETag: %s", + largeFileSize, (largeFileSize/autoChunkSize)+1, etag) + + assert.True(t, isPureMD5ETag(etag), + "Large file PutObject ETag MUST be pure MD5 (not composite), got: %s", etag) + assert.False(t, isCompositeETag(etag), + "Large file PutObject ETag should NOT contain '-' (composite format), got: %s", etag) + assert.False(t, strings.Contains(cleanETag(etag), "-"), + "ETag should not contain hyphen for regular PutObject, got: %s", etag) + assert.Equal(t, expectedMD5, cleanETag(etag), + "ETag should match calculated MD5 of entire content") + + // Verify we can read back the object correctly + getResp, err := client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Failed to get large file") + defer getResp.Body.Close() + + downloadedData, err := io.ReadAll(getResp.Body) + require.NoError(t, err, "Failed to read large file content") + assert.Equal(t, testData, downloadedData, "Downloaded content should match uploaded content") +} + +// TestPutObjectETagFormat_ExtraLargeFile tests even larger files with multiple internal chunks +func TestPutObjectETagFormat_ExtraLargeFile(t *testing.T) { + ctx := context.Background() + client := getS3Client(t) + + bucketName := getNewBucketName() + err := createTestBucket(ctx, client, bucketName) + require.NoError(t, err, "Failed to create test bucket") + defer cleanupTestBucket(ctx, client, bucketName) + + testData := generateRandomData(xlFileSize) + expectedMD5 := 
calculateMD5(testData) + objectKey := "xl-file.bin" + + expectedChunks := (xlFileSize / autoChunkSize) + 1 + t.Logf("Uploading XL file (%d bytes, expected ~%d internal chunks)...", + xlFileSize, expectedChunks) + + // Upload extra large file + putResp, err := client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: bytes.NewReader(testData), + }) + require.NoError(t, err, "Failed to upload XL file") + + // Verify ETag format + etag := aws.ToString(putResp.ETag) + t.Logf("XL file (%d bytes) ETag: %s", xlFileSize, etag) + + assert.True(t, isPureMD5ETag(etag), + "XL file PutObject ETag MUST be pure MD5, got: %s", etag) + assert.False(t, isCompositeETag(etag), + "XL file PutObject ETag should NOT be composite, got: %s", etag) + assert.Equal(t, expectedMD5, cleanETag(etag), + "ETag should match calculated MD5") +} + +// TestMultipartUploadETagFormat verifies that ONLY multipart uploads get composite ETags +func TestMultipartUploadETagFormat(t *testing.T) { + ctx := context.Background() + client := getS3Client(t) + + bucketName := getNewBucketName() + err := createTestBucket(ctx, client, bucketName) + require.NoError(t, err, "Failed to create test bucket") + defer cleanupTestBucket(ctx, client, bucketName) + + // Create test data for multipart upload (15MB = 3 parts of 5MB each) + totalSize := 15 * 1024 * 1024 + testData := generateRandomData(totalSize) + objectKey := "multipart-file.bin" + + expectedPartCount := (totalSize + multipartSize - 1) / multipartSize // ceiling division + t.Logf("Performing multipart upload (%d bytes, %d parts)...", + totalSize, expectedPartCount) + + // Initiate multipart upload + createResp, err := client.CreateMultipartUpload(ctx, &s3.CreateMultipartUploadInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Failed to create multipart upload") + + uploadId := createResp.UploadId + var completedParts []types.CompletedPart + partNumber := int32(1) + + // Upload parts + for offset := 0; offset < totalSize; offset += multipartSize { + end := offset + multipartSize + if end > totalSize { + end = totalSize + } + partData := testData[offset:end] + + uploadResp, err := client.UploadPart(ctx, &s3.UploadPartInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + UploadId: uploadId, + PartNumber: aws.Int32(partNumber), + Body: bytes.NewReader(partData), + }) + require.NoError(t, err, "Failed to upload part %d", partNumber) + + completedParts = append(completedParts, types.CompletedPart{ + ETag: uploadResp.ETag, + PartNumber: aws.Int32(partNumber), + }) + partNumber++ + } + + // Complete multipart upload + completeResp, err := client.CompleteMultipartUpload(ctx, &s3.CompleteMultipartUploadInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + UploadId: uploadId, + MultipartUpload: &types.CompletedMultipartUpload{ + Parts: completedParts, + }, + }) + require.NoError(t, err, "Failed to complete multipart upload") + + // Verify ETag format - SHOULD be composite for multipart + etag := aws.ToString(completeResp.ETag) + t.Logf("Multipart upload ETag: %s", etag) + + assert.True(t, isCompositeETag(etag), + "Multipart upload ETag SHOULD be composite format (md5-count), got: %s", etag) + assert.True(t, strings.Contains(cleanETag(etag), "-"), + "Multipart ETag should contain hyphen, got: %s", etag) + + // Verify the part count in the ETag matches + parts := strings.Split(cleanETag(etag), "-") + require.Len(t, parts, 2, "Composite ETag should have 
format 'hash-count'") + assert.Equal(t, fmt.Sprintf("%d", len(completedParts)), parts[1], + "Part count in ETag should match number of parts uploaded") +} + +// TestPutObjectETagConsistency verifies ETag consistency between PUT and GET +func TestPutObjectETagConsistency(t *testing.T) { + ctx := context.Background() + client := getS3Client(t) + + bucketName := getNewBucketName() + err := createTestBucket(ctx, client, bucketName) + require.NoError(t, err, "Failed to create test bucket") + defer cleanupTestBucket(ctx, client, bucketName) + + testCases := []struct { + name string + size int + }{ + {"tiny", 100}, + {"small", smallFileSize}, + {"medium", mediumFileSize}, + {"large", largeFileSize}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + testData := generateRandomData(tc.size) + objectKey := fmt.Sprintf("consistency-test-%s.bin", tc.name) + + // PUT object + putResp, err := client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: bytes.NewReader(testData), + }) + require.NoError(t, err) + putETag := aws.ToString(putResp.ETag) + + // HEAD object + headResp, err := client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err) + headETag := aws.ToString(headResp.ETag) + + // GET object + getResp, err := client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err) + getETag := aws.ToString(getResp.ETag) + getResp.Body.Close() + + // All ETags should match + t.Logf("%s (%d bytes): PUT=%s, HEAD=%s, GET=%s", + tc.name, tc.size, putETag, headETag, getETag) + + assert.Equal(t, putETag, headETag, + "PUT and HEAD ETags should match") + assert.Equal(t, putETag, getETag, + "PUT and GET ETags should match") + + // All should be pure MD5 (not composite) for regular PutObject + assert.True(t, isPureMD5ETag(putETag), + "PutObject ETag should be pure MD5, got: %s", putETag) + }) + } +} + +// TestETagHexValidation simulates the AWS SDK v2 validation that caused issue #7768 +func TestETagHexValidation(t *testing.T) { + ctx := context.Background() + client := getS3Client(t) + + bucketName := getNewBucketName() + err := createTestBucket(ctx, client, bucketName) + require.NoError(t, err, "Failed to create test bucket") + defer cleanupTestBucket(ctx, client, bucketName) + + // Test with a file large enough to trigger auto-chunking + testData := generateRandomData(largeFileSize) + objectKey := "hex-validation-test.bin" + + putResp, err := client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: bytes.NewReader(testData), + }) + require.NoError(t, err) + + etag := cleanETag(aws.ToString(putResp.ETag)) + + // Simulate AWS SDK v2's hex validation (Base16Codec.decode) + // This is what fails in issue #7768 when ETag contains '-' + t.Logf("Validating ETag as hex: %s", etag) + + _, err = hex.DecodeString(etag) + assert.NoError(t, err, + "ETag should be valid hexadecimal (AWS SDK v2 validation). "+ + "Got ETag: %s. 
If this fails with 'invalid byte', the ETag contains non-hex chars like '-'", + etag) +} + +// TestMultipleLargeFileUploads verifies ETag format across multiple large uploads +func TestMultipleLargeFileUploads(t *testing.T) { + ctx := context.Background() + client := getS3Client(t) + + bucketName := getNewBucketName() + err := createTestBucket(ctx, client, bucketName) + require.NoError(t, err, "Failed to create test bucket") + defer cleanupTestBucket(ctx, client, bucketName) + + numFiles := 5 + for i := 0; i < numFiles; i++ { + testData := generateRandomData(largeFileSize) + expectedMD5 := calculateMD5(testData) + objectKey := fmt.Sprintf("large-file-%d.bin", i) + + putResp, err := client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: bytes.NewReader(testData), + }) + require.NoError(t, err, "Failed to upload file %d", i) + + etag := aws.ToString(putResp.ETag) + t.Logf("File %d ETag: %s (expected MD5: %s)", i, etag, expectedMD5) + + assert.True(t, isPureMD5ETag(etag), + "File %d ETag should be pure MD5, got: %s", i, etag) + assert.Equal(t, expectedMD5, cleanETag(etag), + "File %d ETag should match MD5", i) + + // Validate as hex (AWS SDK v2 check) + _, err = hex.DecodeString(cleanETag(etag)) + assert.NoError(t, err, "File %d ETag should be valid hex", i) + } +} + diff --git a/test/s3/etag/test_config.json b/test/s3/etag/test_config.json new file mode 100644 index 000000000..268a54787 --- /dev/null +++ b/test/s3/etag/test_config.json @@ -0,0 +1,19 @@ +{ + "endpoint": "http://127.0.0.1:8333", + "access_key": "some_access_key1", + "secret_key": "some_secret_key1", + "region": "us-east-1", + "bucket_prefix": "test-etag-", + "notes": { + "description": "S3 ETag format integration tests", + "issue": "https://github.com/seaweedfs/seaweedfs/issues/7768", + "auto_chunk_size": "8MB - files larger than this trigger auto-chunking", + "test_sizes": { + "small": "1KB - single chunk", + "medium": "256KB - at inline threshold", + "large": "10MB - 2 internal chunks", + "xl": "25MB - 4 internal chunks" + } + } +} + diff --git a/weed/s3api/s3api_object_handlers_put.go b/weed/s3api/s3api_object_handlers_put.go index 7c73f4ce0..1554907ab 100644 --- a/weed/s3api/s3api_object_handlers_put.go +++ b/weed/s3api/s3api_object_handlers_put.go @@ -506,20 +506,14 @@ func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader Extended: make(map[string][]byte), } - // Set Md5 attribute based on context: - // 1. For multipart upload PARTS (stored in .uploads/ directory): ALWAYS set Md5 - // - Parts must use simple MD5 ETags, never composite format - // - Even if a part has multiple chunks internally, its ETag is MD5 of entire part - // 2. For regular object uploads: only set Md5 for single-chunk uploads - // - Multi-chunk regular objects use composite "md5-count" format - isMultipartPart := strings.Contains(filePath, "/"+s3_constants.MultipartUploadsFolder+"/") - if isMultipartPart || len(chunkResult.FileChunks) == 1 { - entry.Attributes.Md5 = md5Sum - } - - // Calculate ETag using the same logic as GET to ensure consistency - // For single chunk: uses entry.Attributes.Md5 - // For multiple chunks: uses filer.ETagChunks() which returns "-" + // Always set Md5 attribute for regular object uploads (PutObject) + // This ensures the ETag is a pure MD5 hash, which AWS S3 SDKs expect + // for PutObject responses. 
The composite "md5-count" format is only + // used for multipart upload completion (CompleteMultipartUpload API), + // not for regular PutObject even if the file is internally auto-chunked. + entry.Attributes.Md5 = md5Sum + + // Calculate ETag - with Md5 set, this returns the pure MD5 hash etag = filer.ETag(entry) glog.V(4).Infof("putToFiler: Calculated ETag=%s for %d chunks", etag, len(chunkResult.FileChunks))