diff --git a/.github/workflows/container_release_unified.yml b/.github/workflows/container_release_unified.yml index eb8df9834..c7aa648fd 100644 --- a/.github/workflows/container_release_unified.yml +++ b/.github/workflows/container_release_unified.yml @@ -223,3 +223,4 @@ jobs: echo "✓ Successfully copied ${{ matrix.variant }} to Docker Hub" + diff --git a/.github/workflows/helm_ci.yml b/.github/workflows/helm_ci.yml index f936ff445..ea971aec1 100644 --- a/.github/workflows/helm_ci.yml +++ b/.github/workflows/helm_ci.yml @@ -44,6 +44,80 @@ jobs: - name: Run chart-testing (lint) run: ct lint --target-branch ${{ github.event.repository.default_branch }} --all --validate-maintainers=false --chart-dirs k8s/charts + - name: Verify template rendering + run: | + set -e + CHART_DIR="k8s/charts/seaweedfs" + + echo "=== Testing default configuration ===" + helm template test $CHART_DIR > /tmp/default.yaml + echo "✓ Default configuration renders successfully" + + echo "=== Testing with S3 enabled ===" + helm template test $CHART_DIR --set s3.enabled=true > /tmp/s3.yaml + grep -q "kind: Deployment" /tmp/s3.yaml && grep -q "seaweedfs-s3" /tmp/s3.yaml + echo "✓ S3 deployment renders correctly" + + echo "=== Testing with all-in-one mode ===" + helm template test $CHART_DIR --set allInOne.enabled=true > /tmp/allinone.yaml + grep -q "seaweedfs-all-in-one" /tmp/allinone.yaml + echo "✓ All-in-one deployment renders correctly" + + echo "=== Testing with security enabled ===" + helm template test $CHART_DIR --set global.enableSecurity=true > /tmp/security.yaml + grep -q "security-config" /tmp/security.yaml + echo "✓ Security configuration renders correctly" + + echo "=== Testing with monitoring enabled ===" + helm template test $CHART_DIR \ + --set global.monitoring.enabled=true \ + --set global.monitoring.gatewayHost=prometheus \ + --set global.monitoring.gatewayPort=9091 > /tmp/monitoring.yaml + echo "✓ Monitoring configuration renders correctly" + + echo "=== Testing with PVC storage ===" + helm template test $CHART_DIR \ + --set master.data.type=persistentVolumeClaim \ + --set master.data.size=10Gi \ + --set master.data.storageClass=standard > /tmp/pvc.yaml + grep -q "PersistentVolumeClaim" /tmp/pvc.yaml + echo "✓ PVC configuration renders correctly" + + echo "=== Testing with custom replicas ===" + helm template test $CHART_DIR \ + --set master.replicas=3 \ + --set filer.replicas=2 \ + --set volume.replicas=3 > /tmp/replicas.yaml + echo "✓ Custom replicas configuration renders correctly" + + echo "=== Testing filer with S3 gateway ===" + helm template test $CHART_DIR \ + --set filer.s3.enabled=true \ + --set filer.s3.enableAuth=true > /tmp/filer-s3.yaml + echo "✓ Filer S3 gateway renders correctly" + + echo "=== Testing SFTP enabled ===" + helm template test $CHART_DIR --set sftp.enabled=true > /tmp/sftp.yaml + grep -q "seaweedfs-sftp" /tmp/sftp.yaml + echo "✓ SFTP deployment renders correctly" + + echo "=== Testing ingress configurations ===" + helm template test $CHART_DIR \ + --set master.ingress.enabled=true \ + --set filer.ingress.enabled=true \ + --set s3.enabled=true \ + --set s3.ingress.enabled=true > /tmp/ingress.yaml + grep -q "kind: Ingress" /tmp/ingress.yaml + echo "✓ Ingress configurations render correctly" + + echo "=== Testing COSI driver ===" + helm template test $CHART_DIR --set cosi.enabled=true > /tmp/cosi.yaml + grep -q "seaweedfs-cosi" /tmp/cosi.yaml + echo "✓ COSI driver renders correctly" + + echo "" + echo "✅ All template rendering tests passed!" 
+ - name: Create kind cluster uses: helm/kind-action@v1.13.0 diff --git a/.github/workflows/s3tests.yml b/.github/workflows/s3tests.yml index b4e4e5e70..deda4999b 100644 --- a/.github/workflows/s3tests.yml +++ b/.github/workflows/s3tests.yml @@ -64,7 +64,7 @@ jobs: -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ -volume.max=100 -volume.preStopSeconds=1 \ -master.port=9333 -volume.port=8080 -filer.port=8888 -s3.port=8000 -metricsPort=9324 \ - -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & + -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & pid=$! # Wait for all SeaweedFS components to be ready @@ -368,7 +368,7 @@ jobs: -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ -volume.max=100 -volume.preStopSeconds=1 \ -master.port=9334 -volume.port=8081 -filer.port=8889 -s3.port=8001 -metricsPort=9325 \ - -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & + -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & pid=$! # Wait for all SeaweedFS components to be ready @@ -526,7 +526,7 @@ jobs: -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ -volume.max=100 -volume.preStopSeconds=1 \ -master.port=9335 -volume.port=8082 -filer.port=8890 -s3.port=8002 -metricsPort=9326 \ - -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & + -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & pid=$! # Wait for all SeaweedFS components to be ready @@ -636,7 +636,7 @@ jobs: -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ -volume.max=100 -volume.preStopSeconds=1 \ -master.port=9336 -volume.port=8083 -filer.port=8891 -s3.port=8003 -metricsPort=9327 \ - -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & + -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & pid=$! # Wait for all SeaweedFS components to be ready @@ -817,7 +817,7 @@ jobs: -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ -volume.max=100 -volume.preStopSeconds=1 \ -master.port=9337 -volume.port=8085 -filer.port=8892 -s3.port=8004 -metricsPort=9328 \ - -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" \ + -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" \ -master.peers=none \ > /tmp/seaweedfs-sql-server.log 2>&1 & pid=$! 
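The new workflow below drives the `test/sftp` integration suite added later in this patch. As a minimal sketch for running the same suite locally (assuming a repo checkout and a Go toolchain; the commands mirror the workflow steps and the `test/sftp/Makefile` targets that follow):

```bash
# Build the weed binary that the tests start (same as the workflow's build step).
cd weed && go build -o weed . && cd ..

# Fetch test dependencies and run the full SFTP integration suite.
cd test/sftp
go mod download
go test -v -timeout 15m ./...

# Or run only the issue #7470 regression test:
go test -v -timeout 5m -run TestHomeDirPathTranslation ./...
```
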
diff --git a/.github/workflows/sftp-tests.yml b/.github/workflows/sftp-tests.yml new file mode 100644 index 000000000..80a1b9929 --- /dev/null +++ b/.github/workflows/sftp-tests.yml @@ -0,0 +1,93 @@ +name: "SFTP Integration Tests" + +on: + push: + branches: [ master, main ] + paths: + - 'weed/sftpd/**' + - 'weed/command/sftp.go' + - 'test/sftp/**' + - '.github/workflows/sftp-tests.yml' + pull_request: + branches: [ master, main ] + paths: + - 'weed/sftpd/**' + - 'weed/command/sftp.go' + - 'test/sftp/**' + - '.github/workflows/sftp-tests.yml' + +concurrency: + group: ${{ github.head_ref }}/sftp-tests + cancel-in-progress: true + +permissions: + contents: read + +env: + GO_VERSION: '1.24' + TEST_TIMEOUT: '15m' + +jobs: + sftp-integration: + name: SFTP Integration Testing + runs-on: ubuntu-22.04 + timeout-minutes: 20 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go ${{ env.GO_VERSION }} + uses: actions/setup-go@v5 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y openssh-client + + - name: Build SeaweedFS + run: | + cd weed + go build -o weed . + chmod +x weed + ./weed version + + - name: Run SFTP Integration Tests + run: | + cd test/sftp + + echo "🧪 Running SFTP integration tests..." + echo "============================================" + + # Install test dependencies + go mod download + + # Run all SFTP tests + go test -v -timeout=${{ env.TEST_TIMEOUT }} ./... + + echo "============================================" + echo "✅ SFTP integration tests completed" + + - name: Test Summary + if: always() + run: | + echo "## 🔐 SFTP Integration Test Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Test Coverage" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **HomeDir Path Translation**: User home directory mapping (fixes #7470)" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **File Operations**: Upload, download, delete" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Directory Operations**: Create, list, remove" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Large File Handling**: 1MB+ file support" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Path Edge Cases**: Unicode, trailing slashes, .. 
paths" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Admin Access**: Root user verification" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Test Configuration" >> $GITHUB_STEP_SUMMARY + echo "| User | HomeDir | Permissions |" >> $GITHUB_STEP_SUMMARY + echo "|------|---------|-------------|" >> $GITHUB_STEP_SUMMARY + echo "| admin | / | Full access |" >> $GITHUB_STEP_SUMMARY + echo "| testuser | /sftp/testuser | Home directory only |" >> $GITHUB_STEP_SUMMARY + echo "| readonly | /public | Read-only |" >> $GITHUB_STEP_SUMMARY + + diff --git a/Makefile b/Makefile index 6abe59423..a4a00a504 100644 --- a/Makefile +++ b/Makefile @@ -18,12 +18,12 @@ full_install: admin-generate cd weed; go install -tags "elastic gocdk sqlite ydb tarantool tikv rclone" server: install - weed -v 0 server -s3 -filer -filer.maxMB=64 -volume.max=0 -master.volumeSizeLimitMB=100 -volume.preStopSeconds=1 -s3.port=8000 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config=./docker/compose/s3.json -metricsPort=9324 + weed -v 0 server -s3 -filer -filer.maxMB=64 -volume.max=0 -master.volumeSizeLimitMB=100 -volume.preStopSeconds=1 -s3.port=8000 -s3.allowDeleteBucketNotEmpty=true -s3.config=./docker/compose/s3.json -metricsPort=9324 benchmark: install warp_install pkill weed || true pkill warp || true - weed server -debug=$(debug) -s3 -filer -volume.max=0 -master.volumeSizeLimitMB=100 -volume.preStopSeconds=1 -s3.port=8000 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=false -s3.config=./docker/compose/s3.json & + weed server -debug=$(debug) -s3 -filer -volume.max=0 -master.volumeSizeLimitMB=100 -volume.preStopSeconds=1 -s3.port=8000 -s3.allowDeleteBucketNotEmpty=false -s3.config=./docker/compose/s3.json & warp client & while ! nc -z localhost 8000 ; do sleep 1 ; done warp mixed --host=127.0.0.1:8000 --access-key=some_access_key1 --secret-key=some_secret_key1 --autoterm diff --git a/README.md b/README.md index 381abfff6..7d5ab91f9 100644 --- a/README.md +++ b/README.md @@ -592,65 +592,22 @@ Percentage of the requests served within a certain time (ms) ``` make benchmark -warp: Benchmark data written to "warp-mixed-2023-10-16[102354]-l70a.csv.zst" -Mixed operations. -Operation: DELETE, 10%, Concurrency: 20, Ran 4m59s. - * Throughput: 6.19 obj/s +warp: Benchmark data written to "warp-mixed-2025-12-05[194844]-kBpU.csv.zst" -Operation: GET, 45%, Concurrency: 20, Ran 5m0s. - * Throughput: 279.85 MiB/s, 27.99 obj/s +Mixed operations. +Operation: DELETE, 10%, Concurrency: 20, Ran 42s. + * Throughput: 55.13 obj/s -Operation: PUT, 15%, Concurrency: 20, Ran 5m0s. - * Throughput: 89.86 MiB/s, 8.99 obj/s +Operation: GET, 45%, Concurrency: 20, Ran 42s. + * Throughput: 2477.45 MiB/s, 247.75 obj/s -Operation: STAT, 30%, Concurrency: 20, Ran 5m0s. - * Throughput: 18.63 obj/s +Operation: PUT, 15%, Concurrency: 20, Ran 42s. + * Throughput: 825.85 MiB/s, 82.59 obj/s -Cluster Total: 369.74 MiB/s, 61.79 obj/s, 0 errors over 5m0s. -``` +Operation: STAT, 30%, Concurrency: 20, Ran 42s. + * Throughput: 165.27 obj/s -To see segmented request statistics, use the --analyze.v parameter. -``` -warp analyze --analyze.v warp-mixed-2023-10-16[102354]-l70a.csv.zst -18642 operations loaded... Done! -Mixed operations. 
----------------------------------------- -Operation: DELETE - total: 1854, 10.0%, Concurrency: 20, Ran 5m0s, starting 2023-10-16 10:23:57.115 +0500 +05 - * Throughput: 6.19 obj/s - -Requests considered: 1855: - * Avg: 104ms, 50%: 30ms, 90%: 207ms, 99%: 1.355s, Fastest: 1ms, Slowest: 4.613s, StdDev: 320ms - ----------------------------------------- -Operation: GET - total: 8388, 45.3%, Size: 10485760 bytes. Concurrency: 20, Ran 5m0s, starting 2023-10-16 10:23:57.12 +0500 +05 - * Throughput: 279.77 MiB/s, 27.98 obj/s - -Requests considered: 8389: - * Avg: 221ms, 50%: 106ms, 90%: 492ms, 99%: 1.739s, Fastest: 8ms, Slowest: 8.633s, StdDev: 383ms - * TTFB: Avg: 81ms, Best: 2ms, 25th: 24ms, Median: 39ms, 75th: 65ms, 90th: 171ms, 99th: 669ms, Worst: 4.783s StdDev: 163ms - * First Access: Avg: 240ms, 50%: 105ms, 90%: 511ms, 99%: 2.08s, Fastest: 12ms, Slowest: 8.633s, StdDev: 480ms - * First Access TTFB: Avg: 88ms, Best: 2ms, 25th: 24ms, Median: 38ms, 75th: 64ms, 90th: 179ms, 99th: 919ms, Worst: 4.783s StdDev: 199ms - * Last Access: Avg: 219ms, 50%: 106ms, 90%: 463ms, 99%: 1.782s, Fastest: 9ms, Slowest: 8.633s, StdDev: 416ms - * Last Access TTFB: Avg: 81ms, Best: 2ms, 25th: 24ms, Median: 39ms, 75th: 65ms, 90th: 161ms, 99th: 657ms, Worst: 4.783s StdDev: 176ms - ----------------------------------------- -Operation: PUT - total: 2688, 14.5%, Size: 10485760 bytes. Concurrency: 20, Ran 5m0s, starting 2023-10-16 10:23:57.115 +0500 +05 - * Throughput: 89.83 MiB/s, 8.98 obj/s - -Requests considered: 2689: - * Avg: 1.165s, 50%: 878ms, 90%: 2.015s, 99%: 5.74s, Fastest: 99ms, Slowest: 8.264s, StdDev: 968ms - ----------------------------------------- -Operation: STAT - total: 5586, 30.2%, Concurrency: 20, Ran 5m0s, starting 2023-10-16 10:23:57.113 +0500 +05 - * Throughput: 18.63 obj/s - -Requests considered: 5587: - * Avg: 15ms, 50%: 11ms, 90%: 34ms, 99%: 80ms, Fastest: 0s, Slowest: 245ms, StdDev: 17ms - * First Access: Avg: 14ms, 50%: 10ms, 90%: 33ms, 99%: 69ms, Fastest: 0s, Slowest: 203ms, StdDev: 16ms - * Last Access: Avg: 15ms, 50%: 11ms, 90%: 34ms, 99%: 74ms, Fastest: 0s, Slowest: 203ms, StdDev: 17ms - -Cluster Total: 369.64 MiB/s, 61.77 obj/s, 0 errors over 5m0s. -Total Errors:0. +Cluster Total: 3302.88 MiB/s, 550.51 obj/s over 43s. 
``` [Back to TOC](#table-of-contents) diff --git a/docker/compose/local-s3tests-compose.yml b/docker/compose/local-s3tests-compose.yml index f1961700c..f89261ec7 100644 --- a/docker/compose/local-s3tests-compose.yml +++ b/docker/compose/local-s3tests-compose.yml @@ -24,7 +24,7 @@ services: - 8888:8888 - 18888:18888 - 8000:8000 - command: 'filer -master="master:9333" -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8000 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=false' + command: 'filer -master="master:9333" -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8000 -s3.allowDeleteBucketNotEmpty=false' volumes: - ./s3.json:/etc/seaweedfs/s3.json depends_on: diff --git a/docker/compose/test-tarantool-filer.yml b/docker/compose/test-tarantool-filer.yml index 8f31bf855..a0fa5436a 100644 --- a/docker/compose/test-tarantool-filer.yml +++ b/docker/compose/test-tarantool-filer.yml @@ -15,7 +15,7 @@ services: s3: image: chrislusf/seaweedfs:local - command: "server -ip=127.0.0.1 -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8000 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=false" + command: "server -ip=127.0.0.1 -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8000 -s3.allowDeleteBucketNotEmpty=false" volumes: - ./s3.json:/etc/seaweedfs/s3.json environment: diff --git a/docker/compose/test-ydb-filer.yml b/docker/compose/test-ydb-filer.yml index ddbfe18d0..1e310dfb5 100644 --- a/docker/compose/test-ydb-filer.yml +++ b/docker/compose/test-ydb-filer.yml @@ -20,7 +20,7 @@ services: - 8888:8888 - 8000:8000 - 18888:18888 - command: "server -ip=s3 -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8000 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=false" + command: "server -ip=s3 -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8000 -s3.allowDeleteBucketNotEmpty=false" volumes: - ./s3.json:/etc/seaweedfs/s3.json environment: diff --git a/k8s/charts/seaweedfs/Chart.yaml b/k8s/charts/seaweedfs/Chart.yaml index 379f67890..421b85175 100644 --- a/k8s/charts/seaweedfs/Chart.yaml +++ b/k8s/charts/seaweedfs/Chart.yaml @@ -3,4 +3,4 @@ description: SeaweedFS name: seaweedfs appVersion: "4.01" # Dev note: Trigger a helm chart release by `git tag -a helm-` -version: 4.0.401 \ No newline at end of file +version: 4.0.401 diff --git a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml index 8700a8a69..f6237bb7e 100644 --- a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml +++ b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml @@ -15,9 +15,9 @@ metadata: {{- toYaml .Values.allInOne.annotations | nindent 4 }} {{- end }} spec: - replicas: 1 + replicas: {{ .Values.allInOne.replicas | default 1 }} strategy: - type: Recreate + type: {{ .Values.allInOne.updateStrategy.type | default "Recreate" }} selector: matchLabels: app.kubernetes.io/name: {{ template "seaweedfs.name" . }} @@ -130,12 +130,23 @@ spec: value: {{ include "seaweedfs.cluster.masterAddress" . | quote }} - name: {{ $clusterFilerKey }} value: {{ include "seaweedfs.cluster.filerAddress" . 
| quote }} + {{- if .Values.allInOne.secretExtraEnvironmentVars }} + {{- range $key, $value := .Values.allInOne.secretExtraEnvironmentVars }} + - name: {{ $key }} + valueFrom: + {{ toYaml $value | nindent 16 }} + {{- end }} + {{- end }} command: - "/bin/sh" - "-ec" - | /usr/bin/weed \ + {{- if .Values.allInOne.loggingOverrideLevel }} + -v={{ .Values.allInOne.loggingOverrideLevel }} \ + {{- else }} -v={{ .Values.global.loggingLevel }} \ + {{- end }} server \ -dir=/data \ -master \ @@ -191,6 +202,9 @@ spec: {{- else if .Values.master.metricsPort }} -metricsPort={{ .Values.master.metricsPort }} \ {{- end }} + {{- if .Values.allInOne.metricsIp }} + -metricsIp={{ .Values.allInOne.metricsIp }} \ + {{- end }} -filer \ -filer.port={{ .Values.filer.port }} \ {{- if .Values.filer.disableDirListing }} @@ -219,61 +233,75 @@ spec: {{- end }} {{- if .Values.allInOne.s3.enabled }} -s3 \ - -s3.port={{ .Values.s3.port }} \ - {{- if .Values.s3.domainName }} - -s3.domainName={{ .Values.s3.domainName }} \ + -s3.port={{ .Values.allInOne.s3.port | default .Values.s3.port }} \ + {{- $domainName := .Values.allInOne.s3.domainName | default .Values.s3.domainName }} + {{- if $domainName }} + -s3.domainName={{ $domainName }} \ {{- end }} {{- if .Values.global.enableSecurity }} - {{- if .Values.s3.httpsPort }} - -s3.port.https={{ .Values.s3.httpsPort }} \ + {{- $httpsPort := .Values.allInOne.s3.httpsPort | default .Values.s3.httpsPort }} + {{- if $httpsPort }} + -s3.port.https={{ $httpsPort }} \ {{- end }} -s3.cert.file=/usr/local/share/ca-certificates/client/tls.crt \ -s3.key.file=/usr/local/share/ca-certificates/client/tls.key \ {{- end }} - {{- if eq (typeOf .Values.s3.allowEmptyFolder) "bool" }} - -s3.allowEmptyFolder={{ .Values.s3.allowEmptyFolder }} \ - {{- end }} - {{- if .Values.s3.enableAuth }} + {{- if or .Values.allInOne.s3.enableAuth .Values.s3.enableAuth .Values.filer.s3.enableAuth }} -s3.config=/etc/sw/s3/seaweedfs_s3_config \ {{- end }} - {{- if .Values.s3.auditLogConfig }} + {{- $auditLogConfig := .Values.allInOne.s3.auditLogConfig | default .Values.s3.auditLogConfig }} + {{- if $auditLogConfig }} -s3.auditLogConfig=/etc/sw/s3/s3_auditLogConfig.json \ {{- end }} {{- end }} {{- if .Values.allInOne.sftp.enabled }} -sftp \ - -sftp.port={{ .Values.sftp.port }} \ - {{- if .Values.sftp.sshPrivateKey }} - -sftp.sshPrivateKey={{ .Values.sftp.sshPrivateKey }} \ + -sftp.port={{ .Values.allInOne.sftp.port | default .Values.sftp.port }} \ + {{- $sshPrivateKey := .Values.allInOne.sftp.sshPrivateKey | default .Values.sftp.sshPrivateKey }} + {{- if $sshPrivateKey }} + -sftp.sshPrivateKey={{ $sshPrivateKey }} \ {{- end }} - {{- if .Values.sftp.hostKeysFolder }} - -sftp.hostKeysFolder={{ .Values.sftp.hostKeysFolder }} \ + {{- $hostKeysFolder := .Values.allInOne.sftp.hostKeysFolder | default .Values.sftp.hostKeysFolder }} + {{- if $hostKeysFolder }} + -sftp.hostKeysFolder={{ $hostKeysFolder }} \ {{- end }} - {{- if .Values.sftp.authMethods }} - -sftp.authMethods={{ .Values.sftp.authMethods }} \ + {{- $authMethods := .Values.allInOne.sftp.authMethods | default .Values.sftp.authMethods }} + {{- if $authMethods }} + -sftp.authMethods={{ $authMethods }} \ {{- end }} - {{- if .Values.sftp.maxAuthTries }} - -sftp.maxAuthTries={{ .Values.sftp.maxAuthTries }} \ + {{- $maxAuthTries := .Values.allInOne.sftp.maxAuthTries | default .Values.sftp.maxAuthTries }} + {{- if $maxAuthTries }} + -sftp.maxAuthTries={{ $maxAuthTries }} \ {{- end }} - {{- if .Values.sftp.bannerMessage }} - -sftp.bannerMessage="{{ .Values.sftp.bannerMessage 
}}" \ + {{- $bannerMessage := .Values.allInOne.sftp.bannerMessage | default .Values.sftp.bannerMessage }} + {{- if $bannerMessage }} + -sftp.bannerMessage="{{ $bannerMessage }}" \ {{- end }} - {{- if .Values.sftp.loginGraceTime }} - -sftp.loginGraceTime={{ .Values.sftp.loginGraceTime }} \ + {{- $loginGraceTime := .Values.allInOne.sftp.loginGraceTime | default .Values.sftp.loginGraceTime }} + {{- if $loginGraceTime }} + -sftp.loginGraceTime={{ $loginGraceTime }} \ {{- end }} - {{- if .Values.sftp.clientAliveInterval }} - -sftp.clientAliveInterval={{ .Values.sftp.clientAliveInterval }} \ + {{- $clientAliveInterval := .Values.allInOne.sftp.clientAliveInterval | default .Values.sftp.clientAliveInterval }} + {{- if $clientAliveInterval }} + -sftp.clientAliveInterval={{ $clientAliveInterval }} \ {{- end }} - {{- if .Values.sftp.clientAliveCountMax }} - -sftp.clientAliveCountMax={{ .Values.sftp.clientAliveCountMax }} \ + {{- $clientAliveCountMax := .Values.allInOne.sftp.clientAliveCountMax | default .Values.sftp.clientAliveCountMax }} + {{- if $clientAliveCountMax }} + -sftp.clientAliveCountMax={{ $clientAliveCountMax }} \ {{- end }} + {{- if or .Values.allInOne.sftp.enableAuth .Values.sftp.enableAuth }} -sftp.userStoreFile=/etc/sw/sftp/seaweedfs_sftp_config \ {{- end }} + {{- end }} + {{- $extraArgsCount := len .Values.allInOne.extraArgs }} + {{- range $i, $arg := .Values.allInOne.extraArgs }} + {{ $arg | quote }}{{ if ne (add1 $i) $extraArgsCount }} \{{ end }} + {{- end }} volumeMounts: - name: data mountPath: /data - {{- if and .Values.allInOne.s3.enabled (or .Values.s3.enableAuth .Values.filer.s3.enableAuth) }} + {{- if and .Values.allInOne.s3.enabled (or .Values.allInOne.s3.enableAuth .Values.s3.enableAuth .Values.filer.s3.enableAuth) }} - name: config-s3-users mountPath: /etc/sw/s3 readOnly: true @@ -282,10 +310,12 @@ spec: - name: config-ssh mountPath: /etc/sw/ssh readOnly: true + {{- if or .Values.allInOne.sftp.enableAuth .Values.sftp.enableAuth }} - mountPath: /etc/sw/sftp name: config-users readOnly: true {{- end }} + {{- end }} {{- if .Values.filer.notificationConfig }} - name: notification-config mountPath: /etc/seaweedfs/notification.toml @@ -332,15 +362,16 @@ spec: - containerPort: {{ .Values.filer.grpcPort }} name: swfs-fil-grpc {{- if .Values.allInOne.s3.enabled }} - - containerPort: {{ .Values.s3.port }} + - containerPort: {{ .Values.allInOne.s3.port | default .Values.s3.port }} name: swfs-s3 - {{- if .Values.s3.httpsPort }} - - containerPort: {{ .Values.s3.httpsPort }} + {{- $httpsPort := .Values.allInOne.s3.httpsPort | default .Values.s3.httpsPort }} + {{- if $httpsPort }} + - containerPort: {{ $httpsPort }} name: swfs-s3-tls {{- end }} {{- end }} {{- if .Values.allInOne.sftp.enabled }} - - containerPort: {{ .Values.sftp.port }} + - containerPort: {{ .Values.allInOne.sftp.port | default .Values.sftp.port }} name: swfs-sftp {{- end }} {{- if .Values.allInOne.metricsPort }} @@ -352,7 +383,7 @@ spec: httpGet: path: {{ .Values.allInOne.readinessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ .Values.allInOne.readinessProbe.scheme }} + scheme: {{ .Values.allInOne.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.allInOne.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.allInOne.readinessProbe.periodSeconds }} successThreshold: {{ .Values.allInOne.readinessProbe.successThreshold }} @@ -364,7 +395,7 @@ spec: httpGet: path: {{ .Values.allInOne.livenessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ 
.Values.allInOne.livenessProbe.scheme }} + scheme: {{ .Values.allInOne.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.allInOne.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.allInOne.livenessProbe.periodSeconds }} successThreshold: {{ .Values.allInOne.livenessProbe.successThreshold }} @@ -389,26 +420,31 @@ spec: path: {{ .Values.allInOne.data.hostPathPrefix }}/seaweedfs-all-in-one-data/ type: DirectoryOrCreate {{- else if eq .Values.allInOne.data.type "persistentVolumeClaim" }} + persistentVolumeClaim: + claimName: {{ template "seaweedfs.name" . }}-all-in-one-data + {{- else if eq .Values.allInOne.data.type "existingClaim" }} persistentVolumeClaim: claimName: {{ .Values.allInOne.data.claimName }} {{- else if eq .Values.allInOne.data.type "emptyDir" }} emptyDir: {} {{- end }} - {{- if and .Values.allInOne.s3.enabled (or .Values.s3.enableAuth .Values.filer.s3.enableAuth) }} + {{- if and .Values.allInOne.s3.enabled (or .Values.allInOne.s3.enableAuth .Values.s3.enableAuth .Values.filer.s3.enableAuth) }} - name: config-s3-users secret: defaultMode: 420 - secretName: {{ default (printf "%s-s3-secret" (include "seaweedfs.name" .)) (or .Values.s3.existingConfigSecret .Values.filer.s3.existingConfigSecret) }} + secretName: {{ default (printf "%s-s3-secret" (include "seaweedfs.name" .)) (or .Values.allInOne.s3.existingConfigSecret .Values.s3.existingConfigSecret .Values.filer.s3.existingConfigSecret) }} {{- end }} {{- if .Values.allInOne.sftp.enabled }} - name: config-ssh secret: defaultMode: 420 - secretName: {{ default (printf "%s-sftp-ssh-secret" (include "seaweedfs.name" .)) .Values.sftp.existingSshConfigSecret }} + secretName: {{ default (printf "%s-sftp-ssh-secret" (include "seaweedfs.name" .)) (or .Values.allInOne.sftp.existingSshConfigSecret .Values.sftp.existingSshConfigSecret) }} + {{- if or .Values.allInOne.sftp.enableAuth .Values.sftp.enableAuth }} - name: config-users secret: defaultMode: 420 - secretName: {{ default (printf "%s-sftp-secret" (include "seaweedfs.name" .)) .Values.sftp.existingConfigSecret }} + secretName: {{ default (printf "%s-sftp-secret" (include "seaweedfs.name" .)) (or .Values.allInOne.sftp.existingConfigSecret .Values.sftp.existingConfigSecret) }} + {{- end }} {{- end }} {{- if .Values.filer.notificationConfig }} - name: notification-config diff --git a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-pvc.yaml b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-pvc.yaml index 49ac20148..a62450c3d 100644 --- a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-pvc.yaml +++ b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-pvc.yaml @@ -1,21 +1,28 @@ -{{- if and .Values.allInOne.enabled (eq .Values.allInOne.data.type "persistentVolumeClaim") }} +{{- if .Values.allInOne.enabled }} +{{- if eq .Values.allInOne.data.type "persistentVolumeClaim" }} apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: {{ .Values.allInOne.data.claimName }} + name: {{ template "seaweedfs.name" . }}-all-in-one-data + namespace: {{ .Release.Namespace }} labels: + app.kubernetes.io/name: {{ template "seaweedfs.name" . }} + helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/component: seaweedfs-all-in-one - {{- if .Values.allInOne.annotations }} + {{- with .Values.allInOne.data.annotations }} annotations: - {{- toYaml .Values.allInOne.annotations | nindent 4 }} + {{- toYaml . 
| nindent 4 }} {{- end }} spec: accessModes: - - ReadWriteOnce - resources: - requests: - storage: {{ .Values.allInOne.data.size }} + {{- toYaml (.Values.allInOne.data.accessModes | default (list "ReadWriteOnce")) | nindent 4 }} {{- if .Values.allInOne.data.storageClass }} storageClassName: {{ .Values.allInOne.data.storageClass }} {{- end }} -{{- end }} \ No newline at end of file + resources: + requests: + storage: {{ .Values.allInOne.data.size | default "10Gi" }} +{{- end }} +{{- end }} diff --git a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-service.yml b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-service.yml index 14076a9c3..b13f57899 100644 --- a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-service.yml +++ b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-service.yml @@ -15,6 +15,7 @@ metadata: {{- toYaml .Values.allInOne.service.annotations | nindent 4 }} {{- end }} spec: + type: {{ .Values.allInOne.service.type | default "ClusterIP" }} internalTrafficPolicy: {{ .Values.allInOne.service.internalTrafficPolicy | default "Cluster" }} ports: # Master ports @@ -50,13 +51,14 @@ spec: # S3 ports (if enabled) {{- if .Values.allInOne.s3.enabled }} - name: "swfs-s3" - port: {{ if .Values.allInOne.s3.enabled }}{{ .Values.s3.port }}{{ else }}{{ .Values.filer.s3.port }}{{ end }} - targetPort: {{ if .Values.allInOne.s3.enabled }}{{ .Values.s3.port }}{{ else }}{{ .Values.filer.s3.port }}{{ end }} + port: {{ .Values.allInOne.s3.port | default .Values.s3.port }} + targetPort: {{ .Values.allInOne.s3.port | default .Values.s3.port }} protocol: TCP - {{- if and .Values.allInOne.s3.enabled .Values.s3.httpsPort }} + {{- $httpsPort := .Values.allInOne.s3.httpsPort | default .Values.s3.httpsPort }} + {{- if $httpsPort }} - name: "swfs-s3-tls" - port: {{ .Values.s3.httpsPort }} - targetPort: {{ .Values.s3.httpsPort }} + port: {{ $httpsPort }} + targetPort: {{ $httpsPort }} protocol: TCP {{- end }} {{- end }} @@ -64,8 +66,8 @@ spec: # SFTP ports (if enabled) {{- if .Values.allInOne.sftp.enabled }} - name: "swfs-sftp" - port: {{ .Values.sftp.port }} - targetPort: {{ .Values.sftp.port }} + port: {{ .Values.allInOne.sftp.port | default .Values.sftp.port }} + targetPort: {{ .Values.allInOne.sftp.port | default .Values.sftp.port }} protocol: TCP {{- end }} @@ -80,4 +82,4 @@ spec: selector: app.kubernetes.io/name: {{ template "seaweedfs.name" . 
}} app.kubernetes.io/component: seaweedfs-all-in-one -{{- end }} \ No newline at end of file +{{- end }} diff --git a/k8s/charts/seaweedfs/templates/filer/filer-ingress.yaml b/k8s/charts/seaweedfs/templates/filer/filer-ingress.yaml index 9ce15ae90..b185a58ba 100644 --- a/k8s/charts/seaweedfs/templates/filer/filer-ingress.yaml +++ b/k8s/charts/seaweedfs/templates/filer/filer-ingress.yaml @@ -1,5 +1,8 @@ -{{- if .Values.filer.enabled }} -{{- if .Values.filer.ingress.enabled }} +{{- /* Filer ingress works for both normal mode (filer.enabled) and all-in-one mode (allInOne.enabled) */}} +{{- $filerEnabled := or .Values.filer.enabled .Values.allInOne.enabled }} +{{- if and $filerEnabled .Values.filer.ingress.enabled }} +{{- /* Determine service name based on deployment mode */}} +{{- $serviceName := ternary (printf "%s-all-in-one" (include "seaweedfs.name" .)) (printf "%s-filer" (include "seaweedfs.name" .)) .Values.allInOne.enabled }} {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion }} apiVersion: networking.k8s.io/v1 {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }} @@ -33,16 +36,14 @@ spec: backend: {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion }} service: - name: {{ template "seaweedfs.name" . }}-filer + name: {{ $serviceName }} port: number: {{ .Values.filer.port }} - #name: {{- else }} - serviceName: {{ template "seaweedfs.name" . }}-filer + serviceName: {{ $serviceName }} servicePort: {{ .Values.filer.port }} {{- end }} {{- if .Values.filer.ingress.host }} host: {{ .Values.filer.ingress.host }} {{- end }} {{- end }} -{{- end }} diff --git a/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml b/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml index 5aeccfa02..2b8c27449 100644 --- a/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml +++ b/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml @@ -213,9 +213,6 @@ spec: -s3.cert.file=/usr/local/share/ca-certificates/client/tls.crt \ -s3.key.file=/usr/local/share/ca-certificates/client/tls.key \ {{- end }} - {{- if eq (typeOf .Values.filer.s3.allowEmptyFolder) "bool" }} - -s3.allowEmptyFolder={{ .Values.filer.s3.allowEmptyFolder }} \ - {{- end }} {{- if .Values.filer.s3.enableAuth }} -s3.config=/etc/sw/seaweedfs_s3_config \ {{- end }} @@ -289,7 +286,7 @@ spec: httpGet: path: {{ .Values.filer.readinessProbe.httpGet.path }} port: {{ .Values.filer.port }} - scheme: {{ .Values.filer.readinessProbe.scheme }} + scheme: {{ .Values.filer.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.filer.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.filer.readinessProbe.periodSeconds }} successThreshold: {{ .Values.filer.readinessProbe.successThreshold }} @@ -301,7 +298,7 @@ spec: httpGet: path: {{ .Values.filer.livenessProbe.httpGet.path }} port: {{ .Values.filer.port }} - scheme: {{ .Values.filer.livenessProbe.scheme }} + scheme: {{ .Values.filer.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.filer.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.filer.livenessProbe.periodSeconds }} successThreshold: {{ .Values.filer.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml b/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml index 704a33b80..a70673454 100644 --- a/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml +++ b/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml @@ -235,7 +235,7 @@ spec: httpGet: path: {{ 
.Values.master.readinessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ .Values.master.readinessProbe.scheme }} + scheme: {{ .Values.master.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.master.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.master.readinessProbe.periodSeconds }} successThreshold: {{ .Values.master.readinessProbe.successThreshold }} @@ -247,7 +247,7 @@ spec: httpGet: path: {{ .Values.master.livenessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ .Values.master.livenessProbe.scheme }} + scheme: {{ .Values.master.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.master.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.master.livenessProbe.periodSeconds }} successThreshold: {{ .Values.master.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml b/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml index 0c6d52c3e..29dd2d434 100644 --- a/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml +++ b/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml @@ -143,9 +143,6 @@ spec: {{- if .Values.s3.domainName }} -domainName={{ .Values.s3.domainName }} \ {{- end }} - {{- if eq (typeOf .Values.s3.allowEmptyFolder) "bool" }} - -allowEmptyFolder={{ .Values.s3.allowEmptyFolder }} \ - {{- end }} {{- if .Values.s3.enableAuth }} -config=/etc/sw/seaweedfs_s3_config \ {{- end }} @@ -204,7 +201,7 @@ spec: httpGet: path: {{ .Values.s3.readinessProbe.httpGet.path }} port: {{ .Values.s3.port }} - scheme: {{ .Values.s3.readinessProbe.scheme }} + scheme: {{ .Values.s3.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.s3.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.s3.readinessProbe.periodSeconds }} successThreshold: {{ .Values.s3.readinessProbe.successThreshold }} @@ -216,7 +213,7 @@ spec: httpGet: path: {{ .Values.s3.livenessProbe.httpGet.path }} port: {{ .Values.s3.port }} - scheme: {{ .Values.s3.livenessProbe.scheme }} + scheme: {{ .Values.s3.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.s3.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.s3.livenessProbe.periodSeconds }} successThreshold: {{ .Values.s3.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/templates/s3/s3-ingress.yaml b/k8s/charts/seaweedfs/templates/s3/s3-ingress.yaml index a856923e9..899773ae3 100644 --- a/k8s/charts/seaweedfs/templates/s3/s3-ingress.yaml +++ b/k8s/charts/seaweedfs/templates/s3/s3-ingress.yaml @@ -1,4 +1,9 @@ -{{- if .Values.s3.ingress.enabled }} +{{- /* S3 ingress works for standalone S3 gateway (s3.enabled), S3 on Filer (filer.s3.enabled), and all-in-one mode (allInOne.s3.enabled) */}} +{{- $s3Enabled := or .Values.s3.enabled (and .Values.filer.s3.enabled (not .Values.allInOne.enabled)) (and .Values.allInOne.enabled .Values.allInOne.s3.enabled) }} +{{- if and $s3Enabled .Values.s3.ingress.enabled }} +{{- /* Determine service name based on deployment mode */}} +{{- $serviceName := ternary (printf "%s-all-in-one" (include "seaweedfs.name" .)) (printf "%s-s3" (include "seaweedfs.name" .)) .Values.allInOne.enabled }} +{{- $s3Port := .Values.allInOne.s3.port | default .Values.s3.port }} {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion }} apiVersion: networking.k8s.io/v1 {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }} @@ -32,13 +37,12 @@ spec: backend: {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion }} service: - name: {{ 
template "seaweedfs.name" . }}-s3 + name: {{ $serviceName }} port: - number: {{ .Values.s3.port }} - #name: + number: {{ $s3Port }} {{- else }} - serviceName: {{ template "seaweedfs.name" . }}-s3 - servicePort: {{ .Values.s3.port }} + serviceName: {{ $serviceName }} + servicePort: {{ $s3Port }} {{- end }} {{- if .Values.s3.ingress.host }} host: {{ .Values.s3.ingress.host | quote }} diff --git a/k8s/charts/seaweedfs/templates/shared/post-install-bucket-hook.yaml b/k8s/charts/seaweedfs/templates/shared/post-install-bucket-hook.yaml index 44d650898..a0c56edc4 100644 --- a/k8s/charts/seaweedfs/templates/shared/post-install-bucket-hook.yaml +++ b/k8s/charts/seaweedfs/templates/shared/post-install-bucket-hook.yaml @@ -1,6 +1,32 @@ -{{- if .Values.master.enabled }} -{{- if .Values.filer.s3.enabled }} -{{- if .Values.filer.s3.createBuckets }} +{{- /* Support bucket creation for both standalone filer.s3 and allInOne modes */}} +{{- $createBuckets := list }} +{{- $s3Enabled := false }} +{{- $enableAuth := false }} +{{- $existingConfigSecret := "" }} + +{{- /* Check allInOne mode first */}} +{{- if .Values.allInOne.enabled }} + {{- if .Values.allInOne.s3.enabled }} + {{- $s3Enabled = true }} + {{- if .Values.allInOne.s3.createBuckets }} + {{- $createBuckets = .Values.allInOne.s3.createBuckets }} + {{- end }} + {{- $enableAuth = or .Values.allInOne.s3.enableAuth .Values.s3.enableAuth .Values.filer.s3.enableAuth }} + {{- $existingConfigSecret = or .Values.allInOne.s3.existingConfigSecret .Values.s3.existingConfigSecret .Values.filer.s3.existingConfigSecret }} + {{- end }} +{{- else if .Values.master.enabled }} + {{- /* Check standalone filer.s3 mode */}} + {{- if .Values.filer.s3.enabled }} + {{- $s3Enabled = true }} + {{- if .Values.filer.s3.createBuckets }} + {{- $createBuckets = .Values.filer.s3.createBuckets }} + {{- end }} + {{- $enableAuth = .Values.filer.s3.enableAuth }} + {{- $existingConfigSecret = .Values.filer.s3.existingConfigSecret }} + {{- end }} +{{- end }} + +{{- if and $s3Enabled $createBuckets }} --- apiVersion: batch/v1 kind: Job @@ -32,9 +58,9 @@ spec: - name: WEED_CLUSTER_DEFAULT value: "sw" - name: WEED_CLUSTER_SW_MASTER - value: "{{ template "seaweedfs.name" . }}-master.{{ .Release.Namespace }}:{{ .Values.master.port }}" + value: {{ include "seaweedfs.cluster.masterAddress" . | quote }} - name: WEED_CLUSTER_SW_FILER - value: "{{ template "seaweedfs.name" . }}-filer-client.{{ .Release.Namespace }}:{{ .Values.filer.port }}" + value: {{ include "seaweedfs.cluster.filerAddress" . 
| quote }} - name: POD_IP valueFrom: fieldRef: @@ -71,24 +97,29 @@ spec: echo "Service at $url failed to become ready within 5 minutes" exit 1 } + {{- if .Values.allInOne.enabled }} + wait_for_service "http://$WEED_CLUSTER_SW_MASTER{{ .Values.allInOne.readinessProbe.httpGet.path }}" + wait_for_service "http://$WEED_CLUSTER_SW_FILER{{ .Values.filer.readinessProbe.httpGet.path }}" + {{- else }} wait_for_service "http://$WEED_CLUSTER_SW_MASTER{{ .Values.master.readinessProbe.httpGet.path }}" wait_for_service "http://$WEED_CLUSTER_SW_FILER{{ .Values.filer.readinessProbe.httpGet.path }}" - {{- range $reg, $props := $.Values.filer.s3.createBuckets }} - exec /bin/echo \ - "s3.bucket.create --name {{ $props.name }}" |\ + {{- end }} + {{- range $createBuckets }} + /bin/echo \ + "s3.bucket.create --name {{ .name }}" |\ /usr/bin/weed shell {{- end }} - {{- range $reg, $props := $.Values.filer.s3.createBuckets }} - {{- if $props.anonymousRead }} - exec /bin/echo \ + {{- range $createBuckets }} + {{- if .anonymousRead }} + /bin/echo \ "s3.configure --user anonymous \ - --buckets {{ $props.name }} \ + --buckets {{ .name }} \ --actions Read \ --apply true" |\ /usr/bin/weed shell {{- end }} {{- end }} - {{- if .Values.filer.s3.enableAuth }} + {{- if $enableAuth }} volumeMounts: - name: config-users mountPath: /etc/sw @@ -106,17 +137,15 @@ spec: {{- if .Values.filer.containerSecurityContext.enabled }} securityContext: {{- omit .Values.filer.containerSecurityContext "enabled" | toYaml | nindent 12 }} {{- end }} - {{- if .Values.filer.s3.enableAuth }} + {{- if $enableAuth }} volumes: - name: config-users secret: defaultMode: 420 - {{- if not (empty .Values.filer.s3.existingConfigSecret) }} - secretName: {{ .Values.filer.s3.existingConfigSecret }} + {{- if $existingConfigSecret }} + secretName: {{ $existingConfigSecret }} {{- else }} - secretName: seaweedfs-s3-secret + secretName: {{ template "seaweedfs.name" . 
}}-s3-secret {{- end }} - {{- end }}{{/** if .Values.filer.s3.enableAuth **/}} -{{- end }}{{/** if .Values.master.enabled **/}} -{{- end }}{{/** if .Values.filer.s3.enabled **/}} -{{- end }}{{/** if .Values.filer.s3.createBuckets **/}} + {{- end }} +{{- end }} diff --git a/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml b/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml index 29a035a2b..1a8964a55 100644 --- a/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml +++ b/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml @@ -251,7 +251,7 @@ spec: httpGet: path: {{ $volume.readinessProbe.httpGet.path }} port: {{ $volume.port }} - scheme: {{ $volume.readinessProbe.scheme }} + scheme: {{ $volume.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ $volume.readinessProbe.initialDelaySeconds }} periodSeconds: {{ $volume.readinessProbe.periodSeconds }} successThreshold: {{ $volume.readinessProbe.successThreshold }} @@ -263,7 +263,7 @@ spec: httpGet: path: {{ $volume.livenessProbe.httpGet.path }} port: {{ $volume.port }} - scheme: {{ $volume.livenessProbe.scheme }} + scheme: {{ $volume.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ $volume.livenessProbe.initialDelaySeconds }} periodSeconds: {{ $volume.livenessProbe.periodSeconds }} successThreshold: {{ $volume.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/values.yaml b/k8s/charts/seaweedfs/values.yaml index 547b05479..2e8bb12e6 100644 --- a/k8s/charts/seaweedfs/values.yaml +++ b/k8s/charts/seaweedfs/values.yaml @@ -856,8 +856,6 @@ filer: port: 8333 # add additional https port httpsPort: 0 - # allow empty folders - allowEmptyFolder: false # Suffix of the host name, {bucket}.{domainName} domainName: "" # enable user & permission to s3 (need to inject to all services) @@ -885,8 +883,6 @@ s3: httpsPort: 0 metricsPort: 9327 loggingOverrideLevel: null - # allow empty folders - allowEmptyFolder: true # enable user & permission to s3 (need to inject to all services) enableAuth: false # set to the name of an existing kubernetes Secret with the s3 json config file @@ -979,9 +975,9 @@ s3: extraEnvironmentVars: # Custom command line arguments to add to the s3 command - # Example to fix connection idle seconds: - extraArgs: ["-idleTimeout=30"] - # extraArgs: [] + # Default idleTimeout is 120 seconds. Example to customize: + # extraArgs: ["-idleTimeout=300"] + extraArgs: [] # used to configure livenessProbe on s3 containers # @@ -1097,6 +1093,7 @@ allInOne: enabled: false imageOverride: null restartPolicy: Always + replicas: 1 # Number of replicas (note: multiple replicas may require shared storage) # Core configuration idleTimeout: 30 # Connection idle seconds @@ -1108,24 +1105,85 @@ allInOne: metricsIp: "" # Metrics listen IP. If empty, defaults to bindAddress loggingOverrideLevel: null # Override logging level - # Service configuration + # Custom command line arguments to add to the server command + # Example to fix IPv6 metrics connectivity issues: + # extraArgs: ["-metricsIp", "0.0.0.0"] + # Example with multiple args: + # extraArgs: ["-customFlag", "value", "-anotherFlag"] + extraArgs: [] + + # Update strategy configuration + # type: Recreate or RollingUpdate + # For single replica, Recreate is recommended to avoid data conflicts. + # For multiple replicas with RollingUpdate, you MUST use shared storage + # (e.g., data.type: persistentVolumeClaim with ReadWriteMany access mode) + # to avoid data loss or inconsistency between pods. 
+ updateStrategy: + type: Recreate + + # S3 gateway configuration + # Note: Most parameters below default to null, which means they inherit from + # the global s3.* settings. Set explicit values here to override for allInOne only. s3: enabled: false # Whether to enable S3 gateway + port: null # S3 gateway port (null inherits from s3.port) + httpsPort: null # S3 gateway HTTPS port (null inherits from s3.httpsPort) + domainName: null # Suffix of the host name (null inherits from s3.domainName) + enableAuth: false # Enable user & permission to S3 + # Set to the name of an existing kubernetes Secret with the s3 json config file + # should have a secret key called seaweedfs_s3_config with an inline json config + existingConfigSecret: null + auditLogConfig: null # S3 audit log configuration (null inherits from s3.auditLogConfig) + # You may specify buckets to be created during the install process. + # Buckets may be exposed publicly by setting `anonymousRead` to `true` + # createBuckets: + # - name: bucket-a + # anonymousRead: true + # - name: bucket-b + # anonymousRead: false + + # SFTP server configuration + # Note: Most parameters below default to null, which means they inherit from + # the global sftp.* settings. Set explicit values here to override for allInOne only. sftp: enabled: false # Whether to enable SFTP server + port: null # SFTP port (null inherits from sftp.port) + sshPrivateKey: null # Path to SSH private key (null inherits from sftp.sshPrivateKey) + hostKeysFolder: null # Path to SSH host keys folder (null inherits from sftp.hostKeysFolder) + authMethods: null # Comma-separated auth methods (null inherits from sftp.authMethods) + maxAuthTries: null # Maximum authentication attempts (null inherits from sftp.maxAuthTries) + bannerMessage: null # Banner message (null inherits from sftp.bannerMessage) + loginGraceTime: null # Login grace time (null inherits from sftp.loginGraceTime) + clientAliveInterval: null # Client keep-alive interval (null inherits from sftp.clientAliveInterval) + clientAliveCountMax: null # Maximum missed keep-alive messages (null inherits from sftp.clientAliveCountMax) + enableAuth: false # Enable SFTP authentication + # Set to the name of an existing kubernetes Secret with the sftp json config file + existingConfigSecret: null + # Set to the name of an existing kubernetes Secret with the SSH keys + existingSshConfigSecret: null # Service settings service: annotations: {} # Annotations for the service type: ClusterIP # Service type (ClusterIP, NodePort, LoadBalancer) + internalTrafficPolicy: Cluster # Internal traffic policy + + # Note: For ingress in all-in-one mode, use the standard s3.ingress and + # filer.ingress settings. The templates automatically detect all-in-one mode + # and point to the correct service (seaweedfs-all-in-one instead of + # seaweedfs-s3 or seaweedfs-filer). # Storage configuration data: - type: "emptyDir" # Options: "hostPath", "persistentVolumeClaim", "emptyDir" + type: "emptyDir" # Options: "hostPath", "persistentVolumeClaim", "emptyDir", "existingClaim" hostPathPrefix: /mnt/data # Path prefix for hostPath volumes - claimName: seaweedfs-data-pvc # Name of the PVC to use - size: "" # Size of the PVC - storageClass: "" # Storage class for the PVC + claimName: seaweedfs-data-pvc # Name of the PVC to use (for existingClaim type) + size: null # Size of the PVC (null defaults to 10Gi for persistentVolumeClaim type) + storageClass: null # Storage class for the PVC (null uses cluster default) + # accessModes for the PVC. 
Default is ["ReadWriteOnce"]. + # For multi-replica deployments, use ["ReadWriteMany"] with a compatible storage class. + accessModes: [] + annotations: {} # Annotations for the PVC # Health checks readinessProbe: @@ -1133,7 +1191,7 @@ allInOne: httpGet: path: /cluster/status port: 9333 - scheme: HTTP + scheme: HTTP initialDelaySeconds: 10 periodSeconds: 15 successThreshold: 1 @@ -1145,7 +1203,7 @@ allInOne: httpGet: path: /cluster/status port: 9333 - scheme: HTTP + scheme: HTTP initialDelaySeconds: 20 periodSeconds: 30 successThreshold: 1 @@ -1154,6 +1212,18 @@ allInOne: # Additional resources extraEnvironmentVars: {} # Additional environment variables + # Secret environment variables (for database credentials, etc.) + # Example: + # secretExtraEnvironmentVars: + # WEED_POSTGRES_USERNAME: + # secretKeyRef: + # name: postgres-credentials + # key: username + # WEED_POSTGRES_PASSWORD: + # secretKeyRef: + # name: postgres-credentials + # key: password + secretExtraEnvironmentVars: {} extraVolumeMounts: "" # Additional volume mounts extraVolumes: "" # Additional volumes initContainers: "" # Init containers @@ -1173,7 +1243,7 @@ allInOne: matchLabels: app.kubernetes.io/name: {{ template "seaweedfs.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} - app.kubernetes.io/component: master + app.kubernetes.io/component: seaweedfs-all-in-one topologyKey: kubernetes.io/hostname # Topology Spread Constraints Settings @@ -1181,16 +1251,16 @@ allInOne: # for a PodSpec. By Default no constraints are set. topologySpreadConstraints: "" - # Toleration Settings for master pods + # Toleration Settings for pods # This should be a multi-line string matching the Toleration array # in a PodSpec. tolerations: "" - # nodeSelector labels for master pod assignment, formatted as a muli-line string. + # nodeSelector labels for pod assignment, formatted as a muli-line string. 
# ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector nodeSelector: "" - # Used to assign priority to master pods + # Used to assign priority to pods # ref: https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/ priorityClassName: "" diff --git a/test/foundationdb/docker-compose.arm64.yml b/test/foundationdb/docker-compose.arm64.yml index 9c8f091e9..c2e7e8586 100644 --- a/test/foundationdb/docker-compose.arm64.yml +++ b/test/foundationdb/docker-compose.arm64.yml @@ -147,7 +147,7 @@ services: - "8888:8888" - "8333:8333" - "18888:18888" - command: "server -ip=seaweedfs -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8333 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=false" + command: "server -ip=seaweedfs -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8333 -s3.allowDeleteBucketNotEmpty=false" volumes: - ./s3.json:/etc/seaweedfs/s3.json - ./filer.toml:/etc/seaweedfs/filer.toml diff --git a/test/foundationdb/docker-compose.yml b/test/foundationdb/docker-compose.yml index a1257d5c9..933cd41ec 100644 --- a/test/foundationdb/docker-compose.yml +++ b/test/foundationdb/docker-compose.yml @@ -116,7 +116,7 @@ services: - WEED_FOUNDATIONDB_MAX_RETRY_DELAY - WEED_MASTER_VOLUME_GROWTH_COPY_1=1 - WEED_MASTER_VOLUME_GROWTH_COPY_OTHER=1 - command: "weed server -ip=seaweedfs -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8333 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=false" + command: "weed server -ip=seaweedfs -filer -master.volumeSizeLimitMB=16 -volume.max=0 -volume -volume.preStopSeconds=1 -s3 -s3.config=/etc/seaweedfs/s3.json -s3.port=8333 -s3.allowDeleteBucketNotEmpty=false" configs: fdb.cluster: diff --git a/test/postgres/docker-compose.yml b/test/postgres/docker-compose.yml index 6d222f83d..87c36d0e8 100644 --- a/test/postgres/docker-compose.yml +++ b/test/postgres/docker-compose.yml @@ -30,7 +30,6 @@ services: - -s3=true - -s3.port=8333 - -webdav=false - - -s3.allowEmptyFolder=false - -mq.broker=true - -mq.agent=true - -ip=seaweedfs diff --git a/test/s3/cors/Makefile b/test/s3/cors/Makefile index e59124a6a..3164d1341 100644 --- a/test/s3/cors/Makefile +++ b/test/s3/cors/Makefile @@ -79,12 +79,11 @@ start-server: check-deps @echo "🔍 DEBUG: Creating volume directory..." @mkdir -p ./test-volume-data @echo "🔍 DEBUG: Launching SeaweedFS server in background..." 
- @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -debug -s3 -s3.port=$(S3_PORT) -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -filer -filer.maxMB=64 -master.volumeSizeLimitMB=50 -volume.max=100 -dir=./test-volume-data -volume.preStopSeconds=1 -metricsPort=9324" + @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -debug -s3 -s3.port=$(S3_PORT) -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -filer -filer.maxMB=64 -master.volumeSizeLimitMB=50 -volume.max=100 -dir=./test-volume-data -volume.preStopSeconds=1 -metricsPort=9324" @$(WEED_BINARY) server \ -debug \ -s3 \ -s3.port=$(S3_PORT) \ - -s3.allowEmptyFolder=false \ -s3.allowDeleteBucketNotEmpty=true \ -s3.config=../../../docker/compose/s3.json \ -filer \ diff --git a/test/s3/retention/Makefile b/test/s3/retention/Makefile index 092d2caac..3277e1db0 100644 --- a/test/s3/retention/Makefile +++ b/test/s3/retention/Makefile @@ -81,12 +81,11 @@ start-server: check-deps @echo "🔍 DEBUG: Creating volume directory..." @mkdir -p ./test-volume-data @echo "🔍 DEBUG: Launching SeaweedFS server in background..." - @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -debug -s3 -s3.port=$(S3_PORT) -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -filer -filer.maxMB=64 -master.volumeSizeLimitMB=50 -volume.max=100 -dir=./test-volume-data -volume.preStopSeconds=1 -metricsPort=9324" + @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -debug -s3 -s3.port=$(S3_PORT) -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -filer -filer.maxMB=64 -master.volumeSizeLimitMB=50 -volume.max=100 -dir=./test-volume-data -volume.preStopSeconds=1 -metricsPort=9324" @$(WEED_BINARY) server \ -debug \ -s3 \ -s3.port=$(S3_PORT) \ - -s3.allowEmptyFolder=false \ -s3.allowDeleteBucketNotEmpty=true \ -s3.config=../../../docker/compose/s3.json \ -filer \ diff --git a/test/s3/retention/s3_object_lock_headers_test.go b/test/s3/retention/s3_object_lock_headers_test.go index bf7283617..fad9e6fbb 100644 --- a/test/s3/retention/s3_object_lock_headers_test.go +++ b/test/s3/retention/s3_object_lock_headers_test.go @@ -236,7 +236,7 @@ func TestObjectLockHeadersNonVersionedBucket(t *testing.T) { bucketName := getNewBucketName() // Create regular bucket without object lock/versioning - createBucket(t, client, bucketName) + createBucketWithoutObjectLock(t, client, bucketName) defer deleteBucket(t, client, bucketName) key := "test-non-versioned" diff --git a/test/s3/retention/s3_retention_test.go b/test/s3/retention/s3_retention_test.go index 8477a50bf..4abdf6d87 100644 --- a/test/s3/retention/s3_retention_test.go +++ b/test/s3/retention/s3_retention_test.go @@ -69,8 +69,19 @@ func getNewBucketName() string { return fmt.Sprintf("%s%d", defaultConfig.BucketPrefix, timestamp) } -// createBucket creates a new bucket for testing +// createBucket creates a new bucket for testing with Object Lock enabled +// Object Lock is required for retention and legal hold functionality per AWS S3 specification func createBucket(t *testing.T, client *s3.Client, bucketName string) { + _, err := client.CreateBucket(context.TODO(), &s3.CreateBucketInput{ + Bucket: aws.String(bucketName), + ObjectLockEnabledForBucket: aws.Bool(true), + }) + require.NoError(t, err) +} + +// createBucketWithoutObjectLock creates a new bucket without Object Lock enabled +// Use this only for tests that specifically need to verify non-Object-Lock bucket behavior +func 
createBucketWithoutObjectLock(t *testing.T, client *s3.Client, bucketName string) { _, err := client.CreateBucket(context.TODO(), &s3.CreateBucketInput{ Bucket: aws.String(bucketName), }) diff --git a/test/s3/tagging/Makefile b/test/s3/tagging/Makefile index aa2f18f7c..c495d1a40 100644 --- a/test/s3/tagging/Makefile +++ b/test/s3/tagging/Makefile @@ -77,7 +77,7 @@ start-server: check-deps @echo "🔍 DEBUG: Creating volume directory..." @mkdir -p ./test-volume-data @echo "🔍 DEBUG: Launching SeaweedFS server in background..." - @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -filer -filer.maxMB=64 -s3 -ip.bind 0.0.0.0 -dir=./test-volume-data -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 -volume.max=100 -volume.preStopSeconds=1 -master.port=$(MASTER_PORT) -volume.port=$(VOLUME_PORT) -filer.port=$(FILER_PORT) -s3.port=$(S3_PORT) -metricsPort=9329 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -master.peers=none" + @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -filer -filer.maxMB=64 -s3 -ip.bind 0.0.0.0 -dir=./test-volume-data -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 -volume.max=100 -volume.preStopSeconds=1 -master.port=$(MASTER_PORT) -volume.port=$(VOLUME_PORT) -filer.port=$(FILER_PORT) -s3.port=$(S3_PORT) -metricsPort=9329 -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -master.peers=none" @$(WEED_BINARY) server \ -filer \ -filer.maxMB=64 \ @@ -94,7 +94,6 @@ start-server: check-deps -filer.port=$(FILER_PORT) \ -s3.port=$(S3_PORT) \ -metricsPort=9329 \ - -s3.allowEmptyFolder=false \ -s3.allowDeleteBucketNotEmpty=true \ -s3.config=../../../docker/compose/s3.json \ -master.peers=none \ diff --git a/test/s3/versioning/Makefile b/test/s3/versioning/Makefile index ccf5e2092..91fd84fc1 100644 --- a/test/s3/versioning/Makefile +++ b/test/s3/versioning/Makefile @@ -81,12 +81,11 @@ start-server: check-deps @echo "🔍 DEBUG: Creating volume directory..." @mkdir -p ./test-volume-data @echo "🔍 DEBUG: Launching SeaweedFS server in background..." - @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -debug -s3 -s3.port=$(S3_PORT) -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -filer -filer.maxMB=64 -master.volumeSizeLimitMB=50 -volume.max=100 -dir=./test-volume-data -volume.preStopSeconds=1 -metricsPort=9324" + @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -debug -s3 -s3.port=$(S3_PORT) -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -filer -filer.maxMB=64 -master.volumeSizeLimitMB=50 -volume.max=100 -dir=./test-volume-data -volume.preStopSeconds=1 -metricsPort=9324" @$(WEED_BINARY) server \ -debug \ -s3 \ -s3.port=$(S3_PORT) \ - -s3.allowEmptyFolder=false \ -s3.allowDeleteBucketNotEmpty=true \ -s3.config=../../../docker/compose/s3.json \ -filer \ @@ -222,7 +221,7 @@ test-with-server: start-server test-versioning-with-configs: check-deps @echo "Testing with different S3 configurations..." @echo "Testing with empty folder allowed..." - @$(WEED_BINARY) server -s3 -s3.port=$(S3_PORT) -s3.allowEmptyFolder=true -filer -master.volumeSizeLimitMB=100 -volume.max=100 > weed-test-config1.log 2>&1 & echo $$! > weed-config1.pid + @$(WEED_BINARY) server -s3 -s3.port=$(S3_PORT) -filer -master.volumeSizeLimitMB=100 -volume.max=100 > weed-test-config1.log 2>&1 & echo $$! > weed-config1.pid @sleep 5 @go test -v -timeout=5m -run "TestVersioningBasicWorkflow" . 
|| true @if [ -f weed-config1.pid ]; then kill -TERM $$(cat weed-config1.pid) 2>/dev/null || true; rm -f weed-config1.pid; fi @@ -268,7 +267,6 @@ debug-server: -debug \ -s3 \ -s3.port=$(S3_PORT) \ - -s3.allowEmptyFolder=false \ -s3.allowDeleteBucketNotEmpty=true \ -s3.config=../../../docker/compose/s3.json \ -filer \ @@ -317,7 +315,6 @@ start-server-simple: check-deps -debug \ -s3 \ -s3.port=$(S3_PORT) \ - -s3.allowEmptyFolder=false \ -s3.allowDeleteBucketNotEmpty=true \ -s3.config=../../../docker/compose/s3.json \ -filer \ diff --git a/test/sftp/Makefile b/test/sftp/Makefile new file mode 100644 index 000000000..bc46dd3ce --- /dev/null +++ b/test/sftp/Makefile @@ -0,0 +1,41 @@ +.PHONY: all build test test-verbose test-short test-homedir test-debug clean deps tidy + +all: build test + +# Build the weed binary first +build: + cd ../../weed && go build -o weed . + +# Install test dependencies +deps: + go mod download + +# Run all tests +test: build deps + go test -timeout 5m ./... + +# Run tests with verbose output +test-verbose: build deps + go test -v -timeout 5m ./... + +# Run quick tests only (skip integration tests) +test-short: deps + go test -short -v ./... + +# Run specific test +test-homedir: build deps + go test -v -timeout 5m -run TestHomeDirPathTranslation ./... + +# Run tests with debug output from SeaweedFS +test-debug: build deps + go test -v -timeout 5m ./... 2>&1 | tee test.log + +# Clean up test artifacts +clean: + rm -f test.log + go clean -testcache + +# Update go.sum +tidy: + go mod tidy + diff --git a/test/sftp/README.md b/test/sftp/README.md new file mode 100644 index 000000000..17b5e67c7 --- /dev/null +++ b/test/sftp/README.md @@ -0,0 +1,92 @@ +# SeaweedFS SFTP Integration Tests + +This directory contains integration tests for the SeaweedFS SFTP server. + +## Prerequisites + +1. Build the SeaweedFS binary: + ```bash + cd ../../weed + go build -o weed . + ``` + +2. Ensure `ssh-keygen` is available (for generating test SSH host keys) + +## Running Tests + +### Run all tests +```bash +make test +``` + +### Run tests with verbose output +```bash +make test-verbose +``` + +### Run a specific test +```bash +go test -v -run TestHomeDirPathTranslation +``` + +### Skip long-running tests +```bash +go test -short ./... +``` + +## Test Structure + +- `framework.go` - Test framework that starts SeaweedFS cluster with SFTP +- `basic_test.go` - Basic SFTP operation tests including: + - HomeDir path translation (fixes issue #7470) + - File upload/download + - Directory operations + - Large file handling + - Edge cases + +## Test Configuration + +Tests use `testdata/userstore.json` which defines test users: + +| Username | Password | HomeDir | Permissions | +|----------|----------|---------|-------------| +| admin | adminpassword | / | Full access | +| testuser | testuserpassword | /sftp/testuser | Full access to home | +| readonly | readonlypassword | /public | Read-only | + +## Key Tests + +### TestHomeDirPathTranslation + +Tests the fix for [issue #7470](https://github.com/seaweedfs/seaweedfs/issues/7470) where +users with a non-root HomeDir (e.g., `/sftp/testuser`) could not upload files to `/` +because the path wasn't being translated to their home directory. + +The test verifies: +- Uploading to `/` correctly maps to the user's HomeDir +- Creating directories at `/` works +- Listing `/` shows the user's home directory contents +- All path operations respect the HomeDir translation + +## Debugging + +To debug test failures: + +1. 
Enable verbose output: + ```bash + go test -v -run TestName + ``` + +2. Keep test artifacts (don't clean up): + ```go + config := DefaultTestConfig() + config.SkipCleanup = true + ``` + +3. Enable debug logging: + ```go + config := DefaultTestConfig() + config.EnableDebug = true + ``` + + diff --git a/test/sftp/basic_test.go b/test/sftp/basic_test.go new file mode 100644 index 000000000..e5ffe90d1 --- /dev/null +++ b/test/sftp/basic_test.go @@ -0,0 +1,652 @@ +package sftp + +import ( + "bytes" + "io" + "path" + "testing" + + "github.com/stretchr/testify/require" +) + +// TestHomeDirPathTranslation tests that SFTP operations correctly translate +// paths relative to the user's HomeDir. +// This is the fix for https://github.com/seaweedfs/seaweedfs/issues/7470 +func TestHomeDirPathTranslation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + // Test with user "testuser" who has HomeDir="/sftp/testuser" + // When they upload to "/", it should actually go to "/sftp/testuser" + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Test 1: Upload file to "/" (should map to /sftp/testuser/) + t.Run("UploadToRoot", func(t *testing.T) { + testContent := []byte("Hello from SFTP test!") + filename := "test_upload.txt" + + // Create file at "/" from user's perspective + file, err := sftpClient.Create("/" + filename) + require.NoError(t, err, "should be able to create file at /") + + _, err = file.Write(testContent) + require.NoError(t, err, "should be able to write to file") + err = file.Close() + require.NoError(t, err, "should be able to close file") + + // Verify file exists and has correct content + readFile, err := sftpClient.Open("/" + filename) + require.NoError(t, err, "should be able to open file") + defer readFile.Close() + + content, err := io.ReadAll(readFile) + require.NoError(t, err, "should be able to read file") + require.Equal(t, testContent, content, "file content should match") + + // Clean up + err = sftpClient.Remove("/" + filename) + require.NoError(t, err, "should be able to remove file") + }) + + // Test 2: Create directory at "/" (should map to /sftp/testuser/) + t.Run("CreateDirAtRoot", func(t *testing.T) { + dirname := "test_dir" + + err := sftpClient.Mkdir("/" + dirname) + require.NoError(t, err, "should be able to create directory at /") + + // Verify directory exists + info, err := sftpClient.Stat("/" + dirname) + require.NoError(t, err, "should be able to stat directory") + require.True(t, info.IsDir(), "should be a directory") + + // Clean up + err = sftpClient.RemoveDirectory("/" + dirname) + require.NoError(t, err, "should be able to remove directory") + }) + + // Test 3: List directory at "/" (should list /sftp/testuser/) + t.Run("ListRoot", func(t *testing.T) { + // Create a test file first + testContent := []byte("list test content") + filename := "list_test.txt" + + file, err := sftpClient.Create("/" + filename) + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // List root directory + files, err := sftpClient.ReadDir("/") + require.NoError(t, err, "should be able to list root directory") + + // Should find
our test file + found := false + for _, f := range files { + if f.Name() == filename { + found = true + break + } + } + require.True(t, found, "should find test file in listing") + + // Clean up + err = sftpClient.Remove("/" + filename) + require.NoError(t, err) + }) + + // Test 4: Nested directory operations + t.Run("NestedOperations", func(t *testing.T) { + // Create nested directory structure + err := sftpClient.MkdirAll("/nested/dir/structure") + require.NoError(t, err, "should be able to create nested directories") + + // Create file in nested directory + testContent := []byte("nested file content") + file, err := sftpClient.Create("/nested/dir/structure/file.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // Verify file exists + readFile, err := sftpClient.Open("/nested/dir/structure/file.txt") + require.NoError(t, err) + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + require.Equal(t, testContent, content) + + // Clean up + err = sftpClient.Remove("/nested/dir/structure/file.txt") + require.NoError(t, err) + err = sftpClient.RemoveDirectory("/nested/dir/structure") + require.NoError(t, err) + err = sftpClient.RemoveDirectory("/nested/dir") + require.NoError(t, err) + err = sftpClient.RemoveDirectory("/nested") + require.NoError(t, err) + }) + + // Test 5: Rename operation + t.Run("RenameFile", func(t *testing.T) { + testContent := []byte("rename test content") + + file, err := sftpClient.Create("/original.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // Rename file + err = sftpClient.Rename("/original.txt", "/renamed.txt") + require.NoError(t, err, "should be able to rename file") + + // Verify old file doesn't exist + _, err = sftpClient.Stat("/original.txt") + require.Error(t, err, "original file should not exist") + + // Verify new file exists with correct content + readFile, err := sftpClient.Open("/renamed.txt") + require.NoError(t, err, "renamed file should exist") + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + require.Equal(t, testContent, content) + + // Clean up + err = sftpClient.Remove("/renamed.txt") + require.NoError(t, err) + }) +} + +// TestAdminRootAccess tests that admin user with HomeDir="/" can access everything +func TestAdminRootAccess(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + // Connect as admin with HomeDir="/" + sftpClient, sshConn, err := fw.ConnectSFTP("admin", "adminpassword") + require.NoError(t, err, "failed to connect as admin") + defer sshConn.Close() + defer sftpClient.Close() + + // Admin should be able to create directories anywhere + t.Run("CreateAnyDirectory", func(t *testing.T) { + // Create the user's home directory structure + err := sftpClient.MkdirAll("/sftp/testuser") + require.NoError(t, err, "admin should be able to create any directory") + + // Create file in that directory + testContent := []byte("admin created this") + file, err := sftpClient.Create("/sftp/testuser/admin_file.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // Verify file exists + info, err := 
sftpClient.Stat("/sftp/testuser/admin_file.txt") + require.NoError(t, err) + require.False(t, info.IsDir()) + + // Clean up + err = sftpClient.Remove("/sftp/testuser/admin_file.txt") + require.NoError(t, err) + }) +} + +// TestLargeFileUpload tests uploading larger files through SFTP +func TestLargeFileUpload(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create a 1MB file + t.Run("Upload1MB", func(t *testing.T) { + size := 1024 * 1024 // 1MB + testData := bytes.Repeat([]byte("A"), size) + + file, err := sftpClient.Create("/large_file.bin") + require.NoError(t, err) + n, err := file.Write(testData) + require.NoError(t, err) + require.Equal(t, size, n) + file.Close() + + // Verify file size + info, err := sftpClient.Stat("/large_file.bin") + require.NoError(t, err) + require.Equal(t, int64(size), info.Size()) + + // Verify content + readFile, err := sftpClient.Open("/large_file.bin") + require.NoError(t, err) + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + require.Equal(t, testData, content) + + // Clean up + err = sftpClient.Remove("/large_file.bin") + require.NoError(t, err) + }) +} + +// TestStatOperations tests Stat and Lstat operations +func TestStatOperations(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create a test file + testContent := []byte("stat test content") + file, err := sftpClient.Create("/stat_test.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + t.Run("StatFile", func(t *testing.T) { + info, err := sftpClient.Stat("/stat_test.txt") + require.NoError(t, err) + require.Equal(t, "stat_test.txt", info.Name()) + require.Equal(t, int64(len(testContent)), info.Size()) + require.False(t, info.IsDir()) + }) + + t.Run("StatDirectory", func(t *testing.T) { + err := sftpClient.Mkdir("/stat_dir") + require.NoError(t, err) + + info, err := sftpClient.Stat("/stat_dir") + require.NoError(t, err) + require.Equal(t, "stat_dir", info.Name()) + require.True(t, info.IsDir()) + + // Clean up + err = sftpClient.RemoveDirectory("/stat_dir") + require.NoError(t, err) + }) + + t.Run("StatRoot", func(t *testing.T) { + // Should be able to stat "/" which maps to user's home directory + info, err := sftpClient.Stat("/") + require.NoError(t, err, "should be able to stat root (home) directory") + require.True(t, info.IsDir(), "root should be a directory") + }) + + // Clean up + err = sftpClient.Remove("/stat_test.txt") + require.NoError(t, err) +} + +// TestWalk tests walking directory trees +func TestWalk(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in 
short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create directory structure + err = sftpClient.MkdirAll("/walk/a/b") + require.NoError(t, err) + err = sftpClient.MkdirAll("/walk/c") + require.NoError(t, err) + + // Create files + for _, p := range []string{"/walk/file1.txt", "/walk/a/file2.txt", "/walk/a/b/file3.txt", "/walk/c/file4.txt"} { + file, err := sftpClient.Create(p) + require.NoError(t, err) + file.Write([]byte("test")) + file.Close() + } + + t.Run("WalkEntireTree", func(t *testing.T) { + var paths []string + walker := sftpClient.Walk("/walk") + for walker.Step() { + if walker.Err() != nil { + continue + } + paths = append(paths, walker.Path()) + } + + // Should find all directories and files + require.Contains(t, paths, "/walk") + require.Contains(t, paths, "/walk/a") + require.Contains(t, paths, "/walk/a/b") + require.Contains(t, paths, "/walk/c") + }) + + // Clean up + for _, p := range []string{"/walk/file1.txt", "/walk/a/file2.txt", "/walk/a/b/file3.txt", "/walk/c/file4.txt"} { + require.NoError(t, sftpClient.Remove(p)) + } + for _, p := range []string{"/walk/a/b", "/walk/a", "/walk/c", "/walk"} { + require.NoError(t, sftpClient.RemoveDirectory(p)) + } +} + +// TestCurrentWorkingDirectory tests that Getwd and Chdir work correctly +func TestCurrentWorkingDirectory(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create test directory + err = sftpClient.Mkdir("/cwd_test") + require.NoError(t, err) + + t.Run("GetCurrentDir", func(t *testing.T) { + cwd, err := sftpClient.Getwd() + require.NoError(t, err) + // The initial working directory should be the user's home directory + // which from the user's perspective is "/" + require.Equal(t, "/", cwd, "initial working directory should be the virtual root") + }) + + t.Run("ChangeAndCreate", func(t *testing.T) { + // Create file in subdirectory using relative path after chdir + // Note: pkg/sftp doesn't support Chdir, so we test using absolute paths + file, err := sftpClient.Create("/cwd_test/relative_file.txt") + require.NoError(t, err) + file.Write([]byte("test")) + file.Close() + + // Verify using absolute path + _, err = sftpClient.Stat("/cwd_test/relative_file.txt") + require.NoError(t, err) + + // Clean up + sftpClient.Remove("/cwd_test/relative_file.txt") + }) + + // Clean up + err = sftpClient.RemoveDirectory("/cwd_test") + require.NoError(t, err) +} + +// TestPathEdgeCases tests various edge cases in path handling +func TestPathEdgeCases(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + 
require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + t.Run("PathWithDotDot", func(t *testing.T) { + // Create directory structure + err := sftpClient.MkdirAll("/edge/subdir") + require.NoError(t, err) + + // Create file using path with .. + file, err := sftpClient.Create("/edge/subdir/../file.txt") + require.NoError(t, err) + file.Write([]byte("test")) + file.Close() + + // Verify file was created in /edge + _, err = sftpClient.Stat("/edge/file.txt") + require.NoError(t, err, "file should be created in parent directory") + + // Clean up + sftpClient.Remove("/edge/file.txt") + sftpClient.RemoveDirectory("/edge/subdir") + sftpClient.RemoveDirectory("/edge") + }) + + t.Run("PathWithTrailingSlash", func(t *testing.T) { + err := sftpClient.Mkdir("/trailing") + require.NoError(t, err) + + // Stat with trailing slash + info, err := sftpClient.Stat("/trailing/") + require.NoError(t, err) + require.True(t, info.IsDir()) + + // Clean up + sftpClient.RemoveDirectory("/trailing") + }) + + t.Run("CreateFileAtRootPath", func(t *testing.T) { + // This is the exact scenario from issue #7470 + // User with HomeDir="/sftp/testuser" uploads to "/" + file, err := sftpClient.Create("/issue7470.txt") + require.NoError(t, err, "should be able to create file at / (issue #7470)") + file.Write([]byte("This tests the fix for issue #7470")) + file.Close() + + // Verify + _, err = sftpClient.Stat("/issue7470.txt") + require.NoError(t, err) + + // Clean up + sftpClient.Remove("/issue7470.txt") + }) + + // Security test: path traversal attacks should be blocked + t.Run("PathTraversalPrevention", func(t *testing.T) { + // User's HomeDir is "/sftp/testuser" + // Attempting to escape via "../.." should NOT create files outside home directory + + // First, create a valid file to ensure we can write + validFile, err := sftpClient.Create("/valid.txt") + require.NoError(t, err) + validFile.Write([]byte("valid")) + validFile.Close() + + // Try various path traversal attempts + // These should either: + // 1. Be blocked (error returned), OR + // 2. Be safely resolved to stay within home directory + + traversalPaths := []string{ + "/../escape.txt", + "/../../escape.txt", + "/../../../escape.txt", + "/subdir/../../escape.txt", + "/./../../escape.txt", + } + + for _, traversalPath := range traversalPaths { + t.Run(traversalPath, func(t *testing.T) { + // Note: The pkg/sftp client sanitizes paths locally before sending them to the server. + // So "/../escape.txt" becomes "/escape.txt" on the wire. + // Therefore, we cannot trigger the server-side path traversal block with this client. + // Instead, we verify that the file is created successfully within the jail (contained). + // The server-side protection logic is verified in unit tests (sftpd/sftp_server_test.go). 
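+ // For example, the client's local cleanup turns "/subdir/../../escape.txt" into "/escape.txt" before the request is sent, so the write lands safely inside the jail.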
+ + file, err := sftpClient.Create(traversalPath) + require.NoError(t, err, "creation should succeed because client sanitizes path") + file.Close() + + // Clean up + err = sftpClient.Remove(traversalPath) + require.NoError(t, err) + }) + } + + // Clean up + sftpClient.Remove("/valid.txt") + }) +} + +// TestFileContent tests reading and writing file content correctly +func TestFileContent(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + t.Run("BinaryContent", func(t *testing.T) { + // Create binary data with all byte values + data := make([]byte, 256) + for i := 0; i < 256; i++ { + data[i] = byte(i) + } + + file, err := sftpClient.Create("/binary.bin") + require.NoError(t, err) + n, err := file.Write(data) + require.NoError(t, err) + require.Equal(t, 256, n) + file.Close() + + // Read back + readFile, err := sftpClient.Open("/binary.bin") + require.NoError(t, err) + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + + require.Equal(t, data, content, "binary content should match") + + // Clean up + sftpClient.Remove("/binary.bin") + }) + + t.Run("EmptyFile", func(t *testing.T) { + file, err := sftpClient.Create("/empty.txt") + require.NoError(t, err) + file.Close() + + info, err := sftpClient.Stat("/empty.txt") + require.NoError(t, err) + require.Equal(t, int64(0), info.Size()) + + // Clean up + sftpClient.Remove("/empty.txt") + }) + + t.Run("UnicodeFilename", func(t *testing.T) { + filename := "/文件名.txt" + content := []byte("Unicode content: 你好世界") + + file, err := sftpClient.Create(filename) + require.NoError(t, err) + file.Write(content) + file.Close() + + // Read back + readFile, err := sftpClient.Open(filename) + require.NoError(t, err) + readContent, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + + require.Equal(t, content, readContent) + + // Verify in listing + files, err := sftpClient.ReadDir("/") + require.NoError(t, err) + found := false + for _, f := range files { + if f.Name() == path.Base(filename) { + found = true + break + } + } + require.True(t, found, "should find unicode filename in listing") + + // Clean up + sftpClient.Remove(filename) + }) +} + diff --git a/test/sftp/framework.go b/test/sftp/framework.go new file mode 100644 index 000000000..5572eac28 --- /dev/null +++ b/test/sftp/framework.go @@ -0,0 +1,423 @@ +package sftp + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "runtime" + "syscall" + "testing" + "time" + + "github.com/pkg/sftp" + "github.com/stretchr/testify/require" + "golang.org/x/crypto/ssh" +) + +// SftpTestFramework provides utilities for SFTP integration testing +type SftpTestFramework struct { + t *testing.T + tempDir string + dataDir string + masterProcess *os.Process + volumeProcess *os.Process + filerProcess *os.Process + sftpProcess *os.Process + masterAddr string + volumeAddr string + filerAddr string + sftpAddr string + weedBinary string + userStoreFile string + hostKeyFile string + isSetup bool + skipCleanup bool +} + +// TestConfig holds configuration for SFTP tests +type TestConfig struct { + NumVolumes int + 
EnableDebug bool + SkipCleanup bool // for debugging failed tests + UserStoreFile string +} + +// DefaultTestConfig returns a default configuration for SFTP tests +func DefaultTestConfig() *TestConfig { + return &TestConfig{ + NumVolumes: 3, + EnableDebug: false, + SkipCleanup: false, + UserStoreFile: "", + } +} + +// NewSftpTestFramework creates a new SFTP testing framework +func NewSftpTestFramework(t *testing.T, config *TestConfig) *SftpTestFramework { + if config == nil { + config = DefaultTestConfig() + } + + tempDir, err := os.MkdirTemp("", "seaweedfs_sftp_test_") + require.NoError(t, err) + + // Generate SSH host key for SFTP server + hostKeyFile := filepath.Join(tempDir, "ssh_host_key") + cmd := exec.Command("ssh-keygen", "-t", "ed25519", "-f", hostKeyFile, "-N", "") + err = cmd.Run() + require.NoError(t, err, "failed to generate SSH host key") + + // Use provided userstore or copy the test one + userStoreFile := config.UserStoreFile + if userStoreFile == "" { + // Copy test userstore to temp dir + userStoreFile = filepath.Join(tempDir, "userstore.json") + testDataPath := findTestDataPath() + input, err := os.ReadFile(filepath.Join(testDataPath, "userstore.json")) + require.NoError(t, err, "failed to read test userstore.json") + err = os.WriteFile(userStoreFile, input, 0644) + require.NoError(t, err, "failed to write userstore.json") + } + + return &SftpTestFramework{ + t: t, + tempDir: tempDir, + dataDir: filepath.Join(tempDir, "data"), + masterAddr: "127.0.0.1:19333", + volumeAddr: "127.0.0.1:18080", + filerAddr: "127.0.0.1:18888", + sftpAddr: "127.0.0.1:12022", + weedBinary: findWeedBinary(), + userStoreFile: userStoreFile, + hostKeyFile: hostKeyFile, + isSetup: false, + } +} + +// Setup starts SeaweedFS cluster with SFTP server +func (f *SftpTestFramework) Setup(config *TestConfig) error { + if f.isSetup { + return fmt.Errorf("framework already setup") + } + + // Create all data directories + dirs := []string{ + f.dataDir, + filepath.Join(f.dataDir, "master"), + filepath.Join(f.dataDir, "volume"), + } + for _, dir := range dirs { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory %s: %v", dir, err) + } + } + + // Start master + if err := f.startMaster(config); err != nil { + return fmt.Errorf("failed to start master: %v", err) + } + + // Wait for master to be ready + if err := f.waitForService(f.masterAddr, 30*time.Second); err != nil { + return fmt.Errorf("master not ready: %v", err) + } + + // Start volume server + if err := f.startVolumeServer(config); err != nil { + return fmt.Errorf("failed to start volume server: %v", err) + } + + // Wait for volume server to be ready + if err := f.waitForService(f.volumeAddr, 30*time.Second); err != nil { + return fmt.Errorf("volume server not ready: %v", err) + } + + // Start filer + if err := f.startFiler(config); err != nil { + return fmt.Errorf("failed to start filer: %v", err) + } + + // Wait for filer to be ready + if err := f.waitForService(f.filerAddr, 30*time.Second); err != nil { + return fmt.Errorf("filer not ready: %v", err) + } + + // Start SFTP server + if err := f.startSftpServer(config); err != nil { + return fmt.Errorf("failed to start SFTP server: %v", err) + } + + // Wait for SFTP server to be ready + if err := f.waitForService(f.sftpAddr, 30*time.Second); err != nil { + return fmt.Errorf("SFTP server not ready: %v", err) + } + + // Additional wait for all services to stabilize (gRPC endpoints) + time.Sleep(500 * time.Millisecond) + + f.skipCleanup = config.SkipCleanup + 
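// Mark the framework ready only after every service has accepted a TCP connection above. +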
f.isSetup = true + return nil +} + +// Cleanup stops all processes and removes temporary files +func (f *SftpTestFramework) Cleanup() { + // Stop processes in reverse order + processes := []*os.Process{f.sftpProcess, f.filerProcess, f.volumeProcess, f.masterProcess} + for _, proc := range processes { + if proc != nil { + proc.Signal(syscall.SIGTERM) + proc.Wait() + } + } + + // Remove temp directory + if !f.skipCleanup { + os.RemoveAll(f.tempDir) + } +} + +// GetSftpAddr returns the SFTP server address +func (f *SftpTestFramework) GetSftpAddr() string { + return f.sftpAddr +} + +// GetFilerAddr returns the filer address +func (f *SftpTestFramework) GetFilerAddr() string { + return f.filerAddr +} + +// ConnectSFTP creates an SFTP client connection with the given credentials +func (f *SftpTestFramework) ConnectSFTP(username, password string) (*sftp.Client, *ssh.Client, error) { + // Load the known host public key for verification + hostKeyCallback, err := f.getHostKeyCallback() + if err != nil { + return nil, nil, fmt.Errorf("failed to get host key callback: %v", err) + } + + config := &ssh.ClientConfig{ + User: username, + Auth: []ssh.AuthMethod{ + ssh.Password(password), + }, + HostKeyCallback: hostKeyCallback, + Timeout: 5 * time.Second, + } + + sshConn, err := ssh.Dial("tcp", f.sftpAddr, config) + if err != nil { + return nil, nil, fmt.Errorf("failed to connect SSH: %v", err) + } + + sftpClient, err := sftp.NewClient(sshConn) + if err != nil { + sshConn.Close() + return nil, nil, fmt.Errorf("failed to create SFTP client: %v", err) + } + + return sftpClient, sshConn, nil +} + +// getHostKeyCallback returns a callback that verifies the server's host key +// matches the known test server key we generated +func (f *SftpTestFramework) getHostKeyCallback() (ssh.HostKeyCallback, error) { + // Read the public key file generated alongside the private key + pubKeyFile := f.hostKeyFile + ".pub" + pubKeyBytes, err := os.ReadFile(pubKeyFile) + if err != nil { + return nil, fmt.Errorf("failed to read host public key: %v", err) + } + + // Parse the public key + pubKey, _, _, _, err := ssh.ParseAuthorizedKey(pubKeyBytes) + if err != nil { + return nil, fmt.Errorf("failed to parse host public key: %v", err) + } + + // Return a callback that verifies the server key matches our known key + return ssh.FixedHostKey(pubKey), nil +} + +// startMaster starts the SeaweedFS master server +func (f *SftpTestFramework) startMaster(config *TestConfig) error { + args := []string{ + "master", + "-ip=127.0.0.1", + "-port=19333", + "-mdir=" + filepath.Join(f.dataDir, "master"), + "-raftBootstrap", + "-peers=none", + } + + cmd := exec.Command(f.weedBinary, args...) + cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.masterProcess = cmd.Process + return nil +} + +// startVolumeServer starts SeaweedFS volume server +func (f *SftpTestFramework) startVolumeServer(config *TestConfig) error { + args := []string{ + "volume", + "-mserver=" + f.masterAddr, + "-ip=127.0.0.1", + "-port=18080", + "-dir=" + filepath.Join(f.dataDir, "volume"), + fmt.Sprintf("-max=%d", config.NumVolumes), + } + + cmd := exec.Command(f.weedBinary, args...) 
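+ // Run from the temp dir so anything the process writes relative to its cwd stays inside the test sandbox.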
+ cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.volumeProcess = cmd.Process + return nil +} + +// startFiler starts the SeaweedFS filer server +func (f *SftpTestFramework) startFiler(config *TestConfig) error { + args := []string{ + "filer", + "-master=" + f.masterAddr, + "-ip=127.0.0.1", + "-port=18888", + } + + cmd := exec.Command(f.weedBinary, args...) + cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.filerProcess = cmd.Process + return nil +} + +// startSftpServer starts the SeaweedFS SFTP server +func (f *SftpTestFramework) startSftpServer(config *TestConfig) error { + args := []string{ + "sftp", + "-filer=" + f.filerAddr, + "-ip.bind=127.0.0.1", + "-port=12022", + "-sshPrivateKey=" + f.hostKeyFile, + "-userStoreFile=" + f.userStoreFile, + } + + cmd := exec.Command(f.weedBinary, args...) + cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.sftpProcess = cmd.Process + return nil +} + +// waitForService waits for a service to be available +func (f *SftpTestFramework) waitForService(addr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + conn, err := net.DialTimeout("tcp", addr, 1*time.Second) + if err == nil { + conn.Close() + return nil + } + time.Sleep(100 * time.Millisecond) + } + return fmt.Errorf("service at %s not ready within timeout", addr) +} + +// findWeedBinary locates the weed binary +// Prefers local build over system-installed weed to ensure we test the latest code +func findWeedBinary() string { + // Get the directory where this source file is located + // This ensures we find the locally built weed binary first + _, thisFile, _, ok := runtime.Caller(0) + if ok { + thisDir := filepath.Dir(thisFile) + // From test/sftp/, the weed binary should be at ../../weed/weed + candidates := []string{ + filepath.Join(thisDir, "../../weed/weed"), + filepath.Join(thisDir, "../weed/weed"), + } + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + abs, _ := filepath.Abs(candidate) + return abs + } + } + } + + // Try relative paths from current working directory + cwd, _ := os.Getwd() + candidates := []string{ + filepath.Join(cwd, "../../weed/weed"), + filepath.Join(cwd, "../weed/weed"), + filepath.Join(cwd, "./weed"), + } + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + abs, _ := filepath.Abs(candidate) + return abs + } + } + + // Fallback to PATH only if local build not found + if path, err := exec.LookPath("weed"); err == nil { + return path + } + + // Default fallback + return "weed" +} + +// findTestDataPath locates the testdata directory +func findTestDataPath() string { + // Get the directory where this source file is located + _, thisFile, _, ok := runtime.Caller(0) + if ok { + thisDir := filepath.Dir(thisFile) + testDataPath := filepath.Join(thisDir, "testdata") + if _, err := os.Stat(testDataPath); err == nil { + return testDataPath + } + } + + // Try relative paths from current working directory + cwd, _ := os.Getwd() + candidates := []string{ + filepath.Join(cwd, "testdata"), + filepath.Join(cwd, "../sftp/testdata"), + filepath.Join(cwd, "test/sftp/testdata"), + } + + for _, candidate := range candidates { + if 
_, err := os.Stat(candidate); err == nil { + return candidate + } + } + + return "./testdata" +} + diff --git a/test/sftp/go.mod b/test/sftp/go.mod new file mode 100644 index 000000000..34d9053a8 --- /dev/null +++ b/test/sftp/go.mod @@ -0,0 +1,17 @@ +module seaweedfs-sftp-tests + +go 1.24.0 + +require ( + github.com/pkg/sftp v1.13.7 + github.com/stretchr/testify v1.10.0 + golang.org/x/crypto v0.45.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/kr/fs v0.1.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + golang.org/x/sys v0.38.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/test/sftp/go.sum b/test/sftp/go.sum new file mode 100644 index 000000000..112e6f88a --- /dev/null +++ b/test/sftp/go.sum @@ -0,0 +1,64 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/fs v0.1.0 h1:Jskdu9ieNAYnjxsi0LbQp1ulIKZV1LAFgK1tWhpZgl8= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/pkg/sftp v1.13.7 h1:uv+I3nNJvlKZIQGSr8JVQLNHFU9YhhNpvC14Y6KgmSM= +github.com/pkg/sftp v1.13.7/go.mod h1:KMKI0t3T6hfA+lTR/ssZdunHo+uwq7ghoN09/FSu3DY= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync 
v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/test/sftp/testdata/userstore.json b/test/sftp/testdata/userstore.json new file mode 100644 index 000000000..66d78dd1d --- /dev/null +++ b/test/sftp/testdata/userstore.json @@ -0,0 +1,37 @@ +[ + { + "Username": "admin", + "Password": "adminpassword", + "PublicKeys": [], + "HomeDir": "/", + "Permissions": { + "/": ["*"] + }, + "Uid": 0, + "Gid": 0 + }, + { + "Username": "testuser", + "Password": "testuserpassword", + "PublicKeys": [], + "HomeDir": "/sftp/testuser", 
+ "Permissions": { + "/sftp/testuser": ["*"] + }, + "Uid": 1001, + "Gid": 1001 + }, + { + "Username": "readonly", + "Password": "readonlypassword", + "PublicKeys": [], + "HomeDir": "/public", + "Permissions": { + "/public": ["read", "list"] + }, + "Uid": 1002, + "Gid": 1002 + } +] + + diff --git a/weed/admin/dash/admin_server.go b/weed/admin/dash/admin_server.go index 4ce357502..c499ca8fe 100644 --- a/weed/admin/dash/admin_server.go +++ b/weed/admin/dash/admin_server.go @@ -99,28 +99,22 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string) // Continue without credential manager - will fall back to legacy approach } else { server.credentialManager = credentialManager + glog.V(0).Infof("Credential manager initialized with store type: %s", credentialManager.GetStore().GetName()) - // For stores that need filer address function, set them + // For stores that need filer address function, configure them if store := credentialManager.GetStore(); store != nil { if filerFuncSetter, ok := store.(interface { SetFilerAddressFunc(func() pb.ServerAddress, grpc.DialOption) }); ok { - // Set up a goroutine to configure filer address function once we discover filers - go func() { - for { - filerAddr := server.GetFilerAddress() - if filerAddr != "" { - // Configure the function to dynamically return the current active filer (HA-aware) - filerFuncSetter.SetFilerAddressFunc(func() pb.ServerAddress { - return pb.ServerAddress(server.GetFilerAddress()) - }, server.grpcDialOption) - glog.V(1).Infof("Set filer address function for credential manager: %s", filerAddr) - break - } - glog.V(1).Infof("Waiting for filer discovery for credential manager...") - time.Sleep(5 * time.Second) - } - }() + // Configure the filer address function to dynamically return the current active filer + // This function will be called each time credentials need to be loaded/saved, + // so it will automatically use whatever filer is currently available (HA-aware) + filerFuncSetter.SetFilerAddressFunc(func() pb.ServerAddress { + return pb.ServerAddress(server.GetFilerAddress()) + }, server.grpcDialOption) + glog.V(0).Infof("Credential store configured with dynamic filer address function") + } else { + glog.V(0).Infof("Credential store %s does not support filer address function", store.GetName()) } } } diff --git a/weed/admin/handlers/file_browser_handlers.go b/weed/admin/handlers/file_browser_handlers.go index a0427e39f..eeb8e2d85 100644 --- a/weed/admin/handlers/file_browser_handlers.go +++ b/weed/admin/handlers/file_browser_handlers.go @@ -5,10 +5,12 @@ import ( "context" "fmt" "io" + "mime" "mime/multipart" "net" "net/http" "os" + "path" "path/filepath" "strconv" "strings" @@ -20,15 +22,37 @@ import ( "github.com/seaweedfs/seaweedfs/weed/admin/view/layout" "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb" + "github.com/seaweedfs/seaweedfs/weed/util" + "github.com/seaweedfs/seaweedfs/weed/util/http/client" ) type FileBrowserHandlers struct { adminServer *dash.AdminServer + httpClient *client.HTTPClient } func NewFileBrowserHandlers(adminServer *dash.AdminServer) *FileBrowserHandlers { + // Create HTTP client with TLS support from https.client configuration + // The client is created without a timeout - each operation will set its own timeout + // If TLS is enabled but misconfigured, fail fast to alert the operator immediately + // rather than silently falling back to HTTP and causing confusing runtime errors + httpClient, err := client.NewHttpClient(client.Client) + 
if err != nil { + glog.Fatalf("Failed to create HTTPS client for file browser: %v", err) + } + return &FileBrowserHandlers{ adminServer: adminServer, + httpClient: httpClient, + } +} + +// newClientWithTimeout creates a temporary http.Client with the specified timeout, +// reusing the TLS transport from the shared httpClient. +func (h *FileBrowserHandlers) newClientWithTimeout(timeout time.Duration) http.Client { + return http.Client{ + Transport: h.httpClient.Client.Transport, + Timeout: timeout, } } @@ -245,8 +269,12 @@ func (h *FileBrowserHandlers) UploadFile(c *gin.Context) { continue } - // Create full path for the file - fullPath := filepath.Join(currentPath, fileName) + // Normalize Windows-style backslashes to forward slashes + fileName = util.CleanWindowsPath(fileName) + + // Create full path for the file using path.Join for URL path semantics + // path.Join handles double slashes and is not OS-specific like filepath.Join + fullPath := path.Join(currentPath, fileName) if !strings.HasPrefix(fullPath, "/") { fullPath = "/" + fullPath } @@ -327,8 +355,10 @@ func (h *FileBrowserHandlers) uploadFileToFiler(filePath string, fileHeader *mul var body bytes.Buffer writer := multipart.NewWriter(&body) - // Create form file field - part, err := writer.CreateFormFile("file", fileHeader.Filename) + // Create form file field with normalized base filename + // Use path.Base (not filepath.Base) since cleanFilePath uses URL path semantics + baseFileName := path.Base(cleanFilePath) + part, err := writer.CreateFormFile("file", baseFileName) if err != nil { return fmt.Errorf("failed to create form file: %w", err) } @@ -345,8 +375,15 @@ func (h *FileBrowserHandlers) uploadFileToFiler(filePath string, fileHeader *mul return fmt.Errorf("failed to close multipart writer: %w", err) } - // Create the upload URL with validated components - uploadURL := fmt.Sprintf("http://%s%s", filerAddress, cleanFilePath) + // Create the upload URL - the httpClient will normalize to the correct scheme (http/https) + // based on the https.client configuration in security.toml + uploadURL := fmt.Sprintf("%s%s", filerAddress, cleanFilePath) + + // Normalize the URL scheme based on TLS configuration + uploadURL, err = h.httpClient.NormalizeHttpScheme(uploadURL) + if err != nil { + return fmt.Errorf("failed to normalize URL scheme: %w", err) + } // Create HTTP request req, err := http.NewRequest("POST", uploadURL, &body) @@ -357,11 +394,11 @@ func (h *FileBrowserHandlers) uploadFileToFiler(filePath string, fileHeader *mul // Set content type with boundary req.Header.Set("Content-Type", writer.FormDataContentType()) - // Send request - client := &http.Client{Timeout: 60 * time.Second} // Increased timeout for larger files + // Send request using TLS-aware HTTP client with 60s timeout for large file uploads // lgtm[go/ssrf] // Safe: filerAddress validated by validateFilerAddress() to match configured filer // Safe: cleanFilePath validated and cleaned by validateAndCleanFilePath() to prevent path traversal + client := h.newClientWithTimeout(60 * time.Second) resp, err := client.Do(req) if err != nil { return fmt.Errorf("failed to upload file: %w", err) @@ -423,8 +460,12 @@ func (h *FileBrowserHandlers) validateAndCleanFilePath(filePath string) (string, return "", fmt.Errorf("file path cannot be empty") } + // Normalize Windows-style backslashes to forward slashes + filePath = util.CleanWindowsPath(filePath) + // Clean the path to remove any .. or . 
components - cleanPath := filepath.Clean(filePath) + // Use path.Clean (not filepath.Clean) since this is a URL path + cleanPath := path.Clean(filePath) // Ensure the path starts with / if !strings.HasPrefix(cleanPath, "/") { @@ -444,7 +485,57 @@ func (h *FileBrowserHandlers) validateAndCleanFilePath(filePath string) (string, return cleanPath, nil } -// DownloadFile handles file download requests +// fetchFileContent fetches file content from the filer and returns the content or an error. +func (h *FileBrowserHandlers) fetchFileContent(filePath string, timeout time.Duration) (string, error) { + filerAddress := h.adminServer.GetFilerAddress() + if filerAddress == "" { + return "", fmt.Errorf("filer address not configured") + } + + if err := h.validateFilerAddress(filerAddress); err != nil { + return "", fmt.Errorf("invalid filer address configuration: %w", err) + } + + cleanFilePath, err := h.validateAndCleanFilePath(filePath) + if err != nil { + return "", err + } + + // Create the file URL with proper scheme based on TLS configuration + fileURL := fmt.Sprintf("%s%s", filerAddress, cleanFilePath) + fileURL, err = h.httpClient.NormalizeHttpScheme(fileURL) + if err != nil { + return "", fmt.Errorf("failed to construct file URL: %w", err) + } + + // lgtm[go/ssrf] + // Safe: filerAddress validated by validateFilerAddress() to match configured filer + // Safe: cleanFilePath validated and cleaned by validateAndCleanFilePath() to prevent path traversal + client := h.newClientWithTimeout(timeout) + resp, err := client.Get(fileURL) + if err != nil { + return "", fmt.Errorf("failed to fetch file from filer: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("filer returned status %d but failed to read response body: %w", resp.StatusCode, err) + } + return "", fmt.Errorf("filer returned status %d: %s", resp.StatusCode, string(body)) + } + + contentBytes, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to read file content: %w", err) + } + + return string(contentBytes), nil +} + +// DownloadFile handles file download requests by proxying through the Admin UI server +// This ensures mTLS works correctly since the Admin UI server has the client certificates func (h *FileBrowserHandlers) DownloadFile(c *gin.Context) { filePath := c.Query("path") if filePath == "" { @@ -459,6 +550,12 @@ func (h *FileBrowserHandlers) DownloadFile(c *gin.Context) { return } + // Validate filer address to prevent SSRF + if err := h.validateFilerAddress(filerAddress); err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Invalid filer address configuration"}) + return + } + // Validate and sanitize the file path cleanFilePath, err := h.validateAndCleanFilePath(filePath) if err != nil { @@ -466,16 +563,66 @@ func (h *FileBrowserHandlers) DownloadFile(c *gin.Context) { return } - // Create the download URL - downloadURL := fmt.Sprintf("http://%s%s", filerAddress, cleanFilePath) + // Create the download URL with proper scheme based on TLS configuration + downloadURL := fmt.Sprintf("%s%s", filerAddress, cleanFilePath) + downloadURL, err = h.httpClient.NormalizeHttpScheme(downloadURL) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to construct download URL: " + err.Error()}) + return + } + + // Proxy the download through the Admin UI server to support mTLS + // lgtm[go/ssrf] + // Safe: filerAddress validated by validateFilerAddress() to match 
configured filer + // Safe: cleanFilePath validated and cleaned by validateAndCleanFilePath() to prevent path traversal + // Use request context so download is cancelled when client disconnects + req, err := http.NewRequestWithContext(c.Request.Context(), "GET", downloadURL, nil) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create request: " + err.Error()}) + return + } + client := h.newClientWithTimeout(5 * time.Minute) // Longer timeout for large file downloads + resp, err := client.Do(req) + if err != nil { + c.JSON(http.StatusBadGateway, gin.H{"error": "Failed to fetch file from filer: " + err.Error()}) + return + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, err := io.ReadAll(resp.Body) + if err != nil { + c.JSON(resp.StatusCode, gin.H{"error": fmt.Sprintf("Filer returned status %d but failed to read response body: %v", resp.StatusCode, err)}) + return + } + c.JSON(resp.StatusCode, gin.H{"error": fmt.Sprintf("Filer returned status %d: %s", resp.StatusCode, string(body))}) + return + } // Set headers for file download fileName := filepath.Base(cleanFilePath) - c.Header("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", fileName)) - c.Header("Content-Type", "application/octet-stream") + // Use mime.FormatMediaType for RFC 6266 compliant Content-Disposition, + // properly handling non-ASCII characters and special characters + c.Header("Content-Disposition", mime.FormatMediaType("attachment", map[string]string{"filename": fileName})) + + // Use content type from filer response, or default to octet-stream + contentType := resp.Header.Get("Content-Type") + if contentType == "" { + contentType = "application/octet-stream" + } + c.Header("Content-Type", contentType) + + // Set content length if available + if resp.ContentLength > 0 { + c.Header("Content-Length", fmt.Sprintf("%d", resp.ContentLength)) + } - // Proxy the request to filer - c.Redirect(http.StatusFound, downloadURL) + // Stream the response body to the client + c.Status(http.StatusOK) + _, err = io.Copy(c.Writer, resp.Body) + if err != nil { + glog.Errorf("Error streaming file download: %v", err) + } } // ViewFile handles file viewing requests (for text files, images, etc.) 
@@ -559,46 +706,13 @@ func (h *FileBrowserHandlers) ViewFile(c *gin.Context) { viewable = false reason = "File too large for viewing (>1MB)" } else { - // Get file content from filer - filerAddress := h.adminServer.GetFilerAddress() - if filerAddress != "" { - // Validate filer address to prevent SSRF - if err := h.validateFilerAddress(filerAddress); err != nil { - viewable = false - reason = "Invalid filer address configuration" - } else { - cleanFilePath, err := h.validateAndCleanFilePath(filePath) - if err == nil { - fileURL := fmt.Sprintf("http://%s%s", filerAddress, cleanFilePath) - - client := &http.Client{Timeout: 30 * time.Second} - // lgtm[go/ssrf] - // Safe: filerAddress validated by validateFilerAddress() to match configured filer - // Safe: cleanFilePath validated and cleaned by validateAndCleanFilePath() to prevent path traversal - resp, err := client.Get(fileURL) - if err == nil && resp.StatusCode == http.StatusOK { - defer resp.Body.Close() - contentBytes, err := io.ReadAll(resp.Body) - if err == nil { - content = string(contentBytes) - viewable = true - } else { - viewable = false - reason = "Failed to read file content" - } - } else { - viewable = false - reason = "Failed to fetch file from filer" - } - } else { - viewable = false - reason = "Invalid file path" - } - } - } else { - viewable = false - reason = "Filer address not configured" + // Fetch file content from filer + var err error + content, err = h.fetchFileContent(filePath, 30*time.Second) + if err != nil { + reason = err.Error() } + viewable = (err == nil) } } else { // Not a text file, but might be viewable as image or PDF @@ -893,18 +1007,28 @@ func (h *FileBrowserHandlers) isLikelyTextFile(filePath string, maxCheckSize int return false } - fileURL := fmt.Sprintf("http://%s%s", filerAddress, cleanFilePath) + // Create the file URL with proper scheme based on TLS configuration + fileURL := fmt.Sprintf("%s%s", filerAddress, cleanFilePath) + fileURL, err = h.httpClient.NormalizeHttpScheme(fileURL) + if err != nil { + glog.Errorf("Failed to normalize URL scheme: %v", err) + return false + } - client := &http.Client{Timeout: 10 * time.Second} // lgtm[go/ssrf] // Safe: filerAddress validated by validateFilerAddress() to match configured filer // Safe: cleanFilePath validated and cleaned by validateAndCleanFilePath() to prevent path traversal + client := h.newClientWithTimeout(10 * time.Second) resp, err := client.Get(fileURL) - if err != nil || resp.StatusCode != http.StatusOK { + if err != nil { return false } defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return false + } + // Read first few bytes to check if it's text buffer := make([]byte, min(maxCheckSize, 512)) n, err := resp.Body.Read(buffer) diff --git a/weed/command/filer.go b/weed/command/filer.go index 86991a181..0e3154819 100644 --- a/weed/command/filer.go +++ b/weed/command/filer.go @@ -122,13 +122,13 @@ func init() { filerS3Options.tlsCertificate = cmdFiler.Flag.String("s3.cert.file", "", "path to the TLS certificate file") filerS3Options.config = cmdFiler.Flag.String("s3.config", "", "path to the config file") filerS3Options.auditLogConfig = cmdFiler.Flag.String("s3.auditLogConfig", "", "path to the audit log config file") - filerS3Options.allowEmptyFolder = cmdFiler.Flag.Bool("s3.allowEmptyFolder", true, "allow empty folders") + cmdFiler.Flag.Bool("s3.allowEmptyFolder", true, "deprecated, ignored. 
Empty folder cleanup is now automatic.") filerS3Options.allowDeleteBucketNotEmpty = cmdFiler.Flag.Bool("s3.allowDeleteBucketNotEmpty", true, "allow recursive deleting all entries along with bucket") filerS3Options.localSocket = cmdFiler.Flag.String("s3.localSocket", "", "default to /tmp/seaweedfs-s3-.sock") filerS3Options.tlsCACertificate = cmdFiler.Flag.String("s3.cacert.file", "", "path to the TLS CA certificate file") filerS3Options.tlsVerifyClientCert = cmdFiler.Flag.Bool("s3.tlsVerifyClientCert", false, "whether to verify the client's certificate") filerS3Options.bindIp = cmdFiler.Flag.String("s3.ip.bind", "", "ip address to bind to. If empty, default to same as -ip.bind option.") - filerS3Options.idleTimeout = cmdFiler.Flag.Int("s3.idleTimeout", 10, "connection idle seconds") + filerS3Options.idleTimeout = cmdFiler.Flag.Int("s3.idleTimeout", 120, "connection idle seconds") filerS3Options.concurrentUploadLimitMB = cmdFiler.Flag.Int("s3.concurrentUploadLimitMB", 128, "limit total concurrent upload size for S3") filerS3Options.concurrentFileUploadLimit = cmdFiler.Flag.Int("s3.concurrentFileUploadLimit", 0, "limit number of concurrent file uploads for S3, 0 means unlimited") diff --git a/weed/command/s3.go b/weed/command/s3.go index 61222336b..5fb34155b 100644 --- a/weed/command/s3.go +++ b/weed/command/s3.go @@ -49,7 +49,6 @@ type S3Options struct { tlsVerifyClientCert *bool metricsHttpPort *int metricsHttpIp *string - allowEmptyFolder *bool allowDeleteBucketNotEmpty *bool auditLogConfig *string localFilerSocket *string @@ -80,11 +79,11 @@ func init() { s3StandaloneOptions.tlsVerifyClientCert = cmdS3.Flag.Bool("tlsVerifyClientCert", false, "whether to verify the client's certificate") s3StandaloneOptions.metricsHttpPort = cmdS3.Flag.Int("metricsPort", 0, "Prometheus metrics listen port") s3StandaloneOptions.metricsHttpIp = cmdS3.Flag.String("metricsIp", "", "metrics listen ip. If empty, default to same as -ip.bind option.") - s3StandaloneOptions.allowEmptyFolder = cmdS3.Flag.Bool("allowEmptyFolder", true, "allow empty folders") + cmdS3.Flag.Bool("allowEmptyFolder", true, "deprecated, ignored. 
Empty folder cleanup is now automatic.") s3StandaloneOptions.allowDeleteBucketNotEmpty = cmdS3.Flag.Bool("allowDeleteBucketNotEmpty", true, "allow recursive deleting all entries along with bucket") s3StandaloneOptions.localFilerSocket = cmdS3.Flag.String("localFilerSocket", "", "local filer socket path") s3StandaloneOptions.localSocket = cmdS3.Flag.String("localSocket", "", "default to /tmp/seaweedfs-s3-.sock") - s3StandaloneOptions.idleTimeout = cmdS3.Flag.Int("idleTimeout", 10, "connection idle seconds") + s3StandaloneOptions.idleTimeout = cmdS3.Flag.Int("idleTimeout", 120, "connection idle seconds") s3StandaloneOptions.concurrentUploadLimitMB = cmdS3.Flag.Int("concurrentUploadLimitMB", 128, "limit total concurrent upload size") s3StandaloneOptions.concurrentFileUploadLimit = cmdS3.Flag.Int("concurrentFileUploadLimit", 0, "limit number of concurrent file uploads, 0 means unlimited") } @@ -273,7 +272,6 @@ func (s3opt *S3Options) startS3Server() bool { AllowedOrigins: strings.Split(*s3opt.allowedOrigins, ","), BucketsPath: filerBucketsPath, GrpcDialOption: grpcDialOption, - AllowEmptyFolder: *s3opt.allowEmptyFolder, AllowDeleteBucketNotEmpty: *s3opt.allowDeleteBucketNotEmpty, LocalFilerSocket: localFilerSocket, DataCenter: *s3opt.dataCenter, diff --git a/weed/command/server.go b/weed/command/server.go index 47df30fc2..7d1606189 100644 --- a/weed/command/server.go +++ b/weed/command/server.go @@ -133,11 +133,13 @@ func init() { serverOptions.v.port = cmdServer.Flag.Int("volume.port", 8080, "volume server http listen port") serverOptions.v.portGrpc = cmdServer.Flag.Int("volume.port.grpc", 0, "volume server grpc listen port") serverOptions.v.publicPort = cmdServer.Flag.Int("volume.port.public", 0, "volume server public port") + serverOptions.v.id = cmdServer.Flag.String("volume.id", "", "volume server id. If empty, default to ip:port") serverOptions.v.indexType = cmdServer.Flag.String("volume.index", "memory", "Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance.") serverOptions.v.diskType = cmdServer.Flag.String("volume.disk", "", "[hdd|ssd|] hard drive or solid state drive or any tag") serverOptions.v.fixJpgOrientation = cmdServer.Flag.Bool("volume.images.fix.orientation", false, "Adjust jpg orientation when uploading.") serverOptions.v.readMode = cmdServer.Flag.String("volume.readMode", "proxy", "[local|proxy|redirect] how to deal with non-local volume: 'not found|read in remote node|redirect volume location'.") serverOptions.v.compactionMBPerSecond = cmdServer.Flag.Int("volume.compactionMBps", 0, "limit compaction speed in mega bytes per second") + serverOptions.v.maintenanceMBPerSecond = cmdServer.Flag.Int("volume.maintenanceMBps", 0, "limit maintenance (replication / balance) IO rate in MB/s. Unset is 0, no limitation.") serverOptions.v.fileSizeLimitMB = cmdServer.Flag.Int("volume.fileSizeLimitMB", 256, "limit file size to avoid out of memory") serverOptions.v.ldbTimeout = cmdServer.Flag.Int64("volume.index.leveldbTimeout", 0, "alive time for leveldb (default to 0). 
If leveldb of volume is not accessed in ldbTimeout hours, it will be off loaded to reduce opened files and memory consumption.") serverOptions.v.concurrentUploadLimitMB = cmdServer.Flag.Int("volume.concurrentUploadLimitMB", 64, "limit total concurrent upload size") @@ -164,11 +166,11 @@ func init() { s3Options.config = cmdServer.Flag.String("s3.config", "", "path to the config file") s3Options.iamConfig = cmdServer.Flag.String("s3.iam.config", "", "path to the advanced IAM config file for S3. Overrides -iam.config if both are provided.") s3Options.auditLogConfig = cmdServer.Flag.String("s3.auditLogConfig", "", "path to the audit log config file") - s3Options.allowEmptyFolder = cmdServer.Flag.Bool("s3.allowEmptyFolder", true, "allow empty folders") + cmdServer.Flag.Bool("s3.allowEmptyFolder", true, "deprecated, ignored. Empty folder cleanup is now automatic.") s3Options.allowDeleteBucketNotEmpty = cmdServer.Flag.Bool("s3.allowDeleteBucketNotEmpty", true, "allow recursive deleting all entries along with bucket") s3Options.localSocket = cmdServer.Flag.String("s3.localSocket", "", "default to /tmp/seaweedfs-s3-.sock") s3Options.bindIp = cmdServer.Flag.String("s3.ip.bind", "", "ip address to bind to. If empty, default to same as -ip.bind option.") - s3Options.idleTimeout = cmdServer.Flag.Int("s3.idleTimeout", 10, "connection idle seconds") + s3Options.idleTimeout = cmdServer.Flag.Int("s3.idleTimeout", 120, "connection idle seconds") s3Options.concurrentUploadLimitMB = cmdServer.Flag.Int("s3.concurrentUploadLimitMB", 128, "limit total concurrent upload size for S3") s3Options.concurrentFileUploadLimit = cmdServer.Flag.Int("s3.concurrentFileUploadLimit", 0, "limit number of concurrent file uploads for S3, 0 means unlimited") diff --git a/weed/command/volume.go b/weed/command/volume.go index e21437e9a..ae9f5e7f4 100644 --- a/weed/command/volume.go +++ b/weed/command/volume.go @@ -41,6 +41,7 @@ type VolumeServerOptions struct { folderMaxLimits []int32 idxFolder *string ip *string + id *string publicUrl *string bindIp *string mastersString *string @@ -57,6 +58,7 @@ type VolumeServerOptions struct { cpuProfile *string memProfile *string compactionMBPerSecond *int + maintenanceMBPerSecond *int fileSizeLimitMB *int concurrentUploadLimitMB *int concurrentDownloadLimitMB *int @@ -78,6 +80,7 @@ func init() { v.portGrpc = cmdVolume.Flag.Int("port.grpc", 0, "grpc listen port") v.publicPort = cmdVolume.Flag.Int("port.public", 0, "port opened to public") v.ip = cmdVolume.Flag.String("ip", util.DetectedHostAddress(), "ip or server name, also used as identifier") + v.id = cmdVolume.Flag.String("id", "", "volume server id. If empty, default to ip:port") v.publicUrl = cmdVolume.Flag.String("publicUrl", "", "Publicly accessible address") v.bindIp = cmdVolume.Flag.String("ip.bind", "", "ip address to bind to. If empty, default to same as -ip option.") v.mastersString = cmdVolume.Flag.String("master", "localhost:9333", "comma-separated master servers") @@ -94,6 +97,7 @@ func init() { v.cpuProfile = cmdVolume.Flag.String("cpuprofile", "", "cpu profile output file") v.memProfile = cmdVolume.Flag.String("memprofile", "", "memory profile output file") v.compactionMBPerSecond = cmdVolume.Flag.Int("compactionMBps", 0, "limit background compaction or copying speed in mega bytes per second") + v.maintenanceMBPerSecond = cmdVolume.Flag.Int("maintenanceMBps", 0, "limit maintenance (replication / balance) IO rate in MB/s. 
Unset is 0, no limitation.") v.fileSizeLimitMB = cmdVolume.Flag.Int("fileSizeLimitMB", 256, "limit file size to avoid out of memory") v.ldbTimeout = cmdVolume.Flag.Int64("index.leveldbTimeout", 0, "alive time for leveldb (default to 0). If leveldb of volume is not accessed in ldbTimeout hours, it will be off loaded to reduce opened files and memory consumption.") v.concurrentUploadLimitMB = cmdVolume.Flag.Int("concurrentUploadLimitMB", 256, "limit total concurrent upload size") @@ -253,8 +257,11 @@ func (v VolumeServerOptions) startVolumeServer(volumeFolders, maxVolumeCounts, v volumeNeedleMapKind = storage.NeedleMapLevelDbLarge } + // Determine volume server ID: if not specified, use ip:port + volumeServerId := util.GetVolumeServerId(*v.id, *v.ip, *v.port) + volumeServer := weed_server.NewVolumeServer(volumeMux, publicVolumeMux, - *v.ip, *v.port, *v.portGrpc, *v.publicUrl, + *v.ip, *v.port, *v.portGrpc, *v.publicUrl, volumeServerId, v.folders, v.folderMaxLimits, minFreeSpaces, diskTypes, *v.idxFolder, volumeNeedleMapKind, @@ -262,6 +269,7 @@ func (v VolumeServerOptions) startVolumeServer(volumeFolders, maxVolumeCounts, v v.whiteList, *v.fixJpgOrientation, *v.readMode, *v.compactionMBPerSecond, + *v.maintenanceMBPerSecond, *v.fileSizeLimitMB, int64(*v.concurrentUploadLimitMB)*1024*1024, int64(*v.concurrentDownloadLimitMB)*1024*1024, diff --git a/weed/credential/filer_etc/filer_etc_store.go b/weed/credential/filer_etc/filer_etc_store.go index b181a55f0..e174b5ef4 100644 --- a/weed/credential/filer_etc/filer_etc_store.go +++ b/weed/credential/filer_etc/filer_etc_store.go @@ -58,7 +58,7 @@ func (store *FilerEtcStore) withFilerClient(fn func(client filer_pb.SeaweedFiler store.mu.RLock() if store.filerAddressFunc == nil { store.mu.RUnlock() - return fmt.Errorf("filer_etc: filer address function not configured") + return fmt.Errorf("filer_etc: filer not yet available - please wait for filer discovery to complete and try again") } filerAddress := store.filerAddressFunc() @@ -66,7 +66,7 @@ func (store *FilerEtcStore) withFilerClient(fn func(client filer_pb.SeaweedFiler store.mu.RUnlock() if filerAddress == "" { - return fmt.Errorf("filer_etc: filer address is empty") + return fmt.Errorf("filer_etc: no filer discovered yet - please ensure a filer is running and accessible") } // Use the pb.WithGrpcFilerClient helper similar to existing code diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue.go b/weed/filer/empty_folder_cleanup/cleanup_queue.go new file mode 100644 index 000000000..f92af389d --- /dev/null +++ b/weed/filer/empty_folder_cleanup/cleanup_queue.go @@ -0,0 +1,207 @@ +package empty_folder_cleanup + +import ( + "container/list" + "sync" + "time" +) + +// CleanupQueue manages a deduplicated queue of folders pending cleanup. +// It uses a doubly-linked list ordered by event time (oldest at front) and a map for O(1) deduplication. 
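+// All methods lock an internal mutex, so a single queue instance can be shared
+// by the event handlers and the background processor without external locking.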
+// Processing is triggered when: +// - Queue size reaches maxSize, OR +// - Oldest item exceeds maxAge +type CleanupQueue struct { + mu sync.Mutex + items *list.List // Linked list of *queueItem ordered by time (front = oldest) + itemsMap map[string]*list.Element // folder -> list element for O(1) lookup + maxSize int // Max queue size before triggering cleanup + maxAge time.Duration // Max age before triggering cleanup +} + +// queueItem represents an item in the cleanup queue +type queueItem struct { + folder string + queueTime time.Time +} + +// NewCleanupQueue creates a new CleanupQueue with the specified limits +func NewCleanupQueue(maxSize int, maxAge time.Duration) *CleanupQueue { + return &CleanupQueue{ + items: list.New(), + itemsMap: make(map[string]*list.Element), + maxSize: maxSize, + maxAge: maxAge, + } +} + +// Add adds a folder to the queue with the specified event time. +// The item is inserted in time-sorted order (oldest at front) to handle out-of-order events. +// If folder already exists with an older time, the time is updated and position adjusted. +// Returns true if the folder was newly added, false if it was updated. +func (q *CleanupQueue) Add(folder string, eventTime time.Time) bool { + q.mu.Lock() + defer q.mu.Unlock() + + // Check if folder already exists + if elem, exists := q.itemsMap[folder]; exists { + existingItem := elem.Value.(*queueItem) + // Only update if new event is later + if eventTime.After(existingItem.queueTime) { + // Remove from current position + q.items.Remove(elem) + // Re-insert with new time in sorted position + newElem := q.insertSorted(folder, eventTime) + q.itemsMap[folder] = newElem + } + return false + } + + // Insert new folder in sorted position + elem := q.insertSorted(folder, eventTime) + q.itemsMap[folder] = elem + return true +} + +// insertSorted inserts an item in the correct position to maintain time ordering (oldest at front) +func (q *CleanupQueue) insertSorted(folder string, eventTime time.Time) *list.Element { + item := &queueItem{ + folder: folder, + queueTime: eventTime, + } + + // Find the correct position (insert before the first item with a later time) + for elem := q.items.Back(); elem != nil; elem = elem.Prev() { + existingItem := elem.Value.(*queueItem) + if !eventTime.Before(existingItem.queueTime) { + // Insert after this element + return q.items.InsertAfter(item, elem) + } + } + + // This item is the oldest, insert at front + return q.items.PushFront(item) +} + +// Remove removes a specific folder from the queue (e.g., when a file is created). +// Returns true if the folder was found and removed. +func (q *CleanupQueue) Remove(folder string) bool { + q.mu.Lock() + defer q.mu.Unlock() + + elem, exists := q.itemsMap[folder] + if !exists { + return false + } + + q.items.Remove(elem) + delete(q.itemsMap, folder) + return true +} + +// ShouldProcess returns true if the queue should be processed. 
+// This is true when: +// - Queue size >= maxSize, OR +// - Oldest item age > maxAge +func (q *CleanupQueue) ShouldProcess() bool { + q.mu.Lock() + defer q.mu.Unlock() + + return q.shouldProcessLocked() +} + +// shouldProcessLocked checks if processing is needed (caller must hold lock) +func (q *CleanupQueue) shouldProcessLocked() bool { + if q.items.Len() == 0 { + return false + } + + // Check if queue is full + if q.items.Len() >= q.maxSize { + return true + } + + // Check if oldest item exceeds max age + front := q.items.Front() + if front != nil { + item := front.Value.(*queueItem) + if time.Since(item.queueTime) > q.maxAge { + return true + } + } + + return false +} + +// Pop removes and returns the oldest folder from the queue. +// Returns the folder and true if an item was available, or empty string and false if queue is empty. +func (q *CleanupQueue) Pop() (string, bool) { + q.mu.Lock() + defer q.mu.Unlock() + + front := q.items.Front() + if front == nil { + return "", false + } + + item := front.Value.(*queueItem) + q.items.Remove(front) + delete(q.itemsMap, item.folder) + + return item.folder, true +} + +// Peek returns the oldest folder without removing it. +// Returns the folder and queue time if available, or empty values if queue is empty. +func (q *CleanupQueue) Peek() (folder string, queueTime time.Time, ok bool) { + q.mu.Lock() + defer q.mu.Unlock() + + front := q.items.Front() + if front == nil { + return "", time.Time{}, false + } + + item := front.Value.(*queueItem) + return item.folder, item.queueTime, true +} + +// Len returns the current queue size. +func (q *CleanupQueue) Len() int { + q.mu.Lock() + defer q.mu.Unlock() + return q.items.Len() +} + +// Contains checks if a folder is in the queue. +func (q *CleanupQueue) Contains(folder string) bool { + q.mu.Lock() + defer q.mu.Unlock() + _, exists := q.itemsMap[folder] + return exists +} + +// Clear removes all items from the queue. +func (q *CleanupQueue) Clear() { + q.mu.Lock() + defer q.mu.Unlock() + + q.items.Init() + q.itemsMap = make(map[string]*list.Element) +} + +// OldestAge returns the age of the oldest item in the queue, or 0 if empty. 
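+// The age is measured against the recorded event time rather than insertion
+// time, so an item queued with a past event time can exceed maxAge immediately.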
+func (q *CleanupQueue) OldestAge() time.Duration { + q.mu.Lock() + defer q.mu.Unlock() + + front := q.items.Front() + if front == nil { + return 0 + } + + item := front.Value.(*queueItem) + return time.Since(item.queueTime) +} + + diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue_test.go b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go new file mode 100644 index 000000000..2effa3138 --- /dev/null +++ b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go @@ -0,0 +1,371 @@ +package empty_folder_cleanup + +import ( + "testing" + "time" +) + +func TestCleanupQueue_Add(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + // Add first item + if !q.Add("/buckets/b1/folder1", now) { + t.Error("expected Add to return true for new item") + } + if q.Len() != 1 { + t.Errorf("expected len 1, got %d", q.Len()) + } + + // Add second item with later time + if !q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) { + t.Error("expected Add to return true for new item") + } + if q.Len() != 2 { + t.Errorf("expected len 2, got %d", q.Len()) + } + + // Add duplicate with newer time - should update and reposition + if q.Add("/buckets/b1/folder1", now.Add(2*time.Second)) { + t.Error("expected Add to return false for existing item") + } + if q.Len() != 2 { + t.Errorf("expected len 2 after duplicate, got %d", q.Len()) + } + + // folder1 should now be at the back (newer time) - verify by popping + folder1, _ := q.Pop() + folder2, _ := q.Pop() + if folder1 != "/buckets/b1/folder2" || folder2 != "/buckets/b1/folder1" { + t.Errorf("expected folder1 to be moved to back, got %s, %s", folder1, folder2) + } +} + +func TestCleanupQueue_Add_OutOfOrder(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add items out of order + q.Add("/buckets/b1/folder3", baseTime.Add(3*time.Second)) + q.Add("/buckets/b1/folder1", baseTime.Add(1*time.Second)) + q.Add("/buckets/b1/folder2", baseTime.Add(2*time.Second)) + + // Items should be in time order (oldest first) - verify by popping + expected := []string{"/buckets/b1/folder1", "/buckets/b1/folder2", "/buckets/b1/folder3"} + for i, exp := range expected { + folder, ok := q.Pop() + if !ok || folder != exp { + t.Errorf("at index %d: expected %s, got %s", i, exp, folder) + } + } +} + +func TestCleanupQueue_Add_DuplicateWithOlderTime(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add folder at t=5 + q.Add("/buckets/b1/folder1", baseTime.Add(5*time.Second)) + + // Try to add same folder with older time - should NOT update + q.Add("/buckets/b1/folder1", baseTime.Add(2*time.Second)) + + // Time should remain at t=5 + _, queueTime, _ := q.Peek() + if queueTime != baseTime.Add(5*time.Second) { + t.Errorf("expected time to remain unchanged, got %v", queueTime) + } +} + +func TestCleanupQueue_Remove(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + + // Remove middle item + if !q.Remove("/buckets/b1/folder2") { + t.Error("expected Remove to return true for existing item") + } + if q.Len() != 2 { + t.Errorf("expected len 2, got %d", q.Len()) + } + if q.Contains("/buckets/b1/folder2") { + t.Error("removed item should not be in queue") + } + + // Remove non-existent item + if q.Remove("/buckets/b1/nonexistent") { + t.Error("expected Remove to return false for non-existent item") + } + + // 
Verify order is preserved by popping + folder1, _ := q.Pop() + folder3, _ := q.Pop() + if folder1 != "/buckets/b1/folder1" || folder3 != "/buckets/b1/folder3" { + t.Errorf("unexpected order: %s, %s", folder1, folder3) + } +} + +func TestCleanupQueue_Pop(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + // Pop from empty queue + folder, ok := q.Pop() + if ok { + t.Error("expected Pop to return false for empty queue") + } + if folder != "" { + t.Errorf("expected empty folder, got %s", folder) + } + + // Add items and pop in order + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + + folder, ok = q.Pop() + if !ok || folder != "/buckets/b1/folder1" { + t.Errorf("expected folder1, got %s (ok=%v)", folder, ok) + } + + folder, ok = q.Pop() + if !ok || folder != "/buckets/b1/folder2" { + t.Errorf("expected folder2, got %s (ok=%v)", folder, ok) + } + + folder, ok = q.Pop() + if !ok || folder != "/buckets/b1/folder3" { + t.Errorf("expected folder3, got %s (ok=%v)", folder, ok) + } + + // Queue should be empty now + if q.Len() != 0 { + t.Errorf("expected empty queue, got len %d", q.Len()) + } +} + +func TestCleanupQueue_Peek(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + // Peek empty queue + folder, _, ok := q.Peek() + if ok { + t.Error("expected Peek to return false for empty queue") + } + + // Add item and peek + q.Add("/buckets/b1/folder1", now) + folder, queueTime, ok := q.Peek() + if !ok || folder != "/buckets/b1/folder1" { + t.Errorf("expected folder1, got %s (ok=%v)", folder, ok) + } + if queueTime != now { + t.Errorf("expected queue time %v, got %v", now, queueTime) + } + + // Peek should not remove item + if q.Len() != 1 { + t.Errorf("Peek should not remove item, len=%d", q.Len()) + } +} + +func TestCleanupQueue_Contains(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + q.Add("/buckets/b1/folder1", now) + + if !q.Contains("/buckets/b1/folder1") { + t.Error("expected Contains to return true") + } + if q.Contains("/buckets/b1/folder2") { + t.Error("expected Contains to return false for non-existent") + } +} + +func TestCleanupQueue_ShouldProcess_MaxSize(t *testing.T) { + q := NewCleanupQueue(3, 10*time.Minute) + now := time.Now() + + // Empty queue + if q.ShouldProcess() { + t.Error("empty queue should not need processing") + } + + // Add items below max + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + if q.ShouldProcess() { + t.Error("queue below max should not need processing") + } + + // Add item to reach max + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + if !q.ShouldProcess() { + t.Error("queue at max should need processing") + } +} + +func TestCleanupQueue_ShouldProcess_MaxAge(t *testing.T) { + q := NewCleanupQueue(100, 100*time.Millisecond) // Short max age for testing + + // Add item with old event time + oldTime := time.Now().Add(-1 * time.Second) // 1 second ago + q.Add("/buckets/b1/folder1", oldTime) + + // Item is older than maxAge, should need processing + if !q.ShouldProcess() { + t.Error("old item should trigger processing") + } + + // Clear and add fresh item + q.Clear() + q.Add("/buckets/b1/folder2", time.Now()) + + // Fresh item should not trigger processing + if q.ShouldProcess() { + t.Error("fresh item should not trigger processing") + } +} + +func TestCleanupQueue_Clear(t *testing.T) { + q := NewCleanupQueue(100, 
10*time.Minute) + now := time.Now() + + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + + q.Clear() + + if q.Len() != 0 { + t.Errorf("expected empty queue after Clear, got len %d", q.Len()) + } + if q.Contains("/buckets/b1/folder1") { + t.Error("queue should not contain items after Clear") + } +} + +func TestCleanupQueue_OldestAge(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + + // Empty queue + if q.OldestAge() != 0 { + t.Error("empty queue should have zero oldest age") + } + + // Add item with time in the past + oldTime := time.Now().Add(-5 * time.Minute) + q.Add("/buckets/b1/folder1", oldTime) + + // Age should be approximately 5 minutes + age := q.OldestAge() + if age < 4*time.Minute || age > 6*time.Minute { + t.Errorf("expected ~5m age, got %v", age) + } +} + +func TestCleanupQueue_TimeOrder(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add items in order + items := []string{ + "/buckets/b1/a", + "/buckets/b1/b", + "/buckets/b1/c", + "/buckets/b1/d", + "/buckets/b1/e", + } + for i, item := range items { + q.Add(item, baseTime.Add(time.Duration(i)*time.Second)) + } + + // Pop should return in time order + for i, expected := range items { + got, ok := q.Pop() + if !ok { + t.Errorf("Pop %d: expected item, got empty", i) + } + if got != expected { + t.Errorf("Pop %d: expected %s, got %s", i, expected, got) + } + } +} + +func TestCleanupQueue_DuplicateWithNewerTime(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add items + q.Add("/buckets/b1/folder1", baseTime) + q.Add("/buckets/b1/folder2", baseTime.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", baseTime.Add(2*time.Second)) + + // Add duplicate with newer time - should update and reposition + q.Add("/buckets/b1/folder1", baseTime.Add(3*time.Second)) + + // folder1 should now be at the back (newest time) - verify by popping + expected := []string{"/buckets/b1/folder2", "/buckets/b1/folder3", "/buckets/b1/folder1"} + for i, exp := range expected { + folder, ok := q.Pop() + if !ok || folder != exp { + t.Errorf("at index %d: expected %s, got %s", i, exp, folder) + } + } +} + +func TestCleanupQueue_Concurrent(t *testing.T) { + q := NewCleanupQueue(1000, 10*time.Minute) + done := make(chan bool) + now := time.Now() + + // Concurrent adds + go func() { + for i := 0; i < 100; i++ { + q.Add("/buckets/b1/folder"+string(rune('A'+i%26)), now.Add(time.Duration(i)*time.Millisecond)) + } + done <- true + }() + + // Concurrent removes + go func() { + for i := 0; i < 50; i++ { + q.Remove("/buckets/b1/folder" + string(rune('A'+i%26))) + } + done <- true + }() + + // Concurrent pops + go func() { + for i := 0; i < 30; i++ { + q.Pop() + } + done <- true + }() + + // Concurrent reads + go func() { + for i := 0; i < 100; i++ { + q.Len() + q.Contains("/buckets/b1/folderA") + q.ShouldProcess() + } + done <- true + }() + + // Wait for all goroutines + for i := 0; i < 4; i++ { + <-done + } + + // Just verify no panic occurred and queue is in consistent state + _ = q.Len() +} + + diff --git a/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go b/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go new file mode 100644 index 000000000..70856aaf1 --- /dev/null +++ b/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go @@ -0,0 +1,436 @@ +package empty_folder_cleanup + +import ( + "context" + "strings" + "sync" + "time" + + 
"github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager" + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb" + "github.com/seaweedfs/seaweedfs/weed/util" +) + +const ( + DefaultMaxCountCheck = 1000 + DefaultCacheExpiry = 5 * time.Minute + DefaultQueueMaxSize = 1000 + DefaultQueueMaxAge = 10 * time.Minute + DefaultProcessorSleep = 10 * time.Second // How often to check queue +) + +// FilerOperations defines the filer operations needed by EmptyFolderCleaner +type FilerOperations interface { + CountDirectoryEntries(ctx context.Context, dirPath util.FullPath, limit int) (count int, err error) + DeleteEntryMetaAndData(ctx context.Context, p util.FullPath, isRecursive, ignoreRecursiveError, shouldDeleteChunks, isFromOtherCluster bool, signatures []int32, ifNotModifiedAfter int64) error +} + +// folderState tracks the state of a folder for empty folder cleanup +type folderState struct { + roughCount int // Cached rough count (up to maxCountCheck) + lastAddTime time.Time // Last time an item was added + lastDelTime time.Time // Last time an item was deleted + lastCheck time.Time // Last time we checked the actual count +} + +// EmptyFolderCleaner handles asynchronous cleanup of empty folders +// Each filer owns specific folders via consistent hashing based on the peer filer list +type EmptyFolderCleaner struct { + filer FilerOperations + lockRing *lock_manager.LockRing + host pb.ServerAddress + + // Folder state tracking + mu sync.RWMutex + folderCounts map[string]*folderState // Rough count cache + + // Cleanup queue (thread-safe, has its own lock) + cleanupQueue *CleanupQueue + + // Configuration + maxCountCheck int // Max items to count (1000) + cacheExpiry time.Duration // How long to keep cache entries + processorSleep time.Duration // How often processor checks queue + bucketPath string // e.g., "/buckets" + + // Control + enabled bool + stopCh chan struct{} +} + +// NewEmptyFolderCleaner creates a new EmptyFolderCleaner +func NewEmptyFolderCleaner(filer FilerOperations, lockRing *lock_manager.LockRing, host pb.ServerAddress, bucketPath string) *EmptyFolderCleaner { + efc := &EmptyFolderCleaner{ + filer: filer, + lockRing: lockRing, + host: host, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(DefaultQueueMaxSize, DefaultQueueMaxAge), + maxCountCheck: DefaultMaxCountCheck, + cacheExpiry: DefaultCacheExpiry, + processorSleep: DefaultProcessorSleep, + bucketPath: bucketPath, + enabled: true, + stopCh: make(chan struct{}), + } + go efc.cacheEvictionLoop() + go efc.cleanupProcessor() + return efc +} + +// SetEnabled enables or disables the cleaner +func (efc *EmptyFolderCleaner) SetEnabled(enabled bool) { + efc.mu.Lock() + defer efc.mu.Unlock() + efc.enabled = enabled +} + +// IsEnabled returns whether the cleaner is enabled +func (efc *EmptyFolderCleaner) IsEnabled() bool { + efc.mu.RLock() + defer efc.mu.RUnlock() + return efc.enabled +} + +// ownsFolder checks if this filer owns the folder via consistent hashing +func (efc *EmptyFolderCleaner) ownsFolder(folder string) bool { + servers := efc.lockRing.GetSnapshot() + if len(servers) <= 1 { + return true // Single filer case + } + return efc.hashKeyToServer(folder, servers) == efc.host +} + +// hashKeyToServer uses consistent hashing to map a folder to a server +func (efc *EmptyFolderCleaner) hashKeyToServer(key string, servers []pb.ServerAddress) pb.ServerAddress { + if len(servers) == 0 { + return "" + } + x := util.HashStringToLong(key) + if x < 0 { + x = -x + } + x = x % 
int64(len(servers)) + return servers[x] +} + +// OnDeleteEvent is called when a file or directory is deleted +// Both file and directory deletions count towards making the parent folder empty +// eventTime is the time when the delete event occurred (for proper ordering) +func (efc *EmptyFolderCleaner) OnDeleteEvent(directory string, entryName string, isDirectory bool, eventTime time.Time) { + // Skip if not under bucket path (must be at least /buckets//...) + if efc.bucketPath != "" && !isUnderBucketPath(directory, efc.bucketPath) { + return + } + + // Check if we own this folder + if !efc.ownsFolder(directory) { + glog.V(4).Infof("EmptyFolderCleaner: not owner of %s, skipping", directory) + return + } + + efc.mu.Lock() + defer efc.mu.Unlock() + + // Check enabled inside lock to avoid race with Stop() + if !efc.enabled { + return + } + + glog.V(3).Infof("EmptyFolderCleaner: delete event in %s/%s (isDir=%v)", directory, entryName, isDirectory) + + // Update cached count (create entry if needed) + state, exists := efc.folderCounts[directory] + if !exists { + state = &folderState{} + efc.folderCounts[directory] = state + } + if state.roughCount > 0 { + state.roughCount-- + } + state.lastDelTime = eventTime + + // Only add to cleanup queue if roughCount suggests folder might be empty + if state.roughCount > 0 { + glog.V(3).Infof("EmptyFolderCleaner: skipping queue for %s, roughCount=%d", directory, state.roughCount) + return + } + + // Add to cleanup queue with event time (handles out-of-order events) + if efc.cleanupQueue.Add(directory, eventTime) { + glog.V(3).Infof("EmptyFolderCleaner: queued %s for cleanup", directory) + } +} + +// OnCreateEvent is called when a file or directory is created +// Both file and directory creations cancel pending cleanup for the parent folder +func (efc *EmptyFolderCleaner) OnCreateEvent(directory string, entryName string, isDirectory bool) { + // Skip if not under bucket path (must be at least /buckets//...) 
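+	// i.e. the directory must be inside a bucket, e.g. /buckets/mybucket/folder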
+ if efc.bucketPath != "" && !isUnderBucketPath(directory, efc.bucketPath) { + return + } + + efc.mu.Lock() + defer efc.mu.Unlock() + + // Check enabled inside lock to avoid race with Stop() + if !efc.enabled { + return + } + + // Update cached count only if already tracked (no need to track new folders) + if state, exists := efc.folderCounts[directory]; exists { + state.roughCount++ + state.lastAddTime = time.Now() + } + + // Remove from cleanup queue (cancel pending cleanup) + if efc.cleanupQueue.Remove(directory) { + glog.V(3).Infof("EmptyFolderCleaner: cancelled cleanup for %s due to new entry", directory) + } +} + +// cleanupProcessor runs in background and processes the cleanup queue +func (efc *EmptyFolderCleaner) cleanupProcessor() { + ticker := time.NewTicker(efc.processorSleep) + defer ticker.Stop() + + for { + select { + case <-efc.stopCh: + return + case <-ticker.C: + efc.processCleanupQueue() + } + } +} + +// processCleanupQueue processes items from the cleanup queue +func (efc *EmptyFolderCleaner) processCleanupQueue() { + // Check if we should process + if !efc.cleanupQueue.ShouldProcess() { + return + } + + glog.V(3).Infof("EmptyFolderCleaner: processing cleanup queue (len=%d, age=%v)", + efc.cleanupQueue.Len(), efc.cleanupQueue.OldestAge()) + + // Process all items that are ready + for efc.cleanupQueue.Len() > 0 { + // Check if still enabled + if !efc.IsEnabled() { + return + } + + // Pop the oldest item + folder, ok := efc.cleanupQueue.Pop() + if !ok { + break + } + + // Execute cleanup for this folder + efc.executeCleanup(folder) + + // If queue is no longer full and oldest item is not old enough, stop processing + if !efc.cleanupQueue.ShouldProcess() { + break + } + } +} + +// executeCleanup performs the actual cleanup of an empty folder +func (efc *EmptyFolderCleaner) executeCleanup(folder string) { + efc.mu.Lock() + + // Quick check: if we have cached count and it's > 0, skip + if state, exists := efc.folderCounts[folder]; exists { + if state.roughCount > 0 { + glog.V(3).Infof("EmptyFolderCleaner: skipping %s, cached count=%d", folder, state.roughCount) + efc.mu.Unlock() + return + } + // If there was an add after our delete, skip + if !state.lastAddTime.IsZero() && state.lastAddTime.After(state.lastDelTime) { + glog.V(3).Infof("EmptyFolderCleaner: skipping %s, add happened after delete", folder) + efc.mu.Unlock() + return + } + } + efc.mu.Unlock() + + // Re-check ownership (topology might have changed) + if !efc.ownsFolder(folder) { + glog.V(3).Infof("EmptyFolderCleaner: no longer owner of %s, skipping", folder) + return + } + + // Check if folder is actually empty (count up to maxCountCheck) + ctx := context.Background() + count, err := efc.countItems(ctx, folder) + if err != nil { + glog.V(2).Infof("EmptyFolderCleaner: error counting items in %s: %v", folder, err) + return + } + + efc.mu.Lock() + // Update cache + if _, exists := efc.folderCounts[folder]; !exists { + efc.folderCounts[folder] = &folderState{} + } + efc.folderCounts[folder].roughCount = count + efc.folderCounts[folder].lastCheck = time.Now() + efc.mu.Unlock() + + if count > 0 { + glog.V(3).Infof("EmptyFolderCleaner: folder %s has %d items, not empty", folder, count) + return + } + + // Delete the empty folder + glog.V(2).Infof("EmptyFolderCleaner: deleting empty folder %s", folder) + if err := efc.deleteFolder(ctx, folder); err != nil { + glog.V(2).Infof("EmptyFolderCleaner: failed to delete empty folder %s: %v", folder, err) + return + } + + // Clean up cache entry + efc.mu.Lock() + 
delete(efc.folderCounts, folder) + efc.mu.Unlock() + + // Note: No need to recursively check parent folder here. + // The deletion of this folder will generate a metadata event, + // which will trigger OnDeleteEvent for the parent folder. +} + +// countItems counts items in a folder (up to maxCountCheck) +func (efc *EmptyFolderCleaner) countItems(ctx context.Context, folder string) (int, error) { + return efc.filer.CountDirectoryEntries(ctx, util.FullPath(folder), efc.maxCountCheck) +} + +// deleteFolder deletes an empty folder +func (efc *EmptyFolderCleaner) deleteFolder(ctx context.Context, folder string) error { + return efc.filer.DeleteEntryMetaAndData(ctx, util.FullPath(folder), false, false, false, false, nil, 0) +} + +// isUnderPath checks if child is under parent path +func isUnderPath(child, parent string) bool { + if parent == "" || parent == "/" { + return true + } + // Ensure parent ends without slash for proper prefix matching + if len(parent) > 0 && parent[len(parent)-1] == '/' { + parent = parent[:len(parent)-1] + } + // Child must start with parent and then have a / or be exactly parent + if len(child) < len(parent) { + return false + } + if child[:len(parent)] != parent { + return false + } + if len(child) == len(parent) { + return true + } + return child[len(parent)] == '/' +} + +// isUnderBucketPath checks if directory is inside a bucket (under /buckets//...) +// This ensures we only clean up folders inside buckets, not the buckets themselves +func isUnderBucketPath(directory, bucketPath string) bool { + if bucketPath == "" { + return true + } + // Ensure bucketPath ends without slash + if len(bucketPath) > 0 && bucketPath[len(bucketPath)-1] == '/' { + bucketPath = bucketPath[:len(bucketPath)-1] + } + // Directory must be under bucketPath + if !isUnderPath(directory, bucketPath) { + return false + } + // Directory must be at least /buckets// + // i.e., depth must be at least bucketPath depth + 2 + // For /buckets (depth 1), we need at least /buckets/mybucket/folder (depth 3) + bucketPathDepth := strings.Count(bucketPath, "/") + directoryDepth := strings.Count(directory, "/") + return directoryDepth >= bucketPathDepth+2 +} + +// cacheEvictionLoop periodically removes stale entries from folderCounts +func (efc *EmptyFolderCleaner) cacheEvictionLoop() { + ticker := time.NewTicker(efc.cacheExpiry) + defer ticker.Stop() + + for { + select { + case <-efc.stopCh: + return + case <-ticker.C: + efc.evictStaleCacheEntries() + } + } +} + +// evictStaleCacheEntries removes cache entries that haven't been accessed recently +func (efc *EmptyFolderCleaner) evictStaleCacheEntries() { + efc.mu.Lock() + defer efc.mu.Unlock() + + now := time.Now() + expiredCount := 0 + for folder, state := range efc.folderCounts { + // Skip if folder is in cleanup queue + if efc.cleanupQueue.Contains(folder) { + continue + } + + // Find the most recent activity time for this folder + lastActivity := state.lastCheck + if state.lastAddTime.After(lastActivity) { + lastActivity = state.lastAddTime + } + if state.lastDelTime.After(lastActivity) { + lastActivity = state.lastDelTime + } + + // Evict if no activity within cache expiry period + if now.Sub(lastActivity) > efc.cacheExpiry { + delete(efc.folderCounts, folder) + expiredCount++ + } + } + + if expiredCount > 0 { + glog.V(3).Infof("EmptyFolderCleaner: evicted %d stale cache entries", expiredCount) + } +} + +// Stop stops the cleaner and cancels all pending tasks +func (efc *EmptyFolderCleaner) Stop() { + close(efc.stopCh) + + efc.mu.Lock() + defer 
efc.mu.Unlock() + + efc.enabled = false + efc.cleanupQueue.Clear() + efc.folderCounts = make(map[string]*folderState) // Clear cache on stop +} + +// GetPendingCleanupCount returns the number of pending cleanup tasks (for testing) +func (efc *EmptyFolderCleaner) GetPendingCleanupCount() int { + return efc.cleanupQueue.Len() +} + +// GetCachedFolderCount returns the cached count for a folder (for testing) +func (efc *EmptyFolderCleaner) GetCachedFolderCount(folder string) (int, bool) { + efc.mu.RLock() + defer efc.mu.RUnlock() + if state, exists := efc.folderCounts[folder]; exists { + return state.roughCount, true + } + return 0, false +} + diff --git a/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go b/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go new file mode 100644 index 000000000..fbc05ccf8 --- /dev/null +++ b/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go @@ -0,0 +1,569 @@ +package empty_folder_cleanup + +import ( + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager" + "github.com/seaweedfs/seaweedfs/weed/pb" +) + +func Test_isUnderPath(t *testing.T) { + tests := []struct { + name string + child string + parent string + expected bool + }{ + {"child under parent", "/buckets/mybucket/folder/file.txt", "/buckets", true}, + {"child is parent", "/buckets", "/buckets", true}, + {"child not under parent", "/other/path", "/buckets", false}, + {"empty parent", "/any/path", "", true}, + {"root parent", "/any/path", "/", true}, + {"parent with trailing slash", "/buckets/mybucket", "/buckets/", true}, + {"similar prefix but not under", "/buckets-other/file", "/buckets", false}, + {"deeply nested", "/buckets/a/b/c/d/e/f", "/buckets/a/b", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isUnderPath(tt.child, tt.parent) + if result != tt.expected { + t.Errorf("isUnderPath(%q, %q) = %v, want %v", tt.child, tt.parent, result, tt.expected) + } + }) + } +} + +func Test_isUnderBucketPath(t *testing.T) { + tests := []struct { + name string + directory string + bucketPath string + expected bool + }{ + // Should NOT process - bucket path itself + {"bucket path itself", "/buckets", "/buckets", false}, + // Should NOT process - bucket directory (immediate child) + {"bucket directory", "/buckets/mybucket", "/buckets", false}, + // Should process - folder inside bucket + {"folder in bucket", "/buckets/mybucket/folder", "/buckets", true}, + // Should process - nested folder + {"nested folder", "/buckets/mybucket/a/b/c", "/buckets", true}, + // Should NOT process - outside buckets + {"outside buckets", "/other/path", "/buckets", false}, + // Empty bucket path allows all + {"empty bucket path", "/any/path", "", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isUnderBucketPath(tt.directory, tt.bucketPath) + if result != tt.expected { + t.Errorf("isUnderBucketPath(%q, %q) = %v, want %v", tt.directory, tt.bucketPath, result, tt.expected) + } + }) + } +} + +func TestEmptyFolderCleaner_ownsFolder(t *testing.T) { + // Create a LockRing with multiple servers + lockRing := lock_manager.NewLockRing(5 * time.Second) + + servers := []pb.ServerAddress{ + "filer1:8888", + "filer2:8888", + "filer3:8888", + } + lockRing.SetSnapshot(servers) + + // Create cleaner for filer1 + cleaner1 := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + } + + // Create cleaner for filer2 + cleaner2 := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer2:8888", + } + + 
// Create cleaner for filer3 + cleaner3 := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer3:8888", + } + + // Test that exactly one filer owns each folder + testFolders := []string{ + "/buckets/mybucket/folder1", + "/buckets/mybucket/folder2", + "/buckets/mybucket/folder3", + "/buckets/mybucket/a/b/c", + "/buckets/otherbucket/x", + } + + for _, folder := range testFolders { + ownCount := 0 + if cleaner1.ownsFolder(folder) { + ownCount++ + } + if cleaner2.ownsFolder(folder) { + ownCount++ + } + if cleaner3.ownsFolder(folder) { + ownCount++ + } + + if ownCount != 1 { + t.Errorf("folder %q owned by %d filers, expected exactly 1", folder, ownCount) + } + } +} + +func TestEmptyFolderCleaner_ownsFolder_singleServer(t *testing.T) { + // Create a LockRing with a single server + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + } + + // Single filer should own all folders + testFolders := []string{ + "/buckets/mybucket/folder1", + "/buckets/mybucket/folder2", + "/buckets/otherbucket/x", + } + + for _, folder := range testFolders { + if !cleaner.ownsFolder(folder) { + t.Errorf("single filer should own folder %q", folder) + } + } +} + +func TestEmptyFolderCleaner_ownsFolder_emptyRing(t *testing.T) { + // Create an empty LockRing + lockRing := lock_manager.NewLockRing(5 * time.Second) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + } + + // With empty ring, should own all folders + if !cleaner.ownsFolder("/buckets/mybucket/folder") { + t.Error("should own folder with empty ring") + } +} + +func TestEmptyFolderCleaner_OnCreateEvent_cancelsCleanup(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate delete event + cleaner.OnDeleteEvent(folder, "file.txt", false, now) + + // Check that cleanup is queued + if cleaner.GetPendingCleanupCount() != 1 { + t.Errorf("expected 1 pending cleanup, got %d", cleaner.GetPendingCleanupCount()) + } + + // Simulate create event + cleaner.OnCreateEvent(folder, "newfile.txt", false) + + // Check that cleanup is cancelled + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("expected 0 pending cleanups after create, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_deduplication(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate multiple delete events for same folder + for i := 0; i < 5; i++ { + cleaner.OnDeleteEvent(folder, "file"+string(rune('0'+i))+".txt", false, now.Add(time.Duration(i)*time.Second)) + } + + // Check that only 1 cleanup is queued (deduplicated) + if 
cleaner.GetPendingCleanupCount() != 1 { + t.Errorf("expected 1 pending cleanup after deduplication, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_multipleFolders(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + now := time.Now() + + // Delete files in different folders + cleaner.OnDeleteEvent("/buckets/mybucket/folder1", "file.txt", false, now) + cleaner.OnDeleteEvent("/buckets/mybucket/folder2", "file.txt", false, now.Add(1*time.Second)) + cleaner.OnDeleteEvent("/buckets/mybucket/folder3", "file.txt", false, now.Add(2*time.Second)) + + // Each folder should be queued + if cleaner.GetPendingCleanupCount() != 3 { + t.Errorf("expected 3 pending cleanups, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_notOwner(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888", "filer2:8888"}) + + // Create cleaner for filer that doesn't own the folder + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + now := time.Now() + + // Try many folders, looking for one that filer1 doesn't own + foundNonOwned := false + for i := 0; i < 100; i++ { + folder := "/buckets/mybucket/folder" + string(rune('0'+i%10)) + string(rune('0'+i/10)) + if !cleaner.ownsFolder(folder) { + // This folder is not owned by filer1 + cleaner.OnDeleteEvent(folder, "file.txt", false, now) + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("non-owner should not queue cleanup for folder %s", folder) + } + foundNonOwned = true + break + } + } + + if !foundNonOwned { + t.Skip("could not find a folder not owned by filer1") + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_disabled(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: false, // Disabled + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate delete event + cleaner.OnDeleteEvent(folder, "file.txt", false, now) + + // Check that no cleanup is queued when disabled + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("disabled cleaner should not queue cleanup, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_directoryDeletion(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: 
make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate directory delete event - should trigger cleanup + // because subdirectory deletion also makes parent potentially empty + cleaner.OnDeleteEvent(folder, "subdir", true, now) + + // Check that cleanup IS queued for directory deletion + if cleaner.GetPendingCleanupCount() != 1 { + t.Errorf("directory deletion should trigger cleanup, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_cachedCounts(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + + // Initialize cached count + cleaner.folderCounts[folder] = &folderState{roughCount: 5} + + // Simulate create events + cleaner.OnCreateEvent(folder, "newfile1.txt", false) + cleaner.OnCreateEvent(folder, "newfile2.txt", false) + + // Check cached count increased + count, exists := cleaner.GetCachedFolderCount(folder) + if !exists { + t.Error("cached folder count should exist") + } + if count != 7 { + t.Errorf("expected cached count 7, got %d", count) + } + + // Simulate delete events + now := time.Now() + cleaner.OnDeleteEvent(folder, "file1.txt", false, now) + cleaner.OnDeleteEvent(folder, "file2.txt", false, now.Add(1*time.Second)) + + // Check cached count decreased + count, exists = cleaner.GetCachedFolderCount(folder) + if !exists { + t.Error("cached folder count should exist") + } + if count != 5 { + t.Errorf("expected cached count 5, got %d", count) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_Stop(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + now := time.Now() + + // Queue some cleanups + cleaner.OnDeleteEvent("/buckets/mybucket/folder1", "file1.txt", false, now) + cleaner.OnDeleteEvent("/buckets/mybucket/folder2", "file2.txt", false, now.Add(1*time.Second)) + cleaner.OnDeleteEvent("/buckets/mybucket/folder3", "file3.txt", false, now.Add(2*time.Second)) + + // Verify cleanups are queued + if cleaner.GetPendingCleanupCount() < 1 { + t.Error("expected at least 1 pending cleanup before stop") + } + + // Stop the cleaner + cleaner.Stop() + + // Verify all cleanups are cancelled + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("expected 0 pending cleanups after stop, got %d", cleaner.GetPendingCleanupCount()) + } +} + +func TestEmptyFolderCleaner_cacheEviction(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + cacheExpiry: 100 * time.Millisecond, // Short expiry for testing + stopCh: make(chan struct{}), + } + + folder1 := "/buckets/mybucket/folder1" + 
folder2 := "/buckets/mybucket/folder2" + folder3 := "/buckets/mybucket/folder3" + + // Add some cache entries with old timestamps + oldTime := time.Now().Add(-1 * time.Hour) + cleaner.folderCounts[folder1] = &folderState{roughCount: 5, lastCheck: oldTime} + cleaner.folderCounts[folder2] = &folderState{roughCount: 3, lastCheck: oldTime} + // folder3 has recent activity + cleaner.folderCounts[folder3] = &folderState{roughCount: 2, lastCheck: time.Now()} + + // Verify all entries exist + if len(cleaner.folderCounts) != 3 { + t.Errorf("expected 3 cache entries, got %d", len(cleaner.folderCounts)) + } + + // Run eviction + cleaner.evictStaleCacheEntries() + + // Verify stale entries are evicted + if len(cleaner.folderCounts) != 1 { + t.Errorf("expected 1 cache entry after eviction, got %d", len(cleaner.folderCounts)) + } + + // Verify the recent entry still exists + if _, exists := cleaner.folderCounts[folder3]; !exists { + t.Error("expected folder3 to still exist in cache") + } + + // Verify stale entries are removed + if _, exists := cleaner.folderCounts[folder1]; exists { + t.Error("expected folder1 to be evicted") + } + if _, exists := cleaner.folderCounts[folder2]; exists { + t.Error("expected folder2 to be evicted") + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_cacheEviction_skipsEntriesInQueue(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + cacheExpiry: 100 * time.Millisecond, + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/folder" + oldTime := time.Now().Add(-1 * time.Hour) + + // Add a stale cache entry + cleaner.folderCounts[folder] = &folderState{roughCount: 0, lastCheck: oldTime} + // Also add to cleanup queue + cleaner.cleanupQueue.Add(folder, time.Now()) + + // Run eviction + cleaner.evictStaleCacheEntries() + + // Verify entry is NOT evicted because it's in cleanup queue + if _, exists := cleaner.folderCounts[folder]; !exists { + t.Error("expected folder to still exist in cache (is in cleanup queue)") + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_queueFIFOOrder(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + now := time.Now() + + // Add folders in order + folders := []string{ + "/buckets/mybucket/folder1", + "/buckets/mybucket/folder2", + "/buckets/mybucket/folder3", + } + for i, folder := range folders { + cleaner.OnDeleteEvent(folder, "file.txt", false, now.Add(time.Duration(i)*time.Second)) + } + + // Verify queue length + if cleaner.GetPendingCleanupCount() != 3 { + t.Errorf("expected 3 queued folders, got %d", cleaner.GetPendingCleanupCount()) + } + + // Verify time-sorted order by popping + for i, expected := range folders { + folder, ok := cleaner.cleanupQueue.Pop() + if !ok || folder != expected { + t.Errorf("expected folder %s at index %d, got %s", expected, i, folder) + } + } + + cleaner.Stop() +} + diff --git a/weed/filer/filer.go b/weed/filer/filer.go index f9f3d4fb2..382eb644f 
100644 --- a/weed/filer/filer.go +++ b/weed/filer/filer.go @@ -11,6 +11,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/s3api/s3bucket" "github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager" + "github.com/seaweedfs/seaweedfs/weed/filer/empty_folder_cleanup" "github.com/seaweedfs/seaweedfs/weed/cluster" "github.com/seaweedfs/seaweedfs/weed/pb" @@ -56,6 +57,7 @@ type Filer struct { MaxFilenameLength uint32 deletionQuit chan struct{} DeletionRetryQueue *DeletionRetryQueue + EmptyFolderCleaner *empty_folder_cleanup.EmptyFolderCleaner } func NewFiler(masters pb.ServerDiscovery, grpcDialOption grpc.DialOption, filerHost pb.ServerAddress, filerGroup string, collection string, replication string, dataCenter string, maxFilenameLength uint32, notifyFn func()) *Filer { @@ -116,6 +118,9 @@ func (f *Filer) AggregateFromPeers(self pb.ServerAddress, existingNodes []*maste f.Dlm.LockRing.SetSnapshot(snapshot) glog.V(0).Infof("%s aggregate from peers %+v", self, snapshot) + // Initialize the empty folder cleaner using the same LockRing as Dlm for consistent hashing + f.EmptyFolderCleaner = empty_folder_cleanup.NewEmptyFolderCleaner(f, f.Dlm.LockRing, self, f.DirBucketsPath) + f.MetaAggregator = NewMetaAggregator(f, self, f.GrpcDialOption) f.MasterClient.SetOnPeerUpdateFn(func(update *master_pb.ClusterNodeUpdate, startFrom time.Time) { if update.NodeType != cluster.FilerType { @@ -506,6 +511,9 @@ func (f *Filer) IsDirectoryEmpty(ctx context.Context, dirPath util.FullPath) (bo func (f *Filer) Shutdown() { close(f.deletionQuit) + if f.EmptyFolderCleaner != nil { + f.EmptyFolderCleaner.Stop() + } f.LocalMetaLogBuffer.ShutdownLogBuffer() f.Store.Shutdown() } diff --git a/weed/filer/filer_notify.go b/weed/filer/filer_notify.go index 845a0678e..45c9b070f 100644 --- a/weed/filer/filer_notify.go +++ b/weed/filer/filer_notify.go @@ -66,6 +66,10 @@ func (f *Filer) NotifyUpdateEvent(ctx context.Context, oldEntry, newEntry *Entry f.logMetaEvent(ctx, fullpath, eventNotification) + // Trigger empty folder cleanup for local events + // Remote events are handled via MetaAggregator.onMetadataChangeEvent + f.triggerLocalEmptyFolderCleanup(oldEntry, newEntry) + } func (f *Filer) logMetaEvent(ctx context.Context, fullpath string, eventNotification *filer_pb.EventNotification) { @@ -89,6 +93,41 @@ func (f *Filer) logMetaEvent(ctx context.Context, fullpath string, eventNotifica } +// triggerLocalEmptyFolderCleanup triggers empty folder cleanup for local events +// This is needed because onMetadataChangeEvent is only called for remote peer events +func (f *Filer) triggerLocalEmptyFolderCleanup(oldEntry, newEntry *Entry) { + if f.EmptyFolderCleaner == nil || !f.EmptyFolderCleaner.IsEnabled() { + return + } + + eventTime := time.Now() + + // Handle delete events (oldEntry exists, newEntry is nil) + if oldEntry != nil && newEntry == nil { + dir, name := oldEntry.FullPath.DirAndName() + f.EmptyFolderCleaner.OnDeleteEvent(dir, name, oldEntry.IsDirectory(), eventTime) + } + + // Handle create events (oldEntry is nil, newEntry exists) + if oldEntry == nil && newEntry != nil { + dir, name := newEntry.FullPath.DirAndName() + f.EmptyFolderCleaner.OnCreateEvent(dir, name, newEntry.IsDirectory()) + } + + // Handle rename/move events (both exist but paths differ) + if oldEntry != nil && newEntry != nil { + oldDir, oldName := oldEntry.FullPath.DirAndName() + newDir, newName := newEntry.FullPath.DirAndName() + + if oldDir != newDir || oldName != newName { + // Treat old location as delete + f.EmptyFolderCleaner.OnDeleteEvent(oldDir, 
oldName, oldEntry.IsDirectory(), eventTime) + // Treat new location as create + f.EmptyFolderCleaner.OnCreateEvent(newDir, newName, newEntry.IsDirectory()) + } + } +} + func (f *Filer) logFlushFunc(logBuffer *log_buffer.LogBuffer, startTime, stopTime time.Time, buf []byte, minOffset, maxOffset int64) { if len(buf) == 0 { diff --git a/weed/filer/filer_on_meta_event.go b/weed/filer/filer_on_meta_event.go index acbf4aa47..4ee80b3a6 100644 --- a/weed/filer/filer_on_meta_event.go +++ b/weed/filer/filer_on_meta_event.go @@ -2,6 +2,7 @@ package filer import ( "bytes" + "time" "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb" @@ -13,6 +14,7 @@ func (f *Filer) onMetadataChangeEvent(event *filer_pb.SubscribeMetadataResponse) f.maybeReloadFilerConfiguration(event) f.maybeReloadRemoteStorageConfigurationAndMapping(event) f.onBucketEvents(event) + f.onEmptyFolderCleanupEvents(event) } func (f *Filer) onBucketEvents(event *filer_pb.SubscribeMetadataResponse) { @@ -32,6 +34,43 @@ func (f *Filer) onBucketEvents(event *filer_pb.SubscribeMetadataResponse) { } } +// onEmptyFolderCleanupEvents handles create/delete events for empty folder cleanup +func (f *Filer) onEmptyFolderCleanupEvents(event *filer_pb.SubscribeMetadataResponse) { + if f.EmptyFolderCleaner == nil || !f.EmptyFolderCleaner.IsEnabled() { + return + } + + message := event.EventNotification + directory := event.Directory + eventTime := time.Unix(0, event.TsNs) + + // Handle delete events - trigger folder cleanup check + if filer_pb.IsDelete(event) && message.OldEntry != nil { + f.EmptyFolderCleaner.OnDeleteEvent(directory, message.OldEntry.Name, message.OldEntry.IsDirectory, eventTime) + } + + // Handle create events - cancel pending cleanup for the folder + if filer_pb.IsCreate(event) && message.NewEntry != nil { + f.EmptyFolderCleaner.OnCreateEvent(directory, message.NewEntry.Name, message.NewEntry.IsDirectory) + } + + // Handle rename/move events + if filer_pb.IsRename(event) { + // Treat the old location as a delete + if message.OldEntry != nil { + f.EmptyFolderCleaner.OnDeleteEvent(directory, message.OldEntry.Name, message.OldEntry.IsDirectory, eventTime) + } + // Treat the new location as a create + if message.NewEntry != nil { + newDir := message.NewParentPath + if newDir == "" { + newDir = directory + } + f.EmptyFolderCleaner.OnCreateEvent(newDir, message.NewEntry.Name, message.NewEntry.IsDirectory) + } + } +} + func (f *Filer) maybeReloadFilerConfiguration(event *filer_pb.SubscribeMetadataResponse) { if DirectoryEtcSeaweedFS != event.Directory { if DirectoryEtcSeaweedFS != event.EventNotification.NewParentPath { diff --git a/weed/filer/filer_search.go b/weed/filer/filer_search.go index 294fc0e7f..e6366e82f 100644 --- a/weed/filer/filer_search.go +++ b/weed/filer/filer_search.go @@ -41,6 +41,19 @@ func (f *Filer) ListDirectoryEntries(ctx context.Context, p util.FullPath, start return entries, hasMore, err } +// CountDirectoryEntries counts entries in a directory up to limit +func (f *Filer) CountDirectoryEntries(ctx context.Context, p util.FullPath, limit int) (count int, err error) { + entries, hasMore, err := f.ListDirectoryEntries(ctx, p, "", false, int64(limit), "", "", "") + if err != nil { + return 0, err + } + count = len(entries) + if hasMore { + count = limit // At least this many + } + return count, nil +} + // For now, prefix and namePattern are mutually exclusive func (f *Filer) StreamListDirectoryEntries(ctx context.Context, p util.FullPath, startFileName string, inclusive bool, 
limit int64, prefix string, namePattern string, namePatternExclude string, eachEntryFunc ListEachEntryFunc) (lastFileName string, err error) { if strings.HasSuffix(string(p), "/") && len(p) > 1 { diff --git a/weed/filer/reader_at.go b/weed/filer/reader_at.go index 93fa76a2e..5e8fd6154 100644 --- a/weed/filer/reader_at.go +++ b/weed/filer/reader_at.go @@ -7,6 +7,8 @@ import ( "math/rand" "sync" + "golang.org/x/sync/errgroup" + "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb" "github.com/seaweedfs/seaweedfs/weed/util" @@ -19,6 +21,11 @@ import ( // the prefetch count is derived from the -concurrentReaders option. const DefaultPrefetchCount = 4 +// minReadConcurrency is the minimum number of parallel chunk fetches. +// This ensures at least some parallelism even when prefetchCount is low, +// improving throughput for reads spanning multiple chunks. +const minReadConcurrency = 4 + type ChunkReadAt struct { masterClient *wdclient.MasterClient chunkViews *IntervalList[*ChunkView] @@ -175,67 +182,139 @@ func (c *ChunkReadAt) ReadAtWithTime(ctx context.Context, p []byte, offset int64 return c.doReadAt(ctx, p, offset) } +// chunkReadTask represents a single chunk read operation for parallel processing +type chunkReadTask struct { + chunk *ChunkView + bufferStart int64 // start position in the output buffer + bufferEnd int64 // end position in the output buffer + chunkOffset uint64 // offset within the chunk to read from + bytesRead int + modifiedTsNs int64 +} + func (c *ChunkReadAt) doReadAt(ctx context.Context, p []byte, offset int64) (n int, ts int64, err error) { + // Collect all chunk read tasks + var tasks []*chunkReadTask + var gaps []struct{ start, length int64 } // gaps that need zero-filling + startOffset, remaining := offset, int64(len(p)) - var nextChunks *Interval[*ChunkView] + var lastChunk *Interval[*ChunkView] + for x := c.chunkViews.Front(); x != nil; x = x.Next { chunk := x.Value if remaining <= 0 { break } - if x.Next != nil { - nextChunks = x.Next - } + lastChunk = x + + // Handle gap before this chunk if startOffset < chunk.ViewOffset { gap := chunk.ViewOffset - startOffset - glog.V(4).Infof("zero [%d,%d)", startOffset, chunk.ViewOffset) - n += zero(p, startOffset-offset, gap) + gaps = append(gaps, struct{ start, length int64 }{startOffset - offset, gap}) startOffset, remaining = chunk.ViewOffset, remaining-gap if remaining <= 0 { break } } - // fmt.Printf(">>> doReadAt [%d,%d), chunk[%d,%d)\n", offset, offset+int64(len(p)), chunk.ViewOffset, chunk.ViewOffset+int64(chunk.ViewSize)) + chunkStart, chunkStop := max(chunk.ViewOffset, startOffset), min(chunk.ViewOffset+int64(chunk.ViewSize), startOffset+remaining) if chunkStart >= chunkStop { continue } - // glog.V(4).Infof("read [%d,%d), %d/%d chunk %s [%d,%d)", chunkStart, chunkStop, i, len(c.chunkViews), chunk.FileId, chunk.ViewOffset-chunk.Offset, chunk.ViewOffset-chunk.Offset+int64(chunk.ViewSize)) + bufferOffset := chunkStart - chunk.ViewOffset + chunk.OffsetInChunk - ts = chunk.ModifiedTsNs - copied, err := c.readChunkSliceAt(ctx, p[startOffset-offset:chunkStop-chunkStart+startOffset-offset], chunk, nextChunks, uint64(bufferOffset)) - if err != nil { - glog.Errorf("fetching chunk %+v: %v\n", chunk, err) - return copied, ts, err + tasks = append(tasks, &chunkReadTask{ + chunk: chunk, + bufferStart: startOffset - offset, + bufferEnd: chunkStop - chunkStart + startOffset - offset, + chunkOffset: uint64(bufferOffset), + }) + + startOffset, remaining = chunkStop, 
remaining-(chunkStop-chunkStart) + } + + // Zero-fill gaps + for _, gap := range gaps { + glog.V(4).Infof("zero [%d,%d)", offset+gap.start, offset+gap.start+gap.length) + n += zero(p, gap.start, gap.length) + } + + // If only one chunk or random access mode, use sequential reading + if len(tasks) <= 1 || c.readerPattern.IsRandomMode() { + for _, task := range tasks { + copied, readErr := c.readChunkSliceAt(ctx, p[task.bufferStart:task.bufferEnd], task.chunk, nil, task.chunkOffset) + ts = max(ts, task.chunk.ModifiedTsNs) + if readErr != nil { + glog.Errorf("fetching chunk %+v: %v\n", task.chunk, readErr) + return n + copied, ts, readErr + } + n += copied + } + } else { + // Parallel chunk fetching for multiple chunks + // This significantly improves throughput when chunks are on different volume servers + g, gCtx := errgroup.WithContext(ctx) + + // Limit concurrency to avoid overwhelming the system + concurrency := c.prefetchCount + if concurrency < minReadConcurrency { + concurrency = minReadConcurrency + } + if concurrency > len(tasks) { + concurrency = len(tasks) + } + g.SetLimit(concurrency) + + for _, task := range tasks { + g.Go(func() error { + // Read directly into the correct position in the output buffer + copied, readErr := c.readChunkSliceAtForParallel(gCtx, p[task.bufferStart:task.bufferEnd], task.chunk, task.chunkOffset) + task.bytesRead = copied + task.modifiedTsNs = task.chunk.ModifiedTsNs + return readErr + }) } - n += copied - startOffset, remaining = startOffset+int64(copied), remaining-int64(copied) + // Wait for all chunk reads to complete + if waitErr := g.Wait(); waitErr != nil { + err = waitErr + } + + // Aggregate results (order is preserved since we read directly into buffer positions) + for _, task := range tasks { + n += task.bytesRead + ts = max(ts, task.modifiedTsNs) + } + + if err != nil { + return n, ts, err + } } - // glog.V(4).Infof("doReadAt [%d,%d), n:%v, err:%v", offset, offset+int64(len(p)), n, err) + // Trigger prefetch for sequential reads + if lastChunk != nil && lastChunk.Next != nil && c.prefetchCount > 0 && !c.readerPattern.IsRandomMode() { + c.readerCache.MaybeCache(lastChunk.Next, c.prefetchCount) + } - // zero the remaining bytes if a gap exists at the end of the last chunk (or a fully sparse file) - if err == nil && remaining > 0 { + // Zero the remaining bytes if a gap exists at the end + if remaining > 0 { var delta int64 if c.fileSize >= startOffset { delta = min(remaining, c.fileSize-startOffset) - startOffset -= offset - } - if delta > 0 { - glog.V(4).Infof("zero2 [%d,%d) of file size %d bytes", startOffset, startOffset+delta, c.fileSize) - n += zero(p, startOffset, delta) + bufStart := startOffset - offset + if delta > 0 { + glog.V(4).Infof("zero2 [%d,%d) of file size %d bytes", startOffset, startOffset+delta, c.fileSize) + n += zero(p, bufStart, delta) + } } } if err == nil && offset+int64(len(p)) >= c.fileSize { err = io.EOF } - // fmt.Printf("~~~ filled %d, err: %v\n\n", n, err) return - } func (c *ChunkReadAt) readChunkSliceAt(ctx context.Context, buffer []byte, chunkView *ChunkView, nextChunkViews *Interval[*ChunkView], offset uint64) (n int, err error) { @@ -249,7 +328,7 @@ func (c *ChunkReadAt) readChunkSliceAt(ctx context.Context, buffer []byte, chunk } shouldCache := (uint64(chunkView.ViewOffset) + chunkView.ChunkSize) <= c.readerCache.chunkCache.GetMaxFilePartSizeInCache() - n, err = c.readerCache.ReadChunkAt(buffer, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset), int(chunkView.ChunkSize), 
shouldCache) + n, err = c.readerCache.ReadChunkAt(ctx, buffer, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset), int(chunkView.ChunkSize), shouldCache) if c.lastChunkFid != chunkView.FileId { if chunkView.OffsetInChunk == 0 { // start of a new chunk if c.lastChunkFid != "" { @@ -266,6 +345,13 @@ func (c *ChunkReadAt) readChunkSliceAt(ctx context.Context, buffer []byte, chunk return } +// readChunkSliceAtForParallel is a simplified version for parallel chunk fetching +// It doesn't update lastChunkFid or trigger prefetch (handled by the caller) +func (c *ChunkReadAt) readChunkSliceAtForParallel(ctx context.Context, buffer []byte, chunkView *ChunkView, offset uint64) (n int, err error) { + shouldCache := (uint64(chunkView.ViewOffset) + chunkView.ChunkSize) <= c.readerCache.chunkCache.GetMaxFilePartSizeInCache() + return c.readerCache.ReadChunkAt(ctx, buffer, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset), int(chunkView.ChunkSize), shouldCache) +} + func zero(buffer []byte, start, length int64) int { if length <= 0 { return 0 diff --git a/weed/filer/reader_cache.go b/weed/filer/reader_cache.go index 605be5e73..66cbac1e3 100644 --- a/weed/filer/reader_cache.go +++ b/weed/filer/reader_cache.go @@ -35,6 +35,7 @@ type SingleChunkCacher struct { shouldCache bool wg sync.WaitGroup cacheStartedCh chan struct{} + done chan struct{} // signals when download is complete } func NewReaderCache(limit int, chunkCache chunk_cache.ChunkCache, lookupFileIdFn wdclient.LookupFileIdFunctionType) *ReaderCache { @@ -93,14 +94,18 @@ func (rc *ReaderCache) MaybeCache(chunkViews *Interval[*ChunkView], count int) { return } -func (rc *ReaderCache) ReadChunkAt(buffer []byte, fileId string, cipherKey []byte, isGzipped bool, offset int64, chunkSize int, shouldCache bool) (int, error) { +func (rc *ReaderCache) ReadChunkAt(ctx context.Context, buffer []byte, fileId string, cipherKey []byte, isGzipped bool, offset int64, chunkSize int, shouldCache bool) (int, error) { rc.Lock() if cacher, found := rc.downloaders[fileId]; found { - if n, err := cacher.readChunkAt(buffer, offset); n != 0 && err == nil { - rc.Unlock() + rc.Unlock() + n, err := cacher.readChunkAt(ctx, buffer, offset) + if n > 0 || err != nil { return n, err } + // If n=0 and err=nil, the cacher couldn't provide data for this offset. + // Fall through to try chunkCache. + rc.Lock() } if shouldCache || rc.lookupFileIdFn == nil { n, err := rc.chunkCache.ReadChunkAt(buffer, fileId, uint64(offset)) @@ -134,7 +139,7 @@ func (rc *ReaderCache) ReadChunkAt(buffer []byte, fileId string, cipherKey []byt rc.downloaders[fileId] = cacher rc.Unlock() - return cacher.readChunkAt(buffer, offset) + return cacher.readChunkAt(ctx, buffer, offset) } func (rc *ReaderCache) UnCache(fileId string) { @@ -166,38 +171,53 @@ func newSingleChunkCacher(parent *ReaderCache, fileId string, cipherKey []byte, chunkSize: chunkSize, shouldCache: shouldCache, cacheStartedCh: make(chan struct{}), + done: make(chan struct{}), } } +// startCaching downloads the chunk data in the background. +// It does NOT hold the lock during the HTTP download to allow concurrent readers +// to wait efficiently using the done channel. 
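+// The coordination in miniature (an illustrative, self-contained sketch; fetch
+// here is a stand-in for the HTTP download, not a function in this package):
+//
+//	done := make(chan struct{})
+//	var data []byte
+//	go func() {        // the shared download; deliberately never cancelled
+//		data = fetch()
+//		close(done)    // wakes every waiter at once
+//	}()
+//	select {
+//	case <-done:       // data (or a recorded error) is ready for all readers
+//	case <-ctx.Done(): // only this reader stops waiting
+//	}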
func (s *SingleChunkCacher) startCaching() { s.wg.Add(1) defer s.wg.Done() - s.Lock() - defer s.Unlock() + defer close(s.done) // guarantee completion signal even on panic - s.cacheStartedCh <- struct{}{} // means this has been started + s.cacheStartedCh <- struct{}{} // signal that we've started + // Note: We intentionally use context.Background() here, NOT a request-specific context. + // The downloaded chunk is a shared resource - multiple concurrent readers may be waiting + // for this same download to complete. If we used a request context and that request was + // cancelled, it would abort the download and cause errors for all other waiting readers. + // The download should always complete once started to serve all potential consumers. + + // Lookup file ID without holding the lock urlStrings, err := s.parent.lookupFileIdFn(context.Background(), s.chunkFileId) if err != nil { + s.Lock() s.err = fmt.Errorf("operation LookupFileId %s failed, err: %v", s.chunkFileId, err) + s.Unlock() return } - s.data = mem.Allocate(s.chunkSize) - - _, s.err = util_http.RetriedFetchChunkData(context.Background(), s.data, urlStrings, s.cipherKey, s.isGzipped, true, 0, s.chunkFileId) - if s.err != nil { - mem.Free(s.data) - s.data = nil - return - } + // Allocate buffer and download without holding the lock + // This allows multiple downloads to proceed in parallel + data := mem.Allocate(s.chunkSize) + _, fetchErr := util_http.RetriedFetchChunkData(context.Background(), data, urlStrings, s.cipherKey, s.isGzipped, true, 0, s.chunkFileId) - if s.shouldCache { - s.parent.chunkCache.SetChunk(s.chunkFileId, s.data) + // Now acquire lock to update state + s.Lock() + if fetchErr != nil { + mem.Free(data) + s.err = fetchErr + } else { + s.data = data + if s.shouldCache { + s.parent.chunkCache.SetChunk(s.chunkFileId, s.data) + } + atomic.StoreInt64(&s.completedTimeNew, time.Now().UnixNano()) } - atomic.StoreInt64(&s.completedTimeNew, time.Now().UnixNano()) - - return + s.Unlock() } func (s *SingleChunkCacher) destroy() { @@ -209,13 +229,34 @@ func (s *SingleChunkCacher) destroy() { if s.data != nil { mem.Free(s.data) s.data = nil - close(s.cacheStartedCh) } } -func (s *SingleChunkCacher) readChunkAt(buf []byte, offset int64) (int, error) { +// readChunkAt reads data from the cached chunk. +// It waits for the download to complete if it's still in progress. +// The ctx parameter allows the reader to cancel its wait (but the download continues +// for other readers - see comment in startCaching about shared resource semantics). +func (s *SingleChunkCacher) readChunkAt(ctx context.Context, buf []byte, offset int64) (int, error) { s.wg.Add(1) defer s.wg.Done() + + // Wait for download to complete, but allow reader cancellation. + // Prioritize checking done first - if data is already available, + // return it even if context is also cancelled. 
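+ // (A single flat select would not give that priority: when both s.done and
+ // ctx.Done() are ready, Go picks among ready cases pseudo-randomly, so an
+ // already-finished download could still be reported as a cancellation. The
+ // non-blocking check below removes that ambiguity.)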
+ select { + case <-s.done: + // Download already completed, proceed immediately + default: + // Download not complete, wait for it or context cancellation + select { + case <-s.done: + // Download completed + case <-ctx.Done(): + // Reader cancelled while waiting - download continues for other readers + return 0, ctx.Err() + } + } + s.Lock() defer s.Unlock() @@ -228,5 +269,4 @@ func (s *SingleChunkCacher) readChunkAt(buf []byte, offset int64) (int, error) { } return copy(buf, s.data[offset:]), nil - } diff --git a/weed/filer/reader_cache_test.go b/weed/filer/reader_cache_test.go new file mode 100644 index 000000000..0480de8a7 --- /dev/null +++ b/weed/filer/reader_cache_test.go @@ -0,0 +1,505 @@ +package filer + +import ( + "context" + "fmt" + "sync" + "sync/atomic" + "testing" + "time" +) + +// mockChunkCacheForReaderCache implements chunk cache for testing +type mockChunkCacheForReaderCache struct { + data map[string][]byte + hitCount int32 + mu sync.Mutex +} + +func newMockChunkCacheForReaderCache() *mockChunkCacheForReaderCache { + return &mockChunkCacheForReaderCache{ + data: make(map[string][]byte), + } +} + +func (m *mockChunkCacheForReaderCache) GetChunk(fileId string, minSize uint64) []byte { + m.mu.Lock() + defer m.mu.Unlock() + if d, ok := m.data[fileId]; ok { + atomic.AddInt32(&m.hitCount, 1) + return d + } + return nil +} + +func (m *mockChunkCacheForReaderCache) ReadChunkAt(data []byte, fileId string, offset uint64) (int, error) { + m.mu.Lock() + defer m.mu.Unlock() + if d, ok := m.data[fileId]; ok && int(offset) < len(d) { + atomic.AddInt32(&m.hitCount, 1) + n := copy(data, d[offset:]) + return n, nil + } + return 0, nil +} + +func (m *mockChunkCacheForReaderCache) SetChunk(fileId string, data []byte) { + m.mu.Lock() + defer m.mu.Unlock() + m.data[fileId] = data +} + +func (m *mockChunkCacheForReaderCache) GetMaxFilePartSizeInCache() uint64 { + return 1024 * 1024 // 1MB +} + +func (m *mockChunkCacheForReaderCache) IsInCache(fileId string, lockNeeded bool) bool { + m.mu.Lock() + defer m.mu.Unlock() + _, ok := m.data[fileId] + return ok +} + +// TestReaderCacheContextCancellation tests that a reader can cancel its wait +// while the download continues for other readers +func TestReaderCacheContextCancellation(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + + // Create a ReaderCache - we can't easily test the full flow without mocking HTTP, + // but we can test the context cancellation in readChunkAt + rc := NewReaderCache(10, cache, nil) + defer rc.destroy() + + // Pre-populate cache to avoid HTTP calls + testData := []byte("test data for context cancellation") + cache.SetChunk("test-file-1", testData) + + // Test that context cancellation works + ctx, cancel := context.WithCancel(context.Background()) + + buffer := make([]byte, len(testData)) + n, err := rc.ReadChunkAt(ctx, buffer, "test-file-1", nil, false, 0, len(testData), true) + if err != nil { + t.Errorf("Expected no error, got: %v", err) + } + if n != len(testData) { + t.Errorf("Expected %d bytes, got %d", len(testData), n) + } + + // Cancel context and verify it doesn't affect already completed reads + cancel() + + // Subsequent read with cancelled context should still work from cache + buffer2 := make([]byte, len(testData)) + n2, err2 := rc.ReadChunkAt(ctx, buffer2, "test-file-1", nil, false, 0, len(testData), true) + // Note: This may or may not error depending on whether it hits cache + _ = n2 + _ = err2 +} + +// TestReaderCacheFallbackToChunkCache tests that when a cacher returns n=0, err=nil, +// 
we fall back to the chunkCache +func TestReaderCacheFallbackToChunkCache(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + + // Pre-populate the chunk cache with data + testData := []byte("fallback test data that should be found in chunk cache") + cache.SetChunk("fallback-file", testData) + + rc := NewReaderCache(10, cache, nil) + defer rc.destroy() + + // Read should hit the chunk cache + buffer := make([]byte, len(testData)) + n, err := rc.ReadChunkAt(context.Background(), buffer, "fallback-file", nil, false, 0, len(testData), true) + + if err != nil { + t.Errorf("Expected no error, got: %v", err) + } + if n != len(testData) { + t.Errorf("Expected %d bytes, got %d", len(testData), n) + } + + // Verify cache was hit + if cache.hitCount == 0 { + t.Error("Expected chunk cache to be hit") + } +} + +// TestReaderCacheMultipleReadersWaitForSameChunk tests that multiple readers +// can wait for the same chunk download to complete +func TestReaderCacheMultipleReadersWaitForSameChunk(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + + // Pre-populate cache so we don't need HTTP + testData := make([]byte, 1024) + for i := range testData { + testData[i] = byte(i % 256) + } + cache.SetChunk("shared-chunk", testData) + + rc := NewReaderCache(10, cache, nil) + defer rc.destroy() + + // Launch multiple concurrent readers for the same chunk + numReaders := 10 + var wg sync.WaitGroup + errors := make(chan error, numReaders) + bytesRead := make(chan int, numReaders) + + for i := 0; i < numReaders; i++ { + wg.Add(1) + go func() { + defer wg.Done() + buffer := make([]byte, len(testData)) + n, err := rc.ReadChunkAt(context.Background(), buffer, "shared-chunk", nil, false, 0, len(testData), true) + if err != nil { + errors <- err + } + bytesRead <- n + }() + } + + wg.Wait() + close(errors) + close(bytesRead) + + // Check for errors + for err := range errors { + t.Errorf("Reader got error: %v", err) + } + + // Verify all readers got the expected data + for n := range bytesRead { + if n != len(testData) { + t.Errorf("Expected %d bytes, got %d", len(testData), n) + } + } +} + +// TestReaderCachePartialRead tests reading at different offsets +func TestReaderCachePartialRead(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + + testData := []byte("0123456789ABCDEFGHIJ") + cache.SetChunk("partial-read-file", testData) + + rc := NewReaderCache(10, cache, nil) + defer rc.destroy() + + tests := []struct { + name string + offset int64 + size int + expected []byte + }{ + {"read from start", 0, 5, []byte("01234")}, + {"read from middle", 5, 5, []byte("56789")}, + {"read to end", 15, 5, []byte("FGHIJ")}, + {"read single byte", 10, 1, []byte("A")}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + buffer := make([]byte, tt.size) + n, err := rc.ReadChunkAt(context.Background(), buffer, "partial-read-file", nil, false, tt.offset, len(testData), true) + + if err != nil { + t.Errorf("Expected no error, got: %v", err) + } + if n != tt.size { + t.Errorf("Expected %d bytes, got %d", tt.size, n) + } + if string(buffer[:n]) != string(tt.expected) { + t.Errorf("Expected %q, got %q", tt.expected, buffer[:n]) + } + }) + } +} + +// TestReaderCacheCleanup tests that old downloaders are cleaned up +func TestReaderCacheCleanup(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + + // Create cache with limit of 3 + rc := NewReaderCache(3, cache, nil) + defer rc.destroy() + + // Add data for multiple files + for i := 0; i < 5; i++ { + fileId := string(rune('A' + i)) + 
data := []byte("data for file " + fileId) + cache.SetChunk(fileId, data) + } + + // Read from multiple files - should trigger cleanup when exceeding limit + for i := 0; i < 5; i++ { + fileId := string(rune('A' + i)) + buffer := make([]byte, 20) + _, err := rc.ReadChunkAt(context.Background(), buffer, fileId, nil, false, 0, 20, true) + if err != nil { + t.Errorf("Read error for file %s: %v", fileId, err) + } + } + + // Cache should still work - reads should succeed + for i := 0; i < 5; i++ { + fileId := string(rune('A' + i)) + buffer := make([]byte, 20) + n, err := rc.ReadChunkAt(context.Background(), buffer, fileId, nil, false, 0, 20, true) + if err != nil { + t.Errorf("Second read error for file %s: %v", fileId, err) + } + if n == 0 { + t.Errorf("Expected data for file %s, got 0 bytes", fileId) + } + } +} + +// TestSingleChunkCacherDoneSignal tests that done channel is always closed +func TestSingleChunkCacherDoneSignal(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + rc := NewReaderCache(10, cache, nil) + defer rc.destroy() + + // Test that we can read even when data is in cache (done channel should work) + testData := []byte("done signal test") + cache.SetChunk("done-signal-test", testData) + + // Multiple goroutines reading same chunk + var wg sync.WaitGroup + for i := 0; i < 5; i++ { + wg.Add(1) + go func() { + defer wg.Done() + buffer := make([]byte, len(testData)) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + n, err := rc.ReadChunkAt(ctx, buffer, "done-signal-test", nil, false, 0, len(testData), true) + if err != nil && err != context.DeadlineExceeded { + t.Errorf("Unexpected error: %v", err) + } + if n == 0 && err == nil { + t.Error("Got 0 bytes with no error") + } + }() + } + + // Should complete without hanging + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + + select { + case <-done: + // Success + case <-time.After(10 * time.Second): + t.Fatal("Test timed out - done channel may not be signaled correctly") + } +} + +// ============================================================================ +// Tests that exercise SingleChunkCacher concurrency logic +// ============================================================================ +// +// These tests use blocking lookupFileIdFn to exercise the wait/cancellation +// logic in SingleChunkCacher without requiring HTTP calls. + +// TestSingleChunkCacherLookupError tests handling of lookup errors +func TestSingleChunkCacherLookupError(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + + // Lookup function that returns an error + lookupFn := func(ctx context.Context, fileId string) ([]string, error) { + return nil, fmt.Errorf("lookup failed for %s", fileId) + } + + rc := NewReaderCache(10, cache, lookupFn) + defer rc.destroy() + + buffer := make([]byte, 100) + _, err := rc.ReadChunkAt(context.Background(), buffer, "error-test", nil, false, 0, 100, true) + + if err == nil { + t.Error("Expected an error, got nil") + } +} + +// TestSingleChunkCacherContextCancellationDuringLookup tests that a reader can +// cancel its wait while the lookup is in progress. This exercises the actual +// SingleChunkCacher wait/cancel logic. 
+func TestSingleChunkCacherContextCancellationDuringLookup(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + lookupStarted := make(chan struct{}) + lookupCanFinish := make(chan struct{}) + + // Lookup function that blocks to simulate slow operation + lookupFn := func(ctx context.Context, fileId string) ([]string, error) { + close(lookupStarted) + <-lookupCanFinish // Block until test allows completion + return nil, fmt.Errorf("lookup completed but reader should have cancelled") + } + + rc := NewReaderCache(10, cache, lookupFn) + defer rc.destroy() + defer close(lookupCanFinish) // Ensure cleanup + + ctx, cancel := context.WithCancel(context.Background()) + readResult := make(chan error, 1) + + go func() { + buffer := make([]byte, 100) + _, err := rc.ReadChunkAt(ctx, buffer, "cancel-during-lookup", nil, false, 0, 100, true) + readResult <- err + }() + + // Wait for lookup to start, then cancel the reader's context + select { + case <-lookupStarted: + cancel() // Cancel the reader while lookup is blocked + case <-time.After(5 * time.Second): + t.Fatal("Lookup never started") + } + + // Read should return with context.Canceled + select { + case err := <-readResult: + if err != context.Canceled { + t.Errorf("Expected context.Canceled, got: %v", err) + } + case <-time.After(5 * time.Second): + t.Fatal("Read did not complete after context cancellation") + } +} + +// TestSingleChunkCacherMultipleReadersWaitForDownload tests that multiple readers +// can wait for the same SingleChunkCacher download to complete. When lookup fails, +// all readers should receive the same error. +func TestSingleChunkCacherMultipleReadersWaitForDownload(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + lookupStarted := make(chan struct{}) + lookupCanFinish := make(chan struct{}) + var lookupStartedOnce sync.Once + + // Lookup function that blocks to simulate slow operation + lookupFn := func(ctx context.Context, fileId string) ([]string, error) { + lookupStartedOnce.Do(func() { close(lookupStarted) }) + <-lookupCanFinish + return nil, fmt.Errorf("simulated lookup error") + } + + rc := NewReaderCache(10, cache, lookupFn) + defer rc.destroy() + + numReaders := 5 + var wg sync.WaitGroup + errors := make(chan error, numReaders) + + // Start multiple readers for the same chunk + for i := 0; i < numReaders; i++ { + wg.Add(1) + go func() { + defer wg.Done() + buffer := make([]byte, 100) + _, err := rc.ReadChunkAt(context.Background(), buffer, "shared-chunk", nil, false, 0, 100, true) + errors <- err + }() + } + + // Wait for lookup to start, then allow completion + select { + case <-lookupStarted: + close(lookupCanFinish) + case <-time.After(5 * time.Second): + close(lookupCanFinish) + t.Fatal("Lookup never started") + } + + wg.Wait() + close(errors) + + // All readers should receive an error + errorCount := 0 + for err := range errors { + if err != nil { + errorCount++ + } + } + if errorCount != numReaders { + t.Errorf("Expected %d errors, got %d", numReaders, errorCount) + } +} + +// TestSingleChunkCacherOneReaderCancelsOthersContinue tests that when one reader +// cancels, other readers waiting on the same chunk continue to wait. 
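+// Timeline exercised by this test (sketch): at t0 both readers block in
+// readChunkAt on the same chunk; at t1 reader A's context is cancelled and A
+// alone returns ctx.Err() while the shared download keeps running; at t2 the
+// lookup is released and reader B observes the download's final result.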
+func TestSingleChunkCacherOneReaderCancelsOthersContinue(t *testing.T) { + cache := newMockChunkCacheForReaderCache() + lookupStarted := make(chan struct{}) + lookupCanFinish := make(chan struct{}) + var lookupStartedOnce sync.Once + + lookupFn := func(ctx context.Context, fileId string) ([]string, error) { + lookupStartedOnce.Do(func() { close(lookupStarted) }) + <-lookupCanFinish + return nil, fmt.Errorf("simulated error after delay") + } + + rc := NewReaderCache(10, cache, lookupFn) + defer rc.destroy() + + cancelledReaderDone := make(chan error, 1) + otherReaderDone := make(chan error, 1) + + ctx, cancel := context.WithCancel(context.Background()) + + // Start reader that will be cancelled + go func() { + buffer := make([]byte, 100) + _, err := rc.ReadChunkAt(ctx, buffer, "shared-chunk-2", nil, false, 0, 100, true) + cancelledReaderDone <- err + }() + + // Start reader that will NOT be cancelled + go func() { + buffer := make([]byte, 100) + _, err := rc.ReadChunkAt(context.Background(), buffer, "shared-chunk-2", nil, false, 0, 100, true) + otherReaderDone <- err + }() + + // Wait for lookup to start + select { + case <-lookupStarted: + case <-time.After(5 * time.Second): + t.Fatal("Lookup never started") + } + + // Cancel the first reader + cancel() + + // First reader should complete with context.Canceled quickly + select { + case err := <-cancelledReaderDone: + if err != context.Canceled { + t.Errorf("Cancelled reader: expected context.Canceled, got: %v", err) + } + case <-time.After(2 * time.Second): + t.Error("Cancelled reader did not complete quickly") + } + + // Allow the download to complete + close(lookupCanFinish) + + // Other reader should eventually complete (with error since lookup returns error) + select { + case err := <-otherReaderDone: + if err == nil || err == context.Canceled { + t.Errorf("Other reader: expected non-nil non-cancelled error, got: %v", err) + } + // Expected: "simulated error after delay" + case <-time.After(5 * time.Second): + t.Error("Other reader did not complete") + } +} diff --git a/weed/operation/upload_content.go b/weed/operation/upload_content.go index 90f90c87d..a2fff4792 100644 --- a/weed/operation/upload_content.go +++ b/weed/operation/upload_content.go @@ -90,10 +90,9 @@ func (uploadResult *UploadResult) ToPbFileChunkWithSSE(fileId string, offset int } var ( - fileNameEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", "") - uploader *Uploader - uploaderErr error - once sync.Once + uploader *Uploader + uploaderErr error + once sync.Once ) // HTTPClient interface for testing @@ -336,8 +335,9 @@ func (uploader *Uploader) upload_content(ctx context.Context, fillBufferFunction body_writer = multipart.NewWriter(option.BytesBuffer) } h := make(textproto.MIMEHeader) - filename := fileNameEscaper.Replace(option.Filename) - h.Set("Content-Disposition", fmt.Sprintf(`form-data; name="file"; filename="%s"`, filename)) + // Use mime.FormatMediaType for RFC 6266 compliant Content-Disposition, + // properly handling non-ASCII characters and special characters + h.Set("Content-Disposition", mime.FormatMediaType("form-data", map[string]string{"name": "file", "filename": option.Filename})) h.Set("Idempotency-Key", option.UploadUrl) if option.MimeType == "" { option.MimeType = mime.TypeByExtension(strings.ToLower(filepath.Ext(option.Filename))) diff --git a/weed/pb/master.proto b/weed/pb/master.proto index f8049c466..afbf31de9 100644 --- a/weed/pb/master.proto +++ b/weed/pb/master.proto @@ -81,6 +81,7 @@ message Heartbeat { map<string, uint32> max_volume_counts = 4; 
uint32 grpc_port = 20; repeated string location_uuids = 21; + string id = 22; // volume server id, independent of ip:port for stable identification } message HeartbeatResponse { @@ -289,6 +290,7 @@ message DataNodeInfo { string id = 1; map<string, DiskInfo> diskInfos = 2; uint32 grpc_port = 3; + string address = 4; // ip:port for connecting to the volume server } message RackInfo { string id = 1; diff --git a/weed/pb/master_pb/master.pb.go b/weed/pb/master_pb/master.pb.go index 19df43d71..41d46fad1 100644 --- a/weed/pb/master_pb/master.pb.go +++ b/weed/pb/master_pb/master.pb.go @@ -44,6 +44,7 @@ type Heartbeat struct { MaxVolumeCounts map[string]uint32 `protobuf:"bytes,4,rep,name=max_volume_counts,json=maxVolumeCounts,proto3" json:"max_volume_counts,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"varint,2,opt,name=value"` GrpcPort uint32 `protobuf:"varint,20,opt,name=grpc_port,json=grpcPort,proto3" json:"grpc_port,omitempty"` LocationUuids []string `protobuf:"bytes,21,rep,name=location_uuids,json=locationUuids,proto3" json:"location_uuids,omitempty"` + Id string `protobuf:"bytes,22,opt,name=id,proto3" json:"id,omitempty"` // volume server id, independent of ip:port for stable identification unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -204,6 +205,13 @@ func (x *Heartbeat) GetLocationUuids() []string { return nil } +func (x *Heartbeat) GetId() string { + if x != nil { + return x.Id + } + return "" +} + type HeartbeatResponse struct { state protoimpl.MessageState `protogen:"open.v1"` VolumeSizeLimit uint64 `protobuf:"varint,1,opt,name=volume_size_limit,json=volumeSizeLimit,proto3" json:"volume_size_limit,omitempty"` @@ -2039,6 +2047,7 @@ type DataNodeInfo struct { Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` DiskInfos map[string]*DiskInfo `protobuf:"bytes,2,rep,name=diskInfos,proto3" json:"diskInfos,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` GrpcPort uint32 `protobuf:"varint,3,opt,name=grpc_port,json=grpcPort,proto3" json:"grpc_port,omitempty"` + Address string `protobuf:"bytes,4,opt,name=address,proto3" json:"address,omitempty"` // ip:port for connecting to the volume server unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -2094,6 +2103,13 @@ func (x *DataNodeInfo) GetGrpcPort() uint32 { return 0 } +func (x *DataNodeInfo) GetAddress() string { + if x != nil { + return x.Address + } + return "" +} + type RackInfo struct { state protoimpl.MessageState `protogen:"open.v1"` Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` @@ -4038,7 +4054,7 @@ var File_master_proto protoreflect.FileDescriptor const file_master_proto_rawDesc = "" + "\n" + - "\fmaster.proto\x12\tmaster_pb\"\xc0\a\n" + + "\fmaster.proto\x12\tmaster_pb\"\xd0\a\n" + "\tHeartbeat\x12\x0e\n" + "\x02ip\x18\x01 \x01(\tR\x02ip\x12\x12\n" + "\x04port\x18\x02 \x01(\rR\x04port\x12\x1d\n" + @@ -4063,7 +4079,8 @@ const file_master_proto_rawDesc = "" + "\x10has_no_ec_shards\x18\x13 \x01(\bR\rhasNoEcShards\x12U\n" + "\x11max_volume_counts\x18\x04 \x03(\v2).master_pb.Heartbeat.MaxVolumeCountsEntryR\x0fmaxVolumeCounts\x12\x1b\n" + "\tgrpc_port\x18\x14 \x01(\rR\bgrpcPort\x12%\n" + - "\x0elocation_uuids\x18\x15 \x03(\tR\rlocationUuids\x1aB\n" + + "\x0elocation_uuids\x18\x15 \x03(\tR\rlocationUuids\x12\x0e\n" + + "\x02id\x18\x16 \x01(\tR\x02id\x1aB\n" + "\x14MaxVolumeCountsEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\rR\x05value:\x028\x01\"\xcd\x02\n" + @@ -4254,11 
+4271,12 @@ const file_master_proto_rawDesc = "" + "\fvolume_infos\x18\x06 \x03(\v2#.master_pb.VolumeInformationMessageR\vvolumeInfos\x12P\n" + "\x0eec_shard_infos\x18\a \x03(\v2*.master_pb.VolumeEcShardInformationMessageR\fecShardInfos\x12.\n" + "\x13remote_volume_count\x18\b \x01(\x03R\x11remoteVolumeCount\x12\x17\n" + - "\adisk_id\x18\t \x01(\rR\x06diskId\"\xd4\x01\n" + + "\adisk_id\x18\t \x01(\rR\x06diskId\"\xee\x01\n" + "\fDataNodeInfo\x12\x0e\n" + "\x02id\x18\x01 \x01(\tR\x02id\x12D\n" + "\tdiskInfos\x18\x02 \x03(\v2&.master_pb.DataNodeInfo.DiskInfosEntryR\tdiskInfos\x12\x1b\n" + - "\tgrpc_port\x18\x03 \x01(\rR\bgrpcPort\x1aQ\n" + + "\tgrpc_port\x18\x03 \x01(\rR\bgrpcPort\x12\x18\n" + + "\aaddress\x18\x04 \x01(\tR\aaddress\x1aQ\n" + "\x0eDiskInfosEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12)\n" + "\x05value\x18\x02 \x01(\v2\x13.master_pb.DiskInfoR\x05value:\x028\x01\"\xf0\x01\n" + diff --git a/weed/pb/server_address.go b/weed/pb/server_address.go index a0aa79ae4..943b85519 100644 --- a/weed/pb/server_address.go +++ b/weed/pb/server_address.go @@ -2,11 +2,12 @@ package pb import ( "fmt" - "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" - "github.com/seaweedfs/seaweedfs/weed/util" "net" "strconv" "strings" + + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/seaweedfs/seaweedfs/weed/util" ) type ServerAddress string @@ -32,7 +33,12 @@ func NewServerAddressWithGrpcPort(address string, grpcPort int) ServerAddress { } func NewServerAddressFromDataNode(dn *master_pb.DataNodeInfo) ServerAddress { - return NewServerAddressWithGrpcPort(dn.Id, int(dn.GrpcPort)) + // Use Address field if available (new behavior), fall back to Id for backward compatibility + addr := dn.Address + if addr == "" { + addr = dn.Id // backward compatibility: old nodes use ip:port as id + } + return NewServerAddressWithGrpcPort(addr, int(dn.GrpcPort)) } func NewServerAddressFromLocation(dn *master_pb.Location) ServerAddress { diff --git a/weed/s3api/auth_signature_v4.go b/weed/s3api/auth_signature_v4.go index d897894bc..4e22530d1 100644 --- a/weed/s3api/auth_signature_v4.go +++ b/weed/s3api/auth_signature_v4.go @@ -53,10 +53,11 @@ func (iam *IdentityAccessManagement) reqSignatureV4Verify(r *http.Request) (*Ide // Constants specific to this file const ( - emptySHA256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" - streamingContentSHA256 = "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" - streamingUnsignedPayload = "STREAMING-UNSIGNED-PAYLOAD-TRAILER" - unsignedPayload = "UNSIGNED-PAYLOAD" + emptySHA256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + streamingContentSHA256 = "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" + streamingContentSHA256Trailer = "STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER" + streamingUnsignedPayload = "STREAMING-UNSIGNED-PAYLOAD-TRAILER" + unsignedPayload = "UNSIGNED-PAYLOAD" // Limit for IAM/STS request body size to prevent DoS attacks iamRequestBodyLimit = 10 * (1 << 20) // 10 MiB ) @@ -214,14 +215,14 @@ func (iam *IdentityAccessManagement) verifyV4Signature(r *http.Request, shouldCh availableKeys = append(availableKeys, key) } iam.m.RUnlock() - + glog.Warningf("InvalidAccessKeyId: attempted key '%s' not found. 
Available keys: %d, Auth enabled: %v", authInfo.AccessKey, len(availableKeys), iam.isAuthEnabled) - + if glog.V(2) && len(availableKeys) > 0 { glog.V(2).Infof("Available access keys: %v", availableKeys) } - + return nil, nil, "", nil, s3err.ErrInvalidAccessKeyID } @@ -562,10 +563,10 @@ func (iam *IdentityAccessManagement) doesPolicySignatureV4Match(formValues http. iam.m.RLock() availableKeyCount := len(iam.accessKeyIdent) iam.m.RUnlock() - + glog.Warningf("InvalidAccessKeyId (POST policy): attempted key '%s' not found. Available keys: %d, Auth enabled: %v", credHeader.accessKey, availableKeyCount, iam.isAuthEnabled) - + return s3err.ErrInvalidAccessKeyID } diff --git a/weed/s3api/chunked_reader_v4.go b/weed/s3api/chunked_reader_v4.go index f841c3e1e..ca58ecec0 100644 --- a/weed/s3api/chunked_reader_v4.go +++ b/weed/s3api/chunked_reader_v4.go @@ -53,8 +53,8 @@ func (iam *IdentityAccessManagement) calculateSeedSignature(r *http.Request) (cr // This check ensures we only proceed for streaming uploads. switch authInfo.HashedPayload { - case streamingContentSHA256: - glog.V(3).Infof("streaming content sha256") + case streamingContentSHA256, streamingContentSHA256Trailer: + glog.V(3).Infof("streaming content sha256 (with trailer: %v)", authInfo.HashedPayload == streamingContentSHA256Trailer) case streamingUnsignedPayload: glog.V(3).Infof("streaming unsigned payload") default: @@ -87,9 +87,9 @@ func (iam *IdentityAccessManagement) newChunkedReader(req *http.Request) (io.Rea var errCode s3err.ErrorCode switch contentSha256Header { - // Payload for STREAMING signature should be 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD' - case streamingContentSHA256: - glog.V(3).Infof("streaming content sha256") + // Payload for STREAMING signature should be 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD' or 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER' + case streamingContentSHA256, streamingContentSHA256Trailer: + glog.V(3).Infof("streaming content sha256 (with trailer: %v)", contentSha256Header == streamingContentSHA256Trailer) credential, seedSignature, region, service, seedDate, errCode = iam.calculateSeedSignature(req) if errCode != s3err.ErrNone { return nil, errCode diff --git a/weed/s3api/chunked_reader_v4_test.go b/weed/s3api/chunked_reader_v4_test.go index b797bf340..98654ce8b 100644 --- a/weed/s3api/chunked_reader_v4_test.go +++ b/weed/s3api/chunked_reader_v4_test.go @@ -234,6 +234,150 @@ func TestSignedStreamingUpload(t *testing.T) { assert.Equal(t, chunk1Data+chunk2Data, string(data)) } +// createTrailerStreamingRequest creates a streaming upload request with trailer for testing. +// If useValidTrailerSignature is true, uses a correctly calculated trailer signature; +// otherwise uses an intentionally wrong signature for negative testing. 
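+// For orientation, the aws-chunked body with a trailing checksum assembled
+// below has this wire shape (sizes in hex, CRLF written out; the bracketed
+// values are placeholders):
+//
+//	<hex-len>;chunk-signature=<sig1>\r\n
+//	<chunk data>\r\n
+//	0;chunk-signature=<final-sig>\r\n
+//	x-amz-checksum-crc32:<base64-crc32>\r\n
+//	x-amz-trailer-signature:<trailer-sig>\r\n
+//	\r\n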
+func createTrailerStreamingRequest(t *testing.T, useValidTrailerSignature bool) (*http.Request, string) { + chunk1Data := "hello world\n" + chunk1DataLen := len(chunk1Data) + chunk1DataLenHex := fmt.Sprintf("%x", chunk1DataLen) + + // Use current time for signatures + now := time.Now().UTC() + amzDate := now.Format(iso8601Format) + dateStamp := now.Format(yyyymmdd) + + // Calculate seed signature + scope := dateStamp + "/" + defaultRegion + "/s3/aws4_request" + + // Build canonical request for seed signature + hashedPayload := "STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER" + canonicalHeaders := "content-encoding:aws-chunked\n" + + "host:s3.amazonaws.com\n" + + "x-amz-content-sha256:" + hashedPayload + "\n" + + "x-amz-date:" + amzDate + "\n" + + fmt.Sprintf("x-amz-decoded-content-length:%d\n", chunk1DataLen) + + "x-amz-trailer:x-amz-checksum-crc32\n" + signedHeaders := "content-encoding;host;x-amz-content-sha256;x-amz-date;x-amz-decoded-content-length;x-amz-trailer" + + canonicalRequest := "PUT\n" + + "/test-bucket/test-object\n" + + "\n" + + canonicalHeaders + "\n" + + signedHeaders + "\n" + + hashedPayload + + canonicalRequestHash := getSHA256Hash([]byte(canonicalRequest)) + stringToSign := "AWS4-HMAC-SHA256\n" + amzDate + "\n" + scope + "\n" + canonicalRequestHash + + signingKey := getSigningKey(defaultSecretAccessKey, dateStamp, defaultRegion, "s3") + seedSignature := getSignature(signingKey, stringToSign) + + // Calculate chunk signatures + chunk1Hash := getSHA256Hash([]byte(chunk1Data)) + chunk1StringToSign := "AWS4-HMAC-SHA256-PAYLOAD\n" + amzDate + "\n" + scope + "\n" + + seedSignature + "\n" + emptySHA256 + "\n" + chunk1Hash + chunk1Signature := getSignature(signingKey, chunk1StringToSign) + + // Final chunk (0 bytes) + finalStringToSign := "AWS4-HMAC-SHA256-PAYLOAD\n" + amzDate + "\n" + scope + "\n" + + chunk1Signature + "\n" + emptySHA256 + "\n" + emptySHA256 + finalSignature := getSignature(signingKey, finalStringToSign) + + // Calculate CRC32 checksum for trailer + crcWriter := crc32.NewIEEE() + _, crcErr := crcWriter.Write([]byte(chunk1Data)) + assert.NoError(t, crcErr) + checksum := crcWriter.Sum(nil) + base64EncodedChecksum := base64.StdEncoding.EncodeToString(checksum) + + // The on-wire trailer format uses \r\n (HTTP/aws-chunked convention) + trailerOnWire := "x-amz-checksum-crc32:" + base64EncodedChecksum + "\r\n" + + // Calculate or use wrong trailer signature + var trailerSignature string + if useValidTrailerSignature { + // The canonical trailer content uses \n for signing (per AWS SigV4 spec) + trailerCanonical := "x-amz-checksum-crc32:" + base64EncodedChecksum + "\n" + trailerHash := getSHA256Hash([]byte(trailerCanonical)) + trailerStringToSign := "AWS4-HMAC-SHA256-TRAILER\n" + amzDate + "\n" + scope + "\n" + + finalSignature + "\n" + trailerHash + trailerSignature = getSignature(signingKey, trailerStringToSign) + } else { + // Intentionally wrong signature for negative testing + trailerSignature = "0000000000000000000000000000000000000000000000000000000000000000" + } + + // Build the chunked payload with trailer and trailer signature + payload := fmt.Sprintf("%s;chunk-signature=%s\r\n%s\r\n", chunk1DataLenHex, chunk1Signature, chunk1Data) + + fmt.Sprintf("0;chunk-signature=%s\r\n", finalSignature) + + trailerOnWire + + "x-amz-trailer-signature:" + trailerSignature + "\r\n" + + "\r\n" + + // Create the request + req, err := http.NewRequest("PUT", "http://s3.amazonaws.com/test-bucket/test-object", + bytes.NewReader([]byte(payload))) + assert.NoError(t, err) + + 
req.Header.Set("Host", "s3.amazonaws.com") + req.Header.Set("x-amz-date", amzDate) + req.Header.Set("x-amz-content-sha256", hashedPayload) + req.Header.Set("Content-Encoding", "aws-chunked") + req.Header.Set("x-amz-decoded-content-length", fmt.Sprintf("%d", chunk1DataLen)) + req.Header.Set("x-amz-trailer", "x-amz-checksum-crc32") + + authHeader := fmt.Sprintf("AWS4-HMAC-SHA256 Credential=%s/%s, SignedHeaders=%s, Signature=%s", + defaultAccessKeyId, scope, signedHeaders, seedSignature) + req.Header.Set("Authorization", authHeader) + + return req, chunk1Data +} + +// TestSignedStreamingUploadWithTrailer tests streaming uploads with signed chunks and trailers +// This tests the STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER content-sha256 header value +// which is used by AWS SDK v2 when checksum validation is enabled +func TestSignedStreamingUploadWithTrailer(t *testing.T) { + iam := setupIam() + req, expectedData := createTrailerStreamingRequest(t, true) + + // Test the chunked reader + reader, errCode := iam.newChunkedReader(req) + assert.Equal(t, s3err.ErrNone, errCode) + assert.NotNil(t, reader) + + // Read and verify the payload + data, err := io.ReadAll(reader) + assert.NoError(t, err) + assert.Equal(t, expectedData, string(data)) +} + +// TestSignedStreamingUploadWithTrailerInvalidSignature tests behavior with invalid trailer signatures. +// This is a negative test case for trailer signature validation. It currently verifies that an invalid +// signature doesn't break content reading, and is prepared for when validation is implemented. +func TestSignedStreamingUploadWithTrailerInvalidSignature(t *testing.T) { + iam := setupIam() + req, expectedData := createTrailerStreamingRequest(t, false) + + // Test the chunked reader - it should be created successfully + reader, errCode := iam.newChunkedReader(req) + assert.Equal(t, s3err.ErrNone, errCode) + assert.NotNil(t, reader) + + // Read the payload - currently trailer signature validation may not be implemented, + // but this test documents the expected behavior and will catch regressions + // if trailer signature validation is added in the future + data, err := io.ReadAll(reader) + // Note: If trailer signature validation is implemented, this should fail with an error + // For now, we just verify the content is correctly extracted + if err != nil { + assert.Contains(t, err.Error(), "signature", "Error should indicate signature mismatch") + } else { + // If no error, content should still be correct (trailer sig validation not yet implemented) + assert.Equal(t, expectedData, string(data)) + } +} + // TestSignedStreamingUploadInvalidSignature tests that invalid chunk signatures are rejected // This is a negative test case to ensure signature validation is actually working func TestSignedStreamingUploadInvalidSignature(t *testing.T) { diff --git a/weed/s3api/filer_multipart.go b/weed/s3api/filer_multipart.go index 1e4635ead..8dca4cedc 100644 --- a/weed/s3api/filer_multipart.go +++ b/weed/s3api/filer_multipart.go @@ -187,7 +187,10 @@ func (s3a *S3ApiServer) completeMultipartUpload(r *http.Request, input *s3.Compl sort.Ints(completedPartNumbers) uploadDirectory := s3a.genUploadsFolder(*input.Bucket) + "/" + *input.UploadId - entries, _, err := s3a.list(uploadDirectory, "", "", false, 0) + // Use explicit limit to ensure all parts are listed (up to S3's max of 10,000 parts) + // Previously limit=0 relied on server's DirListingLimit default (1000 in weed server mode), + // which caused CompleteMultipartUpload to fail for uploads with more than 1000 
parts. + entries, _, err := s3a.list(uploadDirectory, "", "", false, s3_constants.MaxS3MultipartParts+1) if err != nil { glog.Errorf("completeMultipartUpload %s %s error: %v, entries:%d", *input.Bucket, *input.UploadId, err, len(entries)) stats.S3HandlerCounter.WithLabelValues(stats.ErrorCompletedNoSuchUpload).Inc() diff --git a/weed/s3api/s3api_auth.go b/weed/s3api/s3api_auth.go index e946b1284..5592fe939 100644 --- a/weed/s3api/s3api_auth.go +++ b/weed/s3api/s3api_auth.go @@ -48,14 +48,22 @@ func isRequestPostPolicySignatureV4(r *http.Request) bool { } // Verify if the request has AWS Streaming Signature Version '4'. This is only valid for 'PUT' operation. +// Supports both with and without trailer variants: +// - STREAMING-AWS4-HMAC-SHA256-PAYLOAD (original) +// - STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER (with trailing checksums) func isRequestSignStreamingV4(r *http.Request) bool { - return r.Header.Get("x-amz-content-sha256") == streamingContentSHA256 && - r.Method == http.MethodPut + if r.Method != http.MethodPut { + return false + } + contentSha256 := r.Header.Get("x-amz-content-sha256") + return contentSha256 == streamingContentSHA256 || contentSha256 == streamingContentSHA256Trailer } func isRequestUnsignedStreaming(r *http.Request) bool { - return r.Header.Get("x-amz-content-sha256") == streamingUnsignedPayload && - r.Method == http.MethodPut + if r.Method != http.MethodPut { + return false + } + return r.Header.Get("x-amz-content-sha256") == streamingUnsignedPayload } // Authorization type. diff --git a/weed/s3api/s3api_bucket_config.go b/weed/s3api/s3api_bucket_config.go index 00449d80a..6076f0108 100644 --- a/weed/s3api/s3api_bucket_config.go +++ b/weed/s3api/s3api_bucket_config.go @@ -514,12 +514,27 @@ func (s3a *S3ApiServer) isVersioningConfigured(bucket string) (bool, error) { return config.Versioning != "" || config.ObjectLockConfig != nil, nil } +// isObjectLockEnabled checks if Object Lock is enabled for a bucket (with caching) +func (s3a *S3ApiServer) isObjectLockEnabled(bucket string) (bool, error) { + config, errCode := s3a.getBucketConfig(bucket) + if errCode != s3err.ErrNone { + if errCode == s3err.ErrNoSuchBucket { + return false, filer_pb.ErrNotFound + } + return false, fmt.Errorf("failed to get bucket config: %v", errCode) + } + + return config.ObjectLockConfig != nil, nil +} + // getVersioningState returns the detailed versioning state for a bucket func (s3a *S3ApiServer) getVersioningState(bucket string) (string, error) { config, errCode := s3a.getBucketConfig(bucket) if errCode != s3err.ErrNone { if errCode == s3err.ErrNoSuchBucket { - return "", nil + // Signal to callers that the bucket does not exist so they can + // decide whether to auto-create it (e.g., in PUT handlers). 
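+			// Callers should detect this case with errors.Is(err, filer_pb.ErrNotFound)
+			// rather than by matching error strings.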
+ return "", filer_pb.ErrNotFound } glog.Errorf("getVersioningState: failed to get bucket config for %s: %v", bucket, errCode) return "", fmt.Errorf("failed to get bucket config: %v", errCode) diff --git a/weed/s3api/s3api_bucket_handlers.go b/weed/s3api/s3api_bucket_handlers.go index f0704fe23..a810dfd37 100644 --- a/weed/s3api/s3api_bucket_handlers.go +++ b/weed/s3api/s3api_bucket_handlers.go @@ -244,46 +244,64 @@ func (s3a *S3ApiServer) PutBucketHandler(w http.ResponseWriter, r *http.Request) return } - // create the folder for bucket, but lazily create actual collection - if err := s3a.mkdir(s3a.option.BucketsPath, bucket, setBucketOwner(r)); err != nil { - glog.Errorf("PutBucketHandler mkdir: %v", err) - s3err.WriteErrorResponse(w, r, s3err.ErrInternalError) - return - } + // Check for x-amz-bucket-object-lock-enabled header BEFORE creating bucket + // This allows us to create the bucket with Object Lock configuration atomically + objectLockEnabled := strings.EqualFold(r.Header.Get(s3_constants.AmzBucketObjectLockEnabled), "true") - // Remove bucket from negative cache after successful creation - if s3a.bucketConfigCache != nil { - s3a.bucketConfigCache.RemoveNegativeCache(bucket) - } + // Capture any Object Lock configuration error from within the callback + // The mkdir callback doesn't support returning errors, so we capture it here + var objectLockSetupError error - // Check for x-amz-bucket-object-lock-enabled header (S3 standard compliance) - if objectLockHeaderValue := r.Header.Get(s3_constants.AmzBucketObjectLockEnabled); strings.EqualFold(objectLockHeaderValue, "true") { - glog.V(3).Infof("PutBucketHandler: enabling Object Lock and Versioning for bucket %s due to x-amz-bucket-object-lock-enabled header", bucket) + // Create the folder for bucket with all settings atomically + // This ensures Object Lock configuration is set in the same CreateEntry call, + // preventing race conditions where the bucket exists without Object Lock enabled + if err := s3a.mkdir(s3a.option.BucketsPath, bucket, func(entry *filer_pb.Entry) { + // Set bucket owner + setBucketOwner(r)(entry) + + // Set Object Lock configuration atomically during bucket creation + if objectLockEnabled { + glog.V(3).Infof("PutBucketHandler: enabling Object Lock and Versioning for bucket %s atomically", bucket) + + if entry.Extended == nil { + entry.Extended = make(map[string][]byte) + } - // Atomically update the configuration of the specified bucket. See the updateBucketConfig - // function definition for detailed documentation on parameters and behavior. 
- errCode := s3a.updateBucketConfig(bucket, func(bucketConfig *BucketConfig) error { // Enable versioning (required for Object Lock) - bucketConfig.Versioning = s3_constants.VersioningEnabled + entry.Extended[s3_constants.ExtVersioningKey] = []byte(s3_constants.VersioningEnabled) - // Create basic Object Lock configuration (enabled without default retention) + // Create and store Object Lock configuration objectLockConfig := &ObjectLockConfiguration{ ObjectLockEnabled: s3_constants.ObjectLockEnabled, } + if err := StoreObjectLockConfigurationInExtended(entry, objectLockConfig); err != nil { + glog.Errorf("PutBucketHandler: failed to store Object Lock config for bucket %s: %v", bucket, err) + objectLockSetupError = err + // Note: The entry will still be created, but we'll roll it back below + } else { + glog.V(3).Infof("PutBucketHandler: set ObjectLockConfig for bucket %s: %+v", bucket, objectLockConfig) + } + } + }); err != nil { + glog.Errorf("PutBucketHandler mkdir: %v", err) + s3err.WriteErrorResponse(w, r, s3err.ErrInternalError) + return + } - // Set the cached Object Lock configuration - bucketConfig.ObjectLockConfig = objectLockConfig - glog.V(3).Infof("PutBucketHandler: set ObjectLockConfig for bucket %s: %+v", bucket, objectLockConfig) - - return nil - }) - - if errCode != s3err.ErrNone { - glog.Errorf("PutBucketHandler: failed to enable Object Lock for bucket %s: %v", bucket, errCode) - s3err.WriteErrorResponse(w, r, errCode) - return + // If Object Lock setup failed, roll back the bucket creation + // This ensures we don't leave a bucket without the requested Object Lock configuration + if objectLockSetupError != nil { + glog.Errorf("PutBucketHandler: rolling back bucket %s creation due to Object Lock setup failure: %v", bucket, objectLockSetupError) + if deleteErr := s3a.rm(s3a.option.BucketsPath, bucket, true, true); deleteErr != nil { + glog.Errorf("PutBucketHandler: failed to rollback bucket %s after Object Lock setup failure: %v", bucket, deleteErr) } - glog.V(3).Infof("PutBucketHandler: enabled Object Lock and Versioning for bucket %s", bucket) + s3err.WriteErrorResponse(w, r, s3err.ErrInternalError) + return + } + + // Remove bucket from negative cache after successful creation + if s3a.bucketConfigCache != nil { + s3a.bucketConfigCache.RemoveNegativeCache(bucket) } w.Header().Set("Location", "/"+bucket) diff --git a/weed/s3api/s3api_object_handlers.go b/weed/s3api/s3api_object_handlers.go index 1406bbf42..43cc4e5fc 100644 --- a/weed/s3api/s3api_object_handlers.go +++ b/weed/s3api/s3api_object_handlers.go @@ -659,16 +659,14 @@ func (s3a *S3ApiServer) GetObjectHandler(w http.ResponseWriter, r *http.Request) glog.V(3).Infof("GetObject: Set PartsCount=%d for multipart GET with PartNumber=%d", partsCount, partNumber) // Calculate the byte range for this part + // Note: ETag is NOT overridden - AWS S3 returns the complete object's ETag + // even when requesting a specific part via PartNumber var startOffset, endOffset int64 if partInfo != nil { // Use part boundaries from metadata (accurate for multi-chunk parts) startOffset = objectEntryForSSE.Chunks[partInfo.StartChunk].Offset lastChunk := objectEntryForSSE.Chunks[partInfo.EndChunk-1] endOffset = lastChunk.Offset + int64(lastChunk.Size) - 1 - - // Override ETag with the part's ETag from metadata - w.Header().Set("ETag", "\""+partInfo.ETag+"\"") - glog.V(3).Infof("GetObject: Override ETag with part %d ETag: %s (from metadata)", partNumber, partInfo.ETag) } else { // Fallback: assume 1:1 part-to-chunk mapping (backward 
compatibility) chunkIndex := partNumber - 1 @@ -680,15 +678,6 @@ func (s3a *S3ApiServer) GetObjectHandler(w http.ResponseWriter, r *http.Request) partChunk := objectEntryForSSE.Chunks[chunkIndex] startOffset = partChunk.Offset endOffset = partChunk.Offset + int64(partChunk.Size) - 1 - - // Override ETag with chunk's ETag (fallback) - if partChunk.ETag != "" { - if md5Bytes, decodeErr := base64.StdEncoding.DecodeString(partChunk.ETag); decodeErr == nil { - partETag := fmt.Sprintf("%x", md5Bytes) - w.Header().Set("ETag", "\""+partETag+"\"") - glog.V(3).Infof("GetObject: Override ETag with part %d ETag: %s (fallback from chunk)", partNumber, partETag) - } - } } // Check if client supplied a Range header - if so, apply it within the part's boundaries @@ -2266,7 +2255,7 @@ func (s3a *S3ApiServer) HeadObjectHandler(w http.ResponseWriter, r *http.Request if partNumberStr != "" { if partNumber, parseErr := strconv.Atoi(partNumberStr); parseErr == nil && partNumber > 0 { // Get actual parts count from metadata (not chunk count) - partsCount, partInfo := s3a.getMultipartInfo(objectEntryForSSE, partNumber) + partsCount, _ := s3a.getMultipartInfo(objectEntryForSSE, partNumber) // Validate part number if partNumber > partsCount { @@ -2276,31 +2265,10 @@ func (s3a *S3ApiServer) HeadObjectHandler(w http.ResponseWriter, r *http.Request } // Set parts count header + // Note: ETag is NOT overridden - AWS S3 returns the complete object's ETag + // even when requesting a specific part via PartNumber w.Header().Set(s3_constants.AmzMpPartsCount, strconv.Itoa(partsCount)) glog.V(3).Infof("HeadObject: Set PartsCount=%d for part %d", partsCount, partNumber) - - // Override ETag with the part's ETag - if partInfo != nil { - // Use part ETag from metadata (accurate for multi-chunk parts) - w.Header().Set("ETag", "\""+partInfo.ETag+"\"") - glog.V(3).Infof("HeadObject: Override ETag with part %d ETag: %s (from metadata)", partNumber, partInfo.ETag) - } else { - // Fallback: use chunk's ETag (backward compatibility) - chunkIndex := partNumber - 1 - if chunkIndex >= len(objectEntryForSSE.Chunks) { - glog.Warningf("HeadObject: Part %d chunk index %d out of range (chunks: %d)", partNumber, chunkIndex, len(objectEntryForSSE.Chunks)) - s3err.WriteErrorResponse(w, r, s3err.ErrInvalidPart) - return - } - partChunk := objectEntryForSSE.Chunks[chunkIndex] - if partChunk.ETag != "" { - if md5Bytes, decodeErr := base64.StdEncoding.DecodeString(partChunk.ETag); decodeErr == nil { - partETag := fmt.Sprintf("%x", md5Bytes) - w.Header().Set("ETag", "\""+partETag+"\"") - glog.V(3).Infof("HeadObject: Override ETag with part %d ETag: %s (fallback from chunk)", partNumber, partETag) - } - } - } } } diff --git a/weed/s3api/s3api_object_handlers_delete.go b/weed/s3api/s3api_object_handlers_delete.go index f779a6edc..da0b78654 100644 --- a/weed/s3api/s3api_object_handlers_delete.go +++ b/weed/s3api/s3api_object_handlers_delete.go @@ -1,12 +1,10 @@ package s3api import ( - "context" "encoding/xml" "fmt" "io" "net/http" - "slices" "strings" "github.com/seaweedfs/seaweedfs/weed/filer" @@ -127,23 +125,11 @@ func (s3a *S3ApiServer) DeleteObjectHandler(w http.ResponseWriter, r *http.Reque dir, name := target.DirAndName() err := s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error { - // Use operation context that won't be cancelled if request terminates - // This ensures deletion completes atomically to avoid inconsistent state - opCtx := context.WithoutCancel(r.Context()) - - if err := doDeleteEntry(client, dir, name, true, 
false); err != nil { - return err - } - - // Cleanup empty directories - if !s3a.option.AllowEmptyFolder && strings.LastIndex(object, "/") > 0 { - bucketPath := fmt.Sprintf("%s/%s", s3a.option.BucketsPath, bucket) - // Recursively delete empty parent directories, stop at bucket path - filer_pb.DoDeleteEmptyParentDirectories(opCtx, client, util.FullPath(dir), util.FullPath(bucketPath), nil) - } - - return nil + return doDeleteEntry(client, dir, name, true, false) + // Note: Empty folder cleanup is now handled asynchronously by EmptyFolderCleaner + // which listens to metadata events and uses consistent hashing for coordination }) + if err != nil { s3err.WriteErrorResponse(w, r, s3err.ErrInternalError) return @@ -222,8 +208,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h var deleteErrors []DeleteError var auditLog *s3err.AccessLog - directoriesWithDeletion := make(map[string]bool) - if s3err.Logger != nil { auditLog = s3err.GetAccessLog(r, http.StatusNoContent, s3err.ErrNone) } @@ -245,10 +229,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h versioningConfigured := (versioningState != "") s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error { - // Use operation context that won't be cancelled if request terminates - // This ensures batch deletion completes atomically to avoid inconsistent state - opCtx := context.WithoutCancel(r.Context()) - // delete file entries for _, object := range deleteObjects.Objects { if object.Key == "" { @@ -357,10 +337,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h err := doDeleteEntry(client, parentDirectoryPath, entryName, isDeleteData, isRecursive) if err == nil { - // Track directory for empty directory cleanup - if !s3a.option.AllowEmptyFolder { - directoriesWithDeletion[parentDirectoryPath] = true - } deletedObjects = append(deletedObjects, object) } else if strings.Contains(err.Error(), filer.MsgFailDelNonEmptyFolder) { deletedObjects = append(deletedObjects, object) @@ -380,30 +356,8 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h } } - // Cleanup empty directories - optimize by processing deepest first - if !s3a.option.AllowEmptyFolder && len(directoriesWithDeletion) > 0 { - bucketPath := fmt.Sprintf("%s/%s", s3a.option.BucketsPath, bucket) - - // Collect and sort directories by depth (deepest first) to avoid redundant checks - var allDirs []string - for dirPath := range directoriesWithDeletion { - allDirs = append(allDirs, dirPath) - } - // Sort by depth (deeper directories first) - slices.SortFunc(allDirs, func(a, b string) int { - return strings.Count(b, "/") - strings.Count(a, "/") - }) - - // Track already-checked directories to avoid redundant work - checked := make(map[string]bool) - for _, dirPath := range allDirs { - if !checked[dirPath] { - // Recursively delete empty parent directories, stop at bucket path - // Mark this directory and all its parents as checked during recursion - filer_pb.DoDeleteEmptyParentDirectories(opCtx, client, util.FullPath(dirPath), util.FullPath(bucketPath), checked) - } - } - } + // Note: Empty folder cleanup is now handled asynchronously by EmptyFolderCleaner + // which listens to metadata events and uses consistent hashing for coordination return nil }) diff --git a/weed/s3api/s3api_object_handlers_list.go b/weed/s3api/s3api_object_handlers_list.go index 3edbc9522..ad65bd4fe 100644 --- a/weed/s3api/s3api_object_handlers_list.go +++ 
b/weed/s3api/s3api_object_handlers_list.go @@ -554,15 +554,7 @@ func (s3a *S3ApiServer) doListFilerEntries(client filer_pb.SeaweedFilerClient, d } // println("doListFilerEntries2 nextMarker", nextMarker) } else { - var isEmpty bool - if !s3a.option.AllowEmptyFolder && entry.IsOlderDir() { - //if isEmpty, err = s3a.ensureDirectoryAllEmpty(client, dir, entry.Name); err != nil { - // glog.Errorf("check empty folder %s: %v", dir, err) - //} - } - if !isEmpty { - eachEntryFn(dir, entry) - } + eachEntryFn(dir, entry) } } else { eachEntryFn(dir, entry) diff --git a/weed/s3api/s3api_object_handlers_put.go b/weed/s3api/s3api_object_handlers_put.go index f848790de..3da9047ac 100644 --- a/weed/s3api/s3api_object_handlers_put.go +++ b/weed/s3api/s3api_object_handlers_put.go @@ -30,14 +30,14 @@ import ( // Object lock validation errors var ( - ErrObjectLockVersioningRequired = errors.New("object lock headers can only be used on versioned buckets") + ErrObjectLockVersioningRequired = errors.New("object lock headers can only be used on buckets with Object Lock enabled") ErrInvalidObjectLockMode = errors.New("invalid object lock mode") ErrInvalidLegalHoldStatus = errors.New("invalid legal hold status") ErrInvalidRetentionDateFormat = errors.New("invalid retention until date format") ErrRetentionDateMustBeFuture = errors.New("retain until date must be in the future") ErrObjectLockModeRequiresDate = errors.New("object lock mode requires retention until date") ErrRetentionDateRequiresMode = errors.New("retention until date requires object lock mode") - ErrGovernanceBypassVersioningRequired = errors.New("governance bypass header can only be used on versioned buckets") + ErrGovernanceBypassVersioningRequired = errors.New("governance bypass header can only be used on buckets with Object Lock enabled") ErrInvalidObjectLockDuration = errors.New("object lock duration must be greater than 0 days") ErrObjectLockDurationExceeded = errors.New("object lock duration exceeds maximum allowed days") ErrObjectLockConfigurationMissingEnabled = errors.New("object lock configuration must specify ObjectLockEnabled") @@ -159,8 +159,16 @@ func (s3a *S3ApiServer) PutObjectHandler(w http.ResponseWriter, r *http.Request) glog.V(3).Infof("PutObjectHandler: bucket=%s, object=%s, versioningState='%s', versioningEnabled=%v, versioningConfigured=%v", bucket, object, versioningState, versioningEnabled, versioningConfigured) + // Check if Object Lock is enabled for this bucket + objectLockEnabled, err := s3a.isObjectLockEnabled(bucket) + if err != nil && !errors.Is(err, filer_pb.ErrNotFound) { + glog.Errorf("Error checking Object Lock status for bucket %s: %v", bucket, err) + s3err.WriteErrorResponse(w, r, s3err.ErrInternalError) + return + } + // Validate object lock headers before processing - if err := s3a.validateObjectLockHeaders(r, versioningEnabled); err != nil { + if err := s3a.validateObjectLockHeaders(r, objectLockEnabled); err != nil { glog.V(2).Infof("PutObjectHandler: object lock header validation failed for bucket %s, object %s: %v", bucket, object, err) s3err.WriteErrorResponse(w, r, mapValidationErrorToS3Error(err)) return @@ -1311,7 +1319,8 @@ func (s3a *S3ApiServer) applyBucketDefaultRetention(bucket string, entry *filer_ } // validateObjectLockHeaders validates object lock headers in PUT requests -func (s3a *S3ApiServer) validateObjectLockHeaders(r *http.Request, versioningEnabled bool) error { +// objectLockEnabled should be true only if the bucket has Object Lock configured +func (s3a *S3ApiServer) 
validateObjectLockHeaders(r *http.Request, objectLockEnabled bool) error { // Extract object lock headers from request mode := r.Header.Get(s3_constants.AmzObjectLockMode) retainUntilDateStr := r.Header.Get(s3_constants.AmzObjectLockRetainUntilDate) @@ -1320,8 +1329,11 @@ func (s3a *S3ApiServer) validateObjectLockHeaders(r *http.Request, versioningEna // Check if any object lock headers are present hasObjectLockHeaders := mode != "" || retainUntilDateStr != "" || legalHold != "" - // Object lock headers can only be used on versioned buckets - if hasObjectLockHeaders && !versioningEnabled { + // Object lock headers can only be used on buckets with Object Lock enabled + // Per AWS S3: Object Lock can only be enabled at bucket creation, and once enabled, + // objects can have retention/legal-hold metadata. Without Object Lock enabled, + // these headers must be rejected. + if hasObjectLockHeaders && !objectLockEnabled { return ErrObjectLockVersioningRequired } @@ -1362,11 +1374,11 @@ func (s3a *S3ApiServer) validateObjectLockHeaders(r *http.Request, versioningEna } } - // Check for governance bypass header - only valid for versioned buckets + // Check for governance bypass header - only valid for buckets with Object Lock enabled bypassGovernance := r.Header.Get("x-amz-bypass-governance-retention") == "true" - // Governance bypass headers are only valid for versioned buckets (like object lock headers) - if bypassGovernance && !versioningEnabled { + // Governance bypass headers are only valid for buckets with Object Lock enabled + if bypassGovernance && !objectLockEnabled { return ErrGovernanceBypassVersioningRequired } diff --git a/weed/s3api/s3api_object_retention.go b/weed/s3api/s3api_object_retention.go index ef298eb43..328e938c5 100644 --- a/weed/s3api/s3api_object_retention.go +++ b/weed/s3api/s3api_object_retention.go @@ -586,10 +586,26 @@ func (s3a *S3ApiServer) evaluateGovernanceBypassRequest(r *http.Request, bucket, // enforceObjectLockProtections enforces object lock protections for operations func (s3a *S3ApiServer) enforceObjectLockProtections(request *http.Request, bucket, object, versionId string, governanceBypassAllowed bool) error { + // Quick check: if bucket doesn't have Object Lock enabled, skip the expensive entry lookup + // This optimization avoids a filer gRPC call for every DELETE operation on buckets without Object Lock + objectLockEnabled, err := s3a.isObjectLockEnabled(bucket) + if err != nil { + if errors.Is(err, filer_pb.ErrNotFound) { + // Bucket does not exist, so no protections to enforce + return nil + } + // For other errors, we can't determine lock status, so we should fail. 
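+		// Failing closed here matters: a transient filer error must not become a
+		// way to bypass retention or legal-hold enforcement.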
+ glog.Errorf("enforceObjectLockProtections: failed to check object lock for bucket %s: %v", bucket, err) + return err + } + if !objectLockEnabled { + // Object Lock is not enabled on this bucket, no protections to enforce + return nil + } + // Get the object entry to check both retention and legal hold // For delete operations without versionId, we need to check the latest version var entry *filer_pb.Entry - var err error if versionId != "" { // Check specific version diff --git a/weed/s3api/s3api_server.go b/weed/s3api/s3api_server.go index a1a3f100b..d75f53dd4 100644 --- a/weed/s3api/s3api_server.go +++ b/weed/s3api/s3api_server.go @@ -43,7 +43,6 @@ type S3ApiServerOption struct { AllowedOrigins []string BucketsPath string GrpcDialOption grpc.DialOption - AllowEmptyFolder bool AllowDeleteBucketNotEmpty bool LocalFilerSocket string DataCenter string diff --git a/weed/server/common.go b/weed/server/common.go index 930695f4b..dfed891b4 100644 --- a/weed/server/common.go +++ b/weed/server/common.go @@ -9,9 +9,9 @@ import ( "fmt" "io" "io/fs" + "mime" "mime/multipart" "net/http" - "net/url" "path/filepath" "strconv" "strings" @@ -286,14 +286,15 @@ func adjustHeaderContentDisposition(w http.ResponseWriter, r *http.Request, file return } if filename != "" { - filename = url.QueryEscape(filename) - contentDisposition := "inline" + dispositionType := "inline" if r.FormValue("dl") != "" { if dl, _ := strconv.ParseBool(r.FormValue("dl")); dl { - contentDisposition = "attachment" + dispositionType = "attachment" } } - w.Header().Set("Content-Disposition", contentDisposition+`; filename="`+fileNameEscaper.Replace(filename)+`"`) + // Use mime.FormatMediaType for RFC 6266 compliant Content-Disposition, + // properly handling non-ASCII characters and special characters + w.Header().Set("Content-Disposition", mime.FormatMediaType(dispositionType, map[string]string{"filename": filename})) } } diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go index dcf279e1d..e053d9ea7 100644 --- a/weed/server/master_grpc_server.go +++ b/weed/server/master_grpc_server.go @@ -137,8 +137,8 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ dcName, rackName := ms.Topo.Configuration.Locate(heartbeat.Ip, heartbeat.DataCenter, heartbeat.Rack) dc := ms.Topo.GetOrCreateDataCenter(dcName) rack := dc.GetOrCreateRack(rackName) - dn = rack.GetOrCreateDataNode(heartbeat.Ip, int(heartbeat.Port), int(heartbeat.GrpcPort), heartbeat.PublicUrl, heartbeat.MaxVolumeCounts) - glog.V(0).Infof("added volume server %d: %v:%d %v", dn.Counter, heartbeat.GetIp(), heartbeat.GetPort(), heartbeat.LocationUuids) + dn = rack.GetOrCreateDataNode(heartbeat.Ip, int(heartbeat.Port), int(heartbeat.GrpcPort), heartbeat.PublicUrl, heartbeat.Id, heartbeat.MaxVolumeCounts) + glog.V(0).Infof("added volume server %d: %v (id=%s, ip=%v:%d) %v", dn.Counter, dn.Id(), heartbeat.Id, heartbeat.GetIp(), heartbeat.GetPort(), heartbeat.LocationUuids) uuidlist, err := ms.RegisterUuids(heartbeat) if err != nil { if stream_err := stream.Send(&master_pb.HeartbeatResponse{ diff --git a/weed/server/master_grpc_server_volume.go b/weed/server/master_grpc_server_volume.go index a7ef8e7e9..d00cb5df4 100644 --- a/weed/server/master_grpc_server_volume.go +++ b/weed/server/master_grpc_server_volume.go @@ -253,7 +253,7 @@ func (ms *MasterServer) LookupEcVolume(ctx context.Context, req *master_pb.Looku var locations []*master_pb.Location for _, dn := range shardLocations { locations = append(locations, &master_pb.Location{ 
- Url: string(dn.Id()), + Url: dn.Url(), PublicUrl: dn.PublicUrl, DataCenter: dn.GetDataCenterId(), }) diff --git a/weed/server/volume_grpc_copy.go b/weed/server/volume_grpc_copy.go index 84a9035ca..410c6b05d 100644 --- a/weed/server/volume_grpc_copy.go +++ b/weed/server/volume_grpc_copy.go @@ -115,7 +115,7 @@ func (vs *VolumeServer) VolumeCopy(req *volume_server_pb.VolumeCopyRequest, stre var sendErr error var ioBytePerSecond int64 if req.IoBytePerSecond <= 0 { - ioBytePerSecond = vs.compactionBytePerSecond + ioBytePerSecond = vs.maintenanceBytePerSecond } else { ioBytePerSecond = req.IoBytePerSecond } @@ -199,7 +199,7 @@ func (vs *VolumeServer) VolumeCopy(req *volume_server_pb.VolumeCopyRequest, stre } func (vs *VolumeServer) doCopyFile(client volume_server_pb.VolumeServerClient, isEcVolume bool, collection string, vid, compactRevision uint32, stopOffset uint64, baseFileName, ext string, isAppend, ignoreSourceFileNotFound bool, progressFn storage.ProgressFunc) (modifiedTsNs int64, err error) { - return vs.doCopyFileWithThrottler(client, isEcVolume, collection, vid, compactRevision, stopOffset, baseFileName, ext, isAppend, ignoreSourceFileNotFound, progressFn, util.NewWriteThrottler(vs.compactionBytePerSecond)) + return vs.doCopyFileWithThrottler(client, isEcVolume, collection, vid, compactRevision, stopOffset, baseFileName, ext, isAppend, ignoreSourceFileNotFound, progressFn, util.NewWriteThrottler(vs.maintenanceBytePerSecond)) } func (vs *VolumeServer) doCopyFileWithThrottler(client volume_server_pb.VolumeServerClient, isEcVolume bool, collection string, vid, compactRevision uint32, stopOffset uint64, baseFileName, ext string, isAppend, ignoreSourceFileNotFound bool, progressFn storage.ProgressFunc, throttler *util.WriteThrottler) (modifiedTsNs int64, err error) { @@ -264,7 +264,7 @@ func writeToFile(client volume_server_pb.VolumeServer_CopyFileClient, fileName s } dst, err := os.OpenFile(fileName, flags, 0644) if err != nil { - return modifiedTsNs, nil + return modifiedTsNs, fmt.Errorf("open file %s: %w", fileName, err) } defer dst.Close() @@ -278,9 +278,11 @@ func writeToFile(client volume_server_pb.VolumeServer_CopyFileClient, fileName s modifiedTsNs = resp.ModifiedTsNs } if receiveErr != nil { - return modifiedTsNs, fmt.Errorf("receiving %s: %v", fileName, receiveErr) + return modifiedTsNs, fmt.Errorf("receiving %s: %w", fileName, receiveErr) + } + if _, writeErr := dst.Write(resp.FileContent); writeErr != nil { + return modifiedTsNs, fmt.Errorf("write file %s: %w", fileName, writeErr) } - dst.Write(resp.FileContent) progressedBytes += int64(len(resp.FileContent)) if progressFn != nil { if !progressFn(progressedBytes) { diff --git a/weed/server/volume_server.go b/weed/server/volume_server.go index 4f8a7fb0d..0647c4196 100644 --- a/weed/server/volume_server.go +++ b/weed/server/volume_server.go @@ -42,20 +42,21 @@ type VolumeServer struct { guard *security.Guard grpcDialOption grpc.DialOption - needleMapKind storage.NeedleMapKind - ldbTimout int64 - FixJpgOrientation bool - ReadMode string - compactionBytePerSecond int64 - metricsAddress string - metricsIntervalSec int - fileSizeLimitBytes int64 - isHeartbeating bool - stopChan chan bool + needleMapKind storage.NeedleMapKind + ldbTimout int64 + FixJpgOrientation bool + ReadMode string + compactionBytePerSecond int64 + maintenanceBytePerSecond int64 + metricsAddress string + metricsIntervalSec int + fileSizeLimitBytes int64 + isHeartbeating bool + stopChan chan bool } func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, 
- port int, grpcPort int, publicUrl string, + port int, grpcPort int, publicUrl string, id string, folders []string, maxCounts []int32, minFreeSpaces []util.MinFreeSpace, diskTypes []types.DiskType, idxFolder string, needleMapKind storage.NeedleMapKind, @@ -65,6 +66,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, fixJpgOrientation bool, readMode string, compactionMBPerSecond int, + maintenanceMBPerSecond int, fileSizeLimitMB int, concurrentUploadLimit int64, concurrentDownloadLimit int64, @@ -94,6 +96,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, ReadMode: readMode, grpcDialOption: security.LoadClientTLS(util.GetViper(), "grpc.volume"), compactionBytePerSecond: int64(compactionMBPerSecond) * 1024 * 1024, + maintenanceBytePerSecond: int64(maintenanceMBPerSecond) * 1024 * 1024, fileSizeLimitBytes: int64(fileSizeLimitMB) * 1024 * 1024, isHeartbeating: true, stopChan: make(chan bool), @@ -114,7 +117,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, vs.checkWithMaster() - vs.store = storage.NewStore(vs.grpcDialOption, ip, port, grpcPort, publicUrl, folders, maxCounts, minFreeSpaces, idxFolder, vs.needleMapKind, diskTypes, ldbTimeout) + vs.store = storage.NewStore(vs.grpcDialOption, ip, port, grpcPort, publicUrl, id, folders, maxCounts, minFreeSpaces, idxFolder, vs.needleMapKind, diskTypes, ldbTimeout) vs.guard = security.NewGuard(whiteList, signingKey, expiresAfterSec, readSigningKey, readExpiresAfterSec) handleStaticResources(adminMux) diff --git a/weed/server/volume_server_handlers_admin.go b/weed/server/volume_server_handlers_admin.go index ec6490662..a54369277 100644 --- a/weed/server/volume_server_handlers_admin.go +++ b/weed/server/volume_server_handlers_admin.go @@ -4,28 +4,33 @@ import ( "net/http" "path/filepath" - "github.com/seaweedfs/seaweedfs/weed/topology" "github.com/seaweedfs/seaweedfs/weed/util/version" "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" "github.com/seaweedfs/seaweedfs/weed/stats" ) +// healthzHandler checks the local health of the volume server. +// It only checks local conditions to avoid cascading failures when remote +// volume servers go down. Previously, this handler checked if all replicated +// volumes could reach their remote replicas, which caused healthy volume +// servers to fail health checks when a peer went down. 
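+// The handler below therefore inspects only local state: whether the store is
+// stopping and whether heartbeats to the master are still being sent.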
+// See https://github.com/seaweedfs/seaweedfs/issues/6823 func (vs *VolumeServer) healthzHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Server", "SeaweedFS Volume "+version.VERSION) - volumeInfos := vs.store.VolumeInfos() - for _, vinfo := range volumeInfos { - if len(vinfo.Collection) == 0 { - continue - } - if vinfo.ReplicaPlacement.GetCopyCount() > 1 { - _, err := topology.GetWritableRemoteReplications(vs.store, vs.grpcDialOption, vinfo.Id, vs.GetMaster) - if err != nil { - w.WriteHeader(http.StatusServiceUnavailable) - return - } - } + + // Check if the server is shutting down + if vs.store.IsStopping() { + w.WriteHeader(http.StatusServiceUnavailable) + return } + + // Check if we can communicate with master + if !vs.isHeartbeating { + w.WriteHeader(http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) } diff --git a/weed/server/volume_server_handlers_read.go b/weed/server/volume_server_handlers_read.go index a29ebd183..1fad742db 100644 --- a/weed/server/volume_server_handlers_read.go +++ b/weed/server/volume_server_handlers_read.go @@ -34,8 +34,6 @@ import ( const reqIsProxied = "proxied" -var fileNameEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`) - func NotFound(w http.ResponseWriter) { stats.VolumeServerHandlerCounter.WithLabelValues(stats.ErrorGetNotFound).Inc() w.WriteHeader(http.StatusNotFound) diff --git a/weed/sftpd/sftp_file_writer.go b/weed/sftpd/sftp_file_writer.go index 0a662d021..fed60eec0 100644 --- a/weed/sftpd/sftp_file_writer.go +++ b/weed/sftpd/sftp_file_writer.go @@ -72,6 +72,7 @@ func (l listerat) ListAt(ls []os.FileInfo, offset int64) (int, error) { type SeaweedSftpFileWriter struct { fs SftpServer req *sftp.Request + absPath string // Absolute path after HomeDir translation mu sync.Mutex tmpFile *os.File permissions os.FileMode @@ -105,6 +106,6 @@ func (w *SeaweedSftpFileWriter) Close() error { return err } - // Stream the file instead of loading it - return w.fs.putFile(w.req.Filepath, w.tmpFile, w.fs.user) + // Stream the file to the absolute path (after HomeDir translation) + return w.fs.putFile(w.absPath, w.tmpFile, w.fs.user) } diff --git a/weed/sftpd/sftp_filer.go b/weed/sftpd/sftp_filer.go index 9baaf41d7..eb196cc28 100644 --- a/weed/sftpd/sftp_filer.go +++ b/weed/sftpd/sftp_filer.go @@ -100,18 +100,26 @@ func (fs *SftpServer) withTimeoutContext(fn func(ctx context.Context) error) err // ==================== Command Dispatcher ==================== func (fs *SftpServer) dispatchCmd(r *sftp.Request) error { - glog.V(0).Infof("Dispatch: %s %s", r.Method, r.Filepath) + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return err + } + glog.V(1).Infof("Dispatch: %s %s (absolute: %s)", r.Method, r.Filepath, absPath) switch r.Method { case "Remove": - return fs.removeEntry(r) + return fs.removeEntry(absPath) case "Rename": - return fs.renameEntry(r) + absTarget, err := fs.toAbsolutePath(r.Target) + if err != nil { + return err + } + return fs.renameEntry(absPath, absTarget) case "Mkdir": - return fs.makeDir(r) + return fs.makeDir(absPath) case "Rmdir": - return fs.removeDir(r) + return fs.removeDir(absPath) case "Setstat": - return fs.setFileStat(r) + return fs.setFileStatWithRequest(absPath, r) default: return fmt.Errorf("unsupported: %s", r.Method) } @@ -120,10 +128,14 @@ func (fs *SftpServer) dispatchCmd(r *sftp.Request) error { // ==================== File Operations ==================== func (fs *SftpServer) readFile(r *sftp.Request) (io.ReaderAt, error) { - if err := 
fs.checkFilePermission(r.Filepath, "read"); err != nil { + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return nil, err + } + if err := fs.checkFilePermission(absPath, "read"); err != nil { return nil, err } - entry, err := fs.getEntry(r.Filepath) + entry, err := fs.getEntry(absPath) if err != nil { return nil, err } @@ -131,7 +143,11 @@ func (fs *SftpServer) readFile(r *sftp.Request) (io.ReaderAt, error) { } func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) { - dir, _ := util.FullPath(r.Filepath).DirAndName() + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return nil, err + } + dir, _ := util.FullPath(absPath).DirAndName() if err := fs.checkFilePermission(dir, "write"); err != nil { glog.Errorf("Permission denied for %s", dir) return nil, err @@ -145,6 +161,7 @@ func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) { return &SeaweedSftpFileWriter{ fs: *fs, req: r, + absPath: absPath, tmpFile: tmpFile, permissions: 0644, uid: fs.user.Uid, @@ -153,16 +170,20 @@ func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) { }, nil } -func (fs *SftpServer) removeEntry(r *sftp.Request) error { - return fs.deleteEntry(r.Filepath, false) +func (fs *SftpServer) removeEntry(absPath string) error { + return fs.deleteEntry(absPath, false) } -func (fs *SftpServer) renameEntry(r *sftp.Request) error { - if err := fs.checkFilePermission(r.Filepath, "rename"); err != nil { +func (fs *SftpServer) renameEntry(absPath, absTarget string) error { + if err := fs.checkFilePermission(absPath, "rename"); err != nil { + return err + } + targetDir, _ := util.FullPath(absTarget).DirAndName() + if err := fs.checkFilePermission(targetDir, "write"); err != nil { return err } - oldDir, oldName := util.FullPath(r.Filepath).DirAndName() - newDir, newName := util.FullPath(r.Target).DirAndName() + oldDir, oldName := util.FullPath(absPath).DirAndName() + newDir, newName := util.FullPath(absTarget).DirAndName() return fs.callWithClient(false, func(ctx context.Context, client filer_pb.SeaweedFilerClient) error { _, err := client.AtomicRenameEntry(ctx, &filer_pb.AtomicRenameEntryRequest{ OldDirectory: oldDir, OldName: oldName, @@ -172,15 +193,15 @@ func (fs *SftpServer) renameEntry(r *sftp.Request) error { }) } -func (fs *SftpServer) setFileStat(r *sftp.Request) error { - if err := fs.checkFilePermission(r.Filepath, "write"); err != nil { +func (fs *SftpServer) setFileStatWithRequest(absPath string, r *sftp.Request) error { + if err := fs.checkFilePermission(absPath, "write"); err != nil { return err } - entry, err := fs.getEntry(r.Filepath) + entry, err := fs.getEntry(absPath) if err != nil { return err } - dir, _ := util.FullPath(r.Filepath).DirAndName() + dir, _ := util.FullPath(absPath).DirAndName() // apply attrs if r.AttrFlags().Permissions { entry.Attributes.FileMode = uint32(r.Attributes().FileMode()) @@ -201,18 +222,22 @@ func (fs *SftpServer) setFileStat(r *sftp.Request) error { // ==================== Directory Operations ==================== func (fs *SftpServer) listDir(r *sftp.Request) (sftp.ListerAt, error) { - if err := fs.checkFilePermission(r.Filepath, "list"); err != nil { + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return nil, err + } + if err := fs.checkFilePermission(absPath, "list"); err != nil { return nil, err } if r.Method == "Stat" || r.Method == "Lstat" { - entry, err := fs.getEntry(r.Filepath) + entry, err := fs.getEntry(absPath) if err != nil { return nil, err } fi := 
&EnhancedFileInfo{FileInfo: FileInfoFromEntry(entry), uid: entry.Attributes.Uid, gid: entry.Attributes.Gid} return listerat([]os.FileInfo{fi}), nil } - return fs.listAllPages(r.Filepath) + return fs.listAllPages(absPath) } func (fs *SftpServer) listAllPages(dirPath string) (sftp.ListerAt, error) { @@ -259,18 +284,19 @@ func (fs *SftpServer) fetchDirectoryPage(dirPath, start string) ([]os.FileInfo, } // makeDir creates a new directory with proper permissions. -func (fs *SftpServer) makeDir(r *sftp.Request) error { +func (fs *SftpServer) makeDir(absPath string) error { if fs.user == nil { return fmt.Errorf("cannot create directory: no user info") } - dir, name := util.FullPath(r.Filepath).DirAndName() - if err := fs.checkFilePermission(r.Filepath, "mkdir"); err != nil { + dir, name := util.FullPath(absPath).DirAndName() + if err := fs.checkFilePermission(dir, "write"); err != nil { return err } // default mode and ownership err := filer_pb.Mkdir(context.Background(), fs, string(dir), name, func(entry *filer_pb.Entry) { mode := uint32(0755 | os.ModeDir) - if strings.HasPrefix(r.Filepath, fs.user.HomeDir) { + // Defensive check: all paths should be under HomeDir after toAbsolutePath translation + if absPath == fs.user.HomeDir || strings.HasPrefix(absPath, fs.user.HomeDir+"/") { mode = uint32(0700 | os.ModeDir) } entry.Attributes.FileMode = mode @@ -288,8 +314,8 @@ func (fs *SftpServer) makeDir(r *sftp.Request) error { } // removeDir deletes a directory. -func (fs *SftpServer) removeDir(r *sftp.Request) error { - return fs.deleteEntry(r.Filepath, false) +func (fs *SftpServer) removeDir(absPath string) error { + return fs.deleteEntry(absPath, false) } func (fs *SftpServer) putFile(filepath string, reader io.Reader, user *user.User) error { diff --git a/weed/sftpd/sftp_server.go b/weed/sftpd/sftp_server.go index f158aeb64..e53098e6b 100644 --- a/weed/sftpd/sftp_server.go +++ b/weed/sftpd/sftp_server.go @@ -6,6 +6,8 @@ import ( "fmt" "io" "os" + "path" + "strings" "time" "github.com/pkg/sftp" @@ -37,6 +39,28 @@ func NewSftpServer(filerAddr pb.ServerAddress, grpcDialOption grpc.DialOption, d } } +// toAbsolutePath translates a user-relative path to an absolute filer path. +// When a user has HomeDir="/sftp/user", their view of "/" maps to "/sftp/user". +// This implements chroot-like behavior where the user's home directory +// becomes their root. +func (fs *SftpServer) toAbsolutePath(userPath string) (string, error) { + // If user has root as home directory, no translation needed + if fs.user.HomeDir == "" || fs.user.HomeDir == "/" { + return path.Clean(userPath), nil + } + + // Concatenate home directory with user path, then clean to resolve any ".." components + p := path.Join(fs.user.HomeDir, strings.TrimPrefix(userPath, "/")) + + // Security check: ensure the final path is within the home directory. + // This prevents path traversal attacks like `../..` that could escape the chroot jail. + if !strings.HasPrefix(p, fs.user.HomeDir+"/") && p != fs.user.HomeDir { + return "", fmt.Errorf("path traversal attempt: %s resolves to %s which is outside home dir %s", userPath, p, fs.user.HomeDir) + } + + return p, nil +} + // Fileread is invoked for “get” requests. 
func (fs *SftpServer) Fileread(req *sftp.Request) (io.ReaderAt, error) { return fs.readFile(req) diff --git a/weed/sftpd/sftp_server_test.go b/weed/sftpd/sftp_server_test.go new file mode 100644 index 000000000..0af94ca14 --- /dev/null +++ b/weed/sftpd/sftp_server_test.go @@ -0,0 +1,103 @@ +package sftpd + +import ( + "testing" + + "github.com/seaweedfs/seaweedfs/weed/sftpd/user" + "github.com/stretchr/testify/assert" +) + +func stringPtr(s string) *string { + return &s +} + +func TestToAbsolutePath(t *testing.T) { + tests := []struct { + name string + homeDir *string // Use pointer to distinguish between unset and empty + userPath string + expected string + expectError bool + }{ + { + name: "normal path", + userPath: "/foo.txt", + expected: "/sftp/testuser/foo.txt", + }, + { + name: "root path", + userPath: "/", + expected: "/sftp/testuser", + }, + { + name: "path with dot", + userPath: "/./foo.txt", + expected: "/sftp/testuser/foo.txt", + }, + { + name: "path traversal attempts", + userPath: "/../foo.txt", + expectError: true, + }, + { + name: "path traversal attempts 2", + userPath: "../../foo.txt", + expectError: true, + }, + { + name: "path traversal attempts 3", + userPath: "/subdir/../../foo.txt", + expectError: true, + }, + { + name: "empty path", + userPath: "", + expected: "/sftp/testuser", + }, + { + name: "multiple slashes", + userPath: "//foo.txt", + expected: "/sftp/testuser/foo.txt", + }, + { + name: "trailing slash", + userPath: "/foo/", + expected: "/sftp/testuser/foo", + }, + { + name: "empty HomeDir passthrough", + homeDir: stringPtr(""), + userPath: "/foo.txt", + expected: "/foo.txt", + }, + { + name: "root HomeDir passthrough", + homeDir: stringPtr("/"), + userPath: "/foo.txt", + expected: "/foo.txt", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + homeDir := "/sftp/testuser" // default + if tt.homeDir != nil { + homeDir = *tt.homeDir + } + + fs := &SftpServer{ + user: &user.User{ + HomeDir: homeDir, + }, + } + + got, err := fs.toAbsolutePath(tt.userPath) + if tt.expectError { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expected, got) + } + }) + } +} diff --git a/weed/sftpd/sftp_service.go b/weed/sftpd/sftp_service.go index e50bd87ba..4d21815a9 100644 --- a/weed/sftpd/sftp_service.go +++ b/weed/sftpd/sftp_service.go @@ -284,8 +284,8 @@ func (s *SFTPService) handleChannel(newChannel ssh.NewChannel, fs *SftpServer) { // handleSFTP starts the SFTP server on the SSH channel. 
func (s *SFTPService) handleSFTP(channel ssh.Channel, fs *SftpServer) { - // Create server options with initial working directory set to user's home - serverOptions := sftp.WithStartDirectory(fs.user.HomeDir) + // Start at virtual root "/" - toAbsolutePath translates this to the user's HomeDir + serverOptions := sftp.WithStartDirectory("/") server := sftp.NewRequestServer(channel, sftp.Handlers{ FileGet: fs, FilePut: fs, diff --git a/weed/sftpd/user/filestore.go b/weed/sftpd/user/filestore.go index c522a388a..4c372aa76 100644 --- a/weed/sftpd/user/filestore.go +++ b/weed/sftpd/user/filestore.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "os" + "path" "sync" "golang.org/x/crypto/ssh" @@ -99,6 +100,10 @@ func (s *FileStore) loadUsers() error { user.PublicKeys[i] = string(pubKey.Marshal()) } } + // Clean HomeDir to handle trailing slashes and normalize path + if user.HomeDir != "" { + user.HomeDir = path.Clean(user.HomeDir) + } s.users[user.Username] = user } diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go index dbb64e239..4d775000f 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -64,9 +64,9 @@ func (c *commandVolumeCheckDisk) Help() string { append entries in B and not in A to A optionally, for each non-writable volume replica A - if volume is not full + select a writable volume replica B + if entries in A don't match B prune late volume entries not matching its index file - select a writable volume replica B append missing entries from B into A mark the volume as writable (healthy) @@ -179,9 +179,16 @@ func (vcd *volumeCheckDisk) checkWritableVolumes(volumeReplicas map[uint32][]*Vo writableReplicas = append(writableReplicas[:1], writableReplicas[2:]...) continue } - if err := vcd.syncTwoReplicas(a, b, true); err != nil { - vcd.write("sync volume %d on %s and %s: %v", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err) + + modified, err := vcd.syncTwoReplicas(a, b, true) + if err != nil { + vcd.write("failed to sync volumes %d on %s and %s: %v", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err) + } else { + if modified { + vcd.write("synced %s and %s for volume %d", a.location.dataNode.Id, b.location.dataNode.Id, a.info.Id) + } } + // always choose the larger volume to be the source if a.info.FileCount > b.info.FileCount { writableReplicas = append(writableReplicas[:1], writableReplicas[2:]...) @@ -280,19 +287,25 @@ func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*Vo return err } - // ...fix it... - // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes. - if err := vcd.syncTwoReplicas(source, r, false); err != nil { - vcd.write("sync read-only volume %d on %s from %s: %v\n", vid, r.location.dataNode.Id, source.location.dataNode.Id, err) + // ...try to fix it... + // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes... + modified, err := vcd.syncTwoReplicas(source, r, false) + if err != nil { + vcd.write("sync read-only volume %d on %s from %s: %v", vid, r.location.dataNode.Id, source.location.dataNode.Id, err) - // ...or revert it back to read-only, if something went wrong. - // TODO: we should keep unchanged volumes as read-only, so we don't modify valid volumes which are full. 
 				if roErr := vcd.makeVolumeReadonly(vid, r); roErr != nil {
-					return fmt.Errorf("failed to make volume %d on %s readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr)
+					return fmt.Errorf("failed to revert volume %d on %s to readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr)
 				}
-				vcd.write("volume %d on %s is now read-only\n", vid, r.location.dataNode.Id)
-				return err
+			} else {
+				if modified {
+					vcd.write("volume %d on %s is now synced to %s and writable", vid, r.location.dataNode.Id, source.location.dataNode.Id)
+				} else {
+					// ...or revert to read-only, if no changes were made.
+					if err := vcd.makeVolumeReadonly(vid, r); err != nil {
+						return fmt.Errorf("failed to revert volume %d on %s to readonly: %v", vid, r.location.dataNode.Id, err)
+					}
+				}
 			}
 
 			return nil
@@ -411,35 +424,39 @@ func (vcd *volumeCheckDisk) shouldSkipVolume(a, b *VolumeReplica) (bool, error)
 
 // syncTwoReplicas attempts to sync all entries from a source volume replica into a target. If bi-directional mode
 // is enabled, changes from target are also synced back into the source.
-func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi bool) (err error) {
+// Returns true if source and/or target were modified, false otherwise.
+func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi bool) (modified bool, err error) {
 	sourceHasChanges, targetHasChanges := true, true
 	const maxIterations = 5
 	iteration := 0
+	modified = false
+
 	for (sourceHasChanges || targetHasChanges) && iteration < maxIterations {
 		iteration++
 		vcd.writeVerbose("sync iteration %d/%d for volume %d", iteration, maxIterations, source.info.Id)
 		prevSourceHasChanges, prevTargetHasChanges := sourceHasChanges, targetHasChanges
 		if sourceHasChanges, targetHasChanges, err = vcd.checkBoth(source, target, bidi); err != nil {
-			return err
+			return modified, err
 		}
+		modified = modified || sourceHasChanges || targetHasChanges
 
 		// Detect if we're stuck in a loop with no progress
 		if iteration > 1 && prevSourceHasChanges == sourceHasChanges && prevTargetHasChanges == targetHasChanges &&
 			(sourceHasChanges || targetHasChanges) {
 			vcd.write("volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop",
 				source.info.Id, source.location.dataNode.Id, target.location.dataNode.Id, iteration)
-			return fmt.Errorf("sync not making progress after %d iterations", iteration)
+			return modified, fmt.Errorf("sync not making progress after %d iterations", iteration)
 		}
 	}
 
 	if iteration >= maxIterations && (sourceHasChanges || targetHasChanges) {
 		vcd.write("volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention",
 			source.info.Id, maxIterations, source.location.dataNode.Id, target.location.dataNode.Id)
-		return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
+		return modified, fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
 	}
 
-	return nil
+	return modified, nil
 }
 
 // checkBoth performs a sync between source and target volume replicas. If bi-directional mode is enabled, changes from target are also synced back into the source.
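Editorial note: the convergence behavior of syncTwoReplicas above reduces to a small standalone pattern. The sketch below is illustrative only; checkBoth is a stand-in for the real compare-and-repair step, and all names are hypothetical.

    package replica

    import "fmt"

    // converge repeatedly applies a compare-and-repair step until neither side
    // reports pending changes, stopping early when progress stalls and giving
    // up after a fixed number of passes. It reports whether any pass made changes.
    func converge(checkBoth func() (aChanged, bChanged bool, err error)) (modified bool, err error) {
    	const maxIterations = 5
    	aHas, bHas := true, true
    	for i := 1; (aHas || bHas) && i <= maxIterations; i++ {
    		prevA, prevB := aHas, bHas
    		if aHas, bHas, err = checkBoth(); err != nil {
    			return modified, err
    		}
    		modified = modified || aHas || bHas
    		// Two consecutive passes reporting identical pending changes means
    		// the sync is stuck; fail instead of looping forever.
    		if i > 1 && prevA == aHas && prevB == bHas && (aHas || bHas) {
    			return modified, fmt.Errorf("sync not making progress after %d iterations", i)
    		}
    	}
    	if aHas || bHas {
    		return modified, fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
    	}
    	return modified, nil
    }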
@@ -628,7 +645,7 @@ func (vcd *volumeCheckDisk) copyVolumeIndexFile(collection string, volumeId uint copyFileClient, err := volumeServerClient.CopyFile(context.Background(), &volume_server_pb.CopyFileRequest{ VolumeId: volumeId, - Ext: ".idx", + Ext: ext, CompactionRevision: math.MaxUint32, StopOffset: math.MaxInt64, Collection: collection, diff --git a/weed/shell/command_volume_server_evacuate.go b/weed/shell/command_volume_server_evacuate.go index 087eeddca..00c8b6b0a 100644 --- a/weed/shell/command_volume_server_evacuate.go +++ b/weed/shell/command_volume_server_evacuate.go @@ -4,7 +4,6 @@ import ( "flag" "fmt" "io" - "os" "slices" @@ -159,6 +158,9 @@ func (c *commandVolumeServerEvacuate) evacuateNormalVolumes(commandEnv *CommandE func (c *commandVolumeServerEvacuate) evacuateEcVolumes(commandEnv *CommandEnv, volumeServer string, skipNonMoveable, applyChange bool, writer io.Writer) error { // Evacuate EC volumes for all disk types // We need to handle each disk type separately because shards should be moved to nodes with the same disk type + // We collect topology once at the start and track capacity changes ourselves + // (via freeEcSlot decrement after each move) rather than repeatedly refreshing, + // which would give a false sense of correctness since topology could be stale. diskTypes := []types.DiskType{types.HardDriveType, types.SsdType} for _, diskType := range diskTypes { @@ -176,9 +178,9 @@ func (c *commandVolumeServerEvacuate) evacuateEcVolumes(commandEnv *CommandEnv, continue } for _, ecShardInfo := range diskInfo.EcShardInfos { - hasMoved, err := c.moveAwayOneEcVolume(commandEnv, ecShardInfo, thisNode, otherNodes, applyChange, diskType) + hasMoved, err := c.moveAwayOneEcVolume(commandEnv, ecShardInfo, thisNode, otherNodes, applyChange, diskType, writer) if err != nil { - fmt.Fprintf(writer, "move away volume %d from %s: %v", ecShardInfo.Id, volumeServer, err) + fmt.Fprintf(writer, "move away volume %d from %s: %v\n", ecShardInfo.Id, volumeServer, err) } if !hasMoved { if skipNonMoveable { @@ -193,14 +195,31 @@ func (c *commandVolumeServerEvacuate) evacuateEcVolumes(commandEnv *CommandEnv, return nil } -func (c *commandVolumeServerEvacuate) moveAwayOneEcVolume(commandEnv *CommandEnv, ecShardInfo *master_pb.VolumeEcShardInformationMessage, thisNode *EcNode, otherNodes []*EcNode, applyChange bool, diskType types.DiskType) (hasMoved bool, err error) { +func (c *commandVolumeServerEvacuate) moveAwayOneEcVolume(commandEnv *CommandEnv, ecShardInfo *master_pb.VolumeEcShardInformationMessage, thisNode *EcNode, otherNodes []*EcNode, applyChange bool, diskType types.DiskType, writer io.Writer) (hasMoved bool, err error) { for _, shardId := range erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIds() { + // Sort by: 1) fewest shards of this volume, 2) most free EC slots + // This ensures we prefer nodes with capacity and balanced shard distribution slices.SortFunc(otherNodes, func(a, b *EcNode) int { - return a.localShardIdCount(ecShardInfo.Id) - b.localShardIdCount(ecShardInfo.Id) + aShards := a.localShardIdCount(ecShardInfo.Id) + bShards := b.localShardIdCount(ecShardInfo.Id) + if aShards != bShards { + return aShards - bShards // Prefer fewer shards + } + return b.freeEcSlot - a.freeEcSlot // Then prefer more free slots }) + + shardMoved := false + skippedNodes := 0 for i := 0; i < len(otherNodes); i++ { emptyNode := otherNodes[i] + + // Skip nodes with no free EC slots + if emptyNode.freeEcSlot <= 0 { + skippedNodes++ + continue + } + collectionPrefix := "" if 
ecShardInfo.Collection != "" { collectionPrefix = ecShardInfo.Collection + "_" @@ -209,19 +228,29 @@ func (c *commandVolumeServerEvacuate) moveAwayOneEcVolume(commandEnv *CommandEnv // For evacuation, prefer same disk type but allow fallback to other types destDiskId := pickBestDiskOnNode(emptyNode, vid, diskType, false) if destDiskId > 0 { - fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s (disk %d)\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id, destDiskId) + fmt.Fprintf(writer, "moving ec volume %s%d.%d %s => %s (disk %d)\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id, destDiskId) } else { - fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id) + fmt.Fprintf(writer, "moving ec volume %s%d.%d %s => %s\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id) } err = moveMountedShardToEcNode(commandEnv, thisNode, ecShardInfo.Collection, vid, shardId, emptyNode, destDiskId, applyChange, diskType) if err != nil { + hasMoved = false return } else { hasMoved = true + shardMoved = true + // Update the node's free slot count after successful move + emptyNode.freeEcSlot-- break } } - if !hasMoved { + if !shardMoved { + if skippedNodes > 0 { + fmt.Fprintf(writer, "no available destination for ec shard %d.%d: %d nodes have no free slots\n", + ecShardInfo.Id, shardId, skippedNodes) + } + // Ensure partial moves are reported as failures to prevent data loss + hasMoved = false return } } diff --git a/weed/storage/needle/needle_parse_upload.go b/weed/storage/needle/needle_parse_upload.go index 89708303d..6fadd80d6 100644 --- a/weed/storage/needle/needle_parse_upload.go +++ b/weed/storage/needle/needle_parse_upload.go @@ -128,7 +128,7 @@ func parseUpload(r *http.Request, sizeLimit int64, pu *ParsedUpload) (e error) { pu.FileName = part.FileName() if pu.FileName != "" { - pu.FileName = path.Base(pu.FileName) + pu.FileName = util.CleanWindowsPathBase(pu.FileName) } dataSize, e = pu.bytesBuffer.ReadFrom(io.LimitReader(part, sizeLimit+1)) @@ -169,7 +169,7 @@ func parseUpload(r *http.Request, sizeLimit int64, pu *ParsedUpload) (e error) { // update pu.Data = pu.bytesBuffer.Bytes() - pu.FileName = path.Base(fName) + pu.FileName = util.CleanWindowsPathBase(fName) contentType = part.Header.Get("Content-Type") part = part2 break @@ -207,7 +207,7 @@ func parseUpload(r *http.Request, sizeLimit int64, pu *ParsedUpload) (e error) { } if pu.FileName != "" { - pu.FileName = path.Base(pu.FileName) + pu.FileName = util.CleanWindowsPathBase(pu.FileName) } else { pu.FileName = path.Base(r.URL.Path) } diff --git a/weed/storage/store.go b/weed/storage/store.go index cc07f8702..7a336d1ff 100644 --- a/weed/storage/store.go +++ b/weed/storage/store.go @@ -63,6 +63,7 @@ type Store struct { Port int GrpcPort int PublicUrl string + Id string // volume server id, independent of ip:port for stable identification Locations []*DiskLocation dataCenter string // optional information, overwriting master setting if exists rack string // optional information, overwriting master setting if exists @@ -76,13 +77,13 @@ type Store struct { } func (s *Store) String() (str string) { - str = fmt.Sprintf("Ip:%s, Port:%d, GrpcPort:%d PublicUrl:%s, dataCenter:%s, rack:%s, connected:%v, volumeSizeLimit:%d", s.Ip, s.Port, s.GrpcPort, s.PublicUrl, s.dataCenter, s.rack, s.connected, s.GetVolumeSizeLimit()) + str = fmt.Sprintf("Id:%s, Ip:%s, Port:%d, GrpcPort:%d 
diff --git a/weed/storage/store.go b/weed/storage/store.go
index cc07f8702..7a336d1ff 100644
--- a/weed/storage/store.go
+++ b/weed/storage/store.go
@@ -63,6 +63,7 @@ type Store struct {
 	Port       int
 	GrpcPort   int
 	PublicUrl  string
+	Id         string // volume server id, independent of ip:port for stable identification
 	Locations  []*DiskLocation
 	dataCenter string // optional information, overwriting master setting if exists
 	rack       string // optional information, overwriting master setting if exists
@@ -76,13 +77,13 @@ type Store struct {
 }
 
 func (s *Store) String() (str string) {
-	str = fmt.Sprintf("Ip:%s, Port:%d, GrpcPort:%d PublicUrl:%s, dataCenter:%s, rack:%s, connected:%v, volumeSizeLimit:%d", s.Ip, s.Port, s.GrpcPort, s.PublicUrl, s.dataCenter, s.rack, s.connected, s.GetVolumeSizeLimit())
+	str = fmt.Sprintf("Id:%s, Ip:%s, Port:%d, GrpcPort:%d PublicUrl:%s, dataCenter:%s, rack:%s, connected:%v, volumeSizeLimit:%d", s.Id, s.Ip, s.Port, s.GrpcPort, s.PublicUrl, s.dataCenter, s.rack, s.connected, s.GetVolumeSizeLimit())
 	return
 }
 
-func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int, publicUrl string, dirnames []string, maxVolumeCounts []int32,
+func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int, publicUrl string, id string, dirnames []string, maxVolumeCounts []int32,
 	minFreeSpaces []util.MinFreeSpace, idxFolder string, needleMapKind NeedleMapKind, diskTypes []DiskType, ldbTimeout int64) (s *Store) {
-	s = &Store{grpcDialOption: grpcDialOption, Port: port, Ip: ip, GrpcPort: grpcPort, PublicUrl: publicUrl, NeedleMapKind: needleMapKind}
+	s = &Store{grpcDialOption: grpcDialOption, Port: port, Ip: ip, GrpcPort: grpcPort, PublicUrl: publicUrl, Id: id, NeedleMapKind: needleMapKind}
 	s.Locations = make([]*DiskLocation, 0)
 	var wg sync.WaitGroup
@@ -414,6 +415,7 @@ func (s *Store) CollectHeartbeat() *master_pb.Heartbeat {
 		Port:            uint32(s.Port),
 		GrpcPort:        uint32(s.GrpcPort),
 		PublicUrl:       s.PublicUrl,
+		Id:              s.Id,
 		MaxVolumeCounts: maxVolumeCounts,
 		MaxFileKey:      NeedleIdToUint64(maxFileKey),
 		DataCenter:      s.dataCenter,
@@ -467,6 +469,10 @@ func (s *Store) SetStopping() {
 	}
 }
 
+func (s *Store) IsStopping() bool {
+	return s.isStopping
+}
+
 func (s *Store) LoadNewVolumes() {
 	for _, location := range s.Locations {
 		location.loadExistingVolumes(s.NeedleMapKind, 0)
diff --git a/weed/storage/store_ec_delete.go b/weed/storage/store_ec_delete.go
index a3e028bbb..9fcb092a2 100644
--- a/weed/storage/store_ec_delete.go
+++ b/weed/storage/store_ec_delete.go
@@ -3,6 +3,7 @@ package storage
 import (
 	"context"
 	"fmt"
+	"github.com/seaweedfs/seaweedfs/weed/pb"
 
 	"github.com/seaweedfs/seaweedfs/weed/glog"
@@ -21,7 +22,8 @@ func (s *Store) DeleteEcShardNeedle(ecVolume *erasure_coding.EcVolume, n *needle
 		return 0, err
 	}
 
-	if cookie != n.Cookie {
+	// cookie == 0 indicates SkipCookieCheck was requested (e.g., orphan cleanup)
+	if cookie != 0 && cookie != n.Cookie {
 		return 0, fmt.Errorf("unexpected cookie %x", cookie)
 	}
@@ -45,22 +47,17 @@ func (s *Store) doDeleteNeedleFromAtLeastOneRemoteEcShards(ecVolume *erasure_cod
 
 	shardId, _ := intervals[0].ToShardIdAndOffset(erasure_coding.ErasureCodingLargeBlockSize, erasure_coding.ErasureCodingSmallBlockSize)
 
-	hasDeletionSuccess := false
 	err = s.doDeleteNeedleFromRemoteEcShardServers(shardId, ecVolume, needleId)
 	if err == nil {
-		hasDeletionSuccess = true
+		return nil
 	}
 
 	for shardId = erasure_coding.DataShardsCount; shardId < erasure_coding.TotalShardsCount; shardId++ {
 		if parityDeletionError := s.doDeleteNeedleFromRemoteEcShardServers(shardId, ecVolume, needleId); parityDeletionError == nil {
-			hasDeletionSuccess = true
+			return nil
 		}
 	}
 
-	if hasDeletionSuccess {
-		return nil
-	}
-
 	return err
 }
@@ -77,11 +74,9 @@ func (s *Store) doDeleteNeedleFromRemoteEcShardServers(shardId erasure_coding.Sh
 	for _, sourceDataNode := range sourceDataNodes {
 		glog.V(4).Infof("delete from remote ec shard %d.%d from %s", ecVolume.VolumeId, shardId, sourceDataNode)
-		err := s.doDeleteNeedleFromRemoteEcShard(sourceDataNode, ecVolume.VolumeId, ecVolume.Collection, ecVolume.Version, needleId)
-		if err != nil {
+		if err := s.doDeleteNeedleFromRemoteEcShard(sourceDataNode, ecVolume.VolumeId, ecVolume.Collection, ecVolume.Version, needleId); err != nil {
 			return err
 		}
-		glog.V(1).Infof("delete from remote ec shard %d.%d from %s: %v", ecVolume.VolumeId, shardId, sourceDataNode, err)
 	}
 
 	return nil
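After this refactor, doDeleteNeedleFromAtLeastOneRemoteEcShards returns on the first shard server that accepts the delete: the data shard owning the needle is tried first, then each parity shard as a fallback. A standalone sketch of that control flow; the names and signature are illustrative, not the SeaweedFS API:

// deleteFromAnyShard tries the owning data shard first, then parity shards,
// and returns nil on the first success. On total failure it reports the
// error from the data-shard attempt, which is usually the most informative.
func deleteFromAnyShard(try func(shardId int) error, owningShard, dataShards, totalShards int) error {
	err := try(owningShard)
	if err == nil {
		return nil
	}
	for id := dataShards; id < totalShards; id++ {
		if try(id) == nil {
			return nil
		}
	}
	return err
}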
diff --git a/weed/storage/store_load_balancing_test.go b/weed/storage/store_load_balancing_test.go
index 15e709d53..35475a6ae 100644
--- a/weed/storage/store_load_balancing_test.go
+++ b/weed/storage/store_load_balancing_test.go
@@ -31,7 +31,7 @@ func newTestStore(t *testing.T, numDirs int) *Store {
 		diskTypes = append(diskTypes, types.HardDriveType)
 	}
 
-	store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080",
+	store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080", "",
 		dirs, maxCounts, minFreeSpaces, "", NeedleMapInMemory, diskTypes, 3)
 
 	// Consume channel messages to prevent blocking
diff --git a/weed/topology/data_node.go b/weed/topology/data_node.go
index 4f2dbe464..07e00ac0a 100644
--- a/weed/topology/data_node.go
+++ b/weed/topology/data_node.go
@@ -269,6 +269,7 @@ func (dn *DataNode) ToDataNodeInfo() *master_pb.DataNodeInfo {
 		Id:        string(dn.Id()),
 		DiskInfos: make(map[string]*master_pb.DiskInfo),
 		GrpcPort:  uint32(dn.GrpcPort),
+		Address:   dn.Url(), // ip:port for connecting to the volume server
 	}
 	for _, c := range dn.Children() {
 		disk := c.(*Disk)
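Because a DataNode's Id may now be an arbitrary stable name rather than ip:port, the new Address field carries the dialable endpoint. A hedged sketch of how a topology consumer might pick a dial target, assuming the accompanying protobuf change adds Address to master_pb.DataNodeInfo; the helper name is hypothetical:

// dialTarget prefers the explicit Address, falling back to Id, which on
// volume servers that predate this change was always ip:port.
func dialTarget(dn *master_pb.DataNodeInfo) string {
	if dn.Address != "" {
		return dn.Address
	}
	return dn.Id
}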
diff --git a/weed/topology/rack.go b/weed/topology/rack.go
index f526cd84d..1e5c8b632 100644
--- a/weed/topology/rack.go
+++ b/weed/topology/rack.go
@@ -5,6 +5,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/seaweedfs/seaweedfs/weed/glog"
 	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
 	"github.com/seaweedfs/seaweedfs/weed/storage/types"
 	"github.com/seaweedfs/seaweedfs/weed/util"
@@ -34,17 +35,73 @@ func (r *Rack) FindDataNode(ip string, port int) *DataNode {
 	}
 	return nil
 }
-func (r *Rack) GetOrCreateDataNode(ip string, port int, grpcPort int, publicUrl string, maxVolumeCounts map[string]uint32) *DataNode {
+
+// FindDataNodeById finds a DataNode by its ID using O(1) map lookup
+func (r *Rack) FindDataNodeById(id string) *DataNode {
+	r.RLock()
+	defer r.RUnlock()
+	if c, ok := r.children[NodeId(id)]; ok {
+		return c.(*DataNode)
+	}
+	return nil
+}
+
+func (r *Rack) GetOrCreateDataNode(ip string, port int, grpcPort int, publicUrl string, id string, maxVolumeCounts map[string]uint32) *DataNode {
 	r.Lock()
 	defer r.Unlock()
-	for _, c := range r.children {
+
+	// Normalize the id parameter (trim whitespace)
+	id = strings.TrimSpace(id)
+
+	// Determine the node ID: use provided id, or fall back to ip:port for backward compatibility
+	nodeId := util.GetVolumeServerId(id, ip, port)
+
+	// First, try to find by node ID using O(1) map lookup (stable identity)
+	if c, ok := r.children[NodeId(nodeId)]; ok {
 		dn := c.(*DataNode)
-		if dn.MatchLocation(ip, port) {
-			dn.LastSeen = time.Now().Unix()
-			return dn
+		// Log if IP or Port changed (e.g., pod rescheduled in K8s)
+		if dn.Ip != ip || dn.Port != port {
+			glog.V(0).Infof("DataNode %s address changed from %s:%d to %s:%d", nodeId, dn.Ip, dn.Port, ip, port)
 		}
+		// Update the IP/Port in case they changed
+		dn.Ip = ip
+		dn.Port = port
+		dn.GrpcPort = grpcPort
+		dn.PublicUrl = publicUrl
+		dn.LastSeen = time.Now().Unix()
+		return dn
 	}
-	dn := NewDataNode(util.JoinHostPort(ip, port))
+
+	// For backward compatibility: if explicit id was provided, also check by ip:port
+	// to handle transition from old (ip:port) to new (explicit id) behavior
+	ipPortId := util.JoinHostPort(ip, port)
+	if nodeId != ipPortId {
+		for oldId, c := range r.children {
+			dn := c.(*DataNode)
+			if dn.MatchLocation(ip, port) {
+				// Only transition if the oldId exactly matches ip:port (legacy identification).
+				// If oldId is different, this is a node with an explicit id that happens to
+				// reuse the same ip:port - don't incorrectly merge them.
+				if string(oldId) != ipPortId {
+					glog.Warningf("Volume server with id %s has ip:port %s which is used by node %s", nodeId, ipPortId, oldId)
+					continue
+				}
+				// Found a legacy node identified by ip:port, transition it to use the new explicit id
+				glog.V(0).Infof("Volume server %s transitioning id from %s to %s", dn.Url(), oldId, nodeId)
+				// Re-key the node in the children map with the new id
+				delete(r.children, oldId)
+				dn.id = NodeId(nodeId)
+				r.children[NodeId(nodeId)] = dn
+				// Update connection info in case they changed
+				dn.GrpcPort = grpcPort
+				dn.PublicUrl = publicUrl
+				dn.LastSeen = time.Now().Unix()
+				return dn
+			}
+		}
+	}
+
+	dn := NewDataNode(nodeId)
 	dn.Ip = ip
 	dn.Port = port
 	dn.GrpcPort = grpcPort
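The subtle step in the transition branch above is the re-key: a node's id doubles as its key in the rack's children map, so adopting a new id requires deleting the old entry and re-inserting under the new key, or FindDataNodeById would miss it. The pattern in isolation, as a generic sketch; Test 6 in the new test below exercises the real thing:

// rekey moves an entry from oldKey to newKey; a minimal sketch of the
// delete-then-reinsert pattern used when a node's identity changes.
func rekey[V any](m map[string]V, oldKey, newKey string) {
	if v, ok := m[oldKey]; ok {
		delete(m, oldKey)
		m[newKey] = v
	}
}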
"node-1", maxVolumeCounts) + if string(dn1.Id()) != "node-1" { + t.Errorf("expected node id 'node-1', got '%s'", dn1.Id()) + } + if dn1.Ip != "10.0.0.1" { + t.Errorf("expected ip '10.0.0.1', got '%s'", dn1.Ip) + } + + // Test 2: Same id with different IP should return the same DataNode (K8s pod reschedule scenario) + dn2 := rack.GetOrCreateDataNode("10.0.0.2", 8080, 18080, "10.0.0.2:8080", "node-1", maxVolumeCounts) + if dn1 != dn2 { + t.Errorf("expected same DataNode for same id, got different nodes") + } + // IP should be updated to the new value + if dn2.Ip != "10.0.0.2" { + t.Errorf("expected ip to be updated to '10.0.0.2', got '%s'", dn2.Ip) + } + if dn2.PublicUrl != "10.0.0.2:8080" { + t.Errorf("expected publicUrl to be updated to '10.0.0.2:8080', got '%s'", dn2.PublicUrl) + } + + // Test 3: Different id should create a new DataNode + dn3 := rack.GetOrCreateDataNode("10.0.0.3", 8080, 18080, "10.0.0.3:8080", "node-2", maxVolumeCounts) + if string(dn3.Id()) != "node-2" { + t.Errorf("expected node id 'node-2', got '%s'", dn3.Id()) + } + if dn1 == dn3 { + t.Errorf("expected different DataNode for different id") + } + + // Test 4: Empty id should fall back to ip:port (backward compatibility) + dn4 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "", maxVolumeCounts) + if string(dn4.Id()) != "10.0.0.4:8080" { + t.Errorf("expected node id '10.0.0.4:8080' for empty id, got '%s'", dn4.Id()) + } + + // Test 5: Same ip:port with empty id should return the same DataNode + dn5 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "", maxVolumeCounts) + if dn4 != dn5 { + t.Errorf("expected same DataNode for same ip:port with empty id") + } + + // Verify we have 3 unique DataNodes total: + // - node-1 (dn1/dn2 share the same id) + // - node-2 (dn3) + // - 10.0.0.4:8080 (dn4/dn5 share the same ip:port) + children := rack.Children() + if len(children) != 3 { + t.Errorf("expected 3 DataNodes, got %d", len(children)) + } + + // Test 6: Transition from ip:port to explicit id + // First, the node exists with ip:port as id (dn4/dn5) + // Now the same volume server starts sending an explicit id + dn6 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "node-4-explicit", maxVolumeCounts) + // Should return the same DataNode instance + if dn6 != dn4 { + t.Errorf("expected same DataNode instance during transition") + } + // But the id should now be updated to the explicit id + if string(dn6.Id()) != "node-4-explicit" { + t.Errorf("expected node id to transition to 'node-4-explicit', got '%s'", dn6.Id()) + } + // The node should be re-keyed in the children map + if rack.FindDataNodeById("node-4-explicit") != dn6 { + t.Errorf("expected to find DataNode by new explicit id") + } + // Old ip:port key should no longer work + if rack.FindDataNodeById("10.0.0.4:8080") != nil { + t.Errorf("expected old ip:port id to be removed from children map") + } + + // Still 3 unique DataNodes (node-1, node-2, node-4-explicit) + children = rack.Children() + if len(children) != 3 { + t.Errorf("expected 3 DataNodes after transition, got %d", len(children)) + } + + // Test 7: Prevent incorrect transition when a new node reuses ip:port of a node with explicit id + // Scenario: node-1 runs at 10.0.0.1:8080, dies, new node-99 starts at same ip:port + // The transition should NOT happen because node-1 already has an explicit id + dn7 := rack.GetOrCreateDataNode("10.0.0.1", 8080, 18080, "10.0.0.1:8080", "node-99", maxVolumeCounts) + // Should create a NEW DataNode, not reuse node-1 + 
+	if dn7 == dn1 {
+		t.Errorf("expected new DataNode for node-99, got reused node-1")
+	}
+	if string(dn7.Id()) != "node-99" {
+		t.Errorf("expected node id 'node-99', got '%s'", dn7.Id())
+	}
+	// node-1 should still exist with its original id
+	if rack.FindDataNodeById("node-1") == nil {
+		t.Errorf("node-1 should still exist")
+	}
+	// Now we have 4 DataNodes
+	children = rack.Children()
+	if len(children) != 4 {
+		t.Errorf("expected 4 DataNodes, got %d", len(children))
+	}
+}
diff --git a/weed/util/fullpath.go b/weed/util/fullpath.go
index c145919da..b485cae0d 100644
--- a/weed/util/fullpath.go
+++ b/weed/util/fullpath.go
@@ -1,6 +1,7 @@
 package util
 
 import (
+	"path"
 	"path/filepath"
 	"strings"
 )
@@ -85,3 +86,15 @@ func StringSplit(separatedValues string, sep string) []string {
 	}
 	return strings.Split(separatedValues, sep)
 }
+
+// CleanWindowsPath normalizes Windows-style backslashes to forward slashes.
+// This handles paths from Windows clients where paths use backslashes.
+func CleanWindowsPath(p string) string {
+	return strings.ReplaceAll(p, "\\", "/")
+}
+
+// CleanWindowsPathBase normalizes Windows-style backslashes to forward slashes
+// and returns the base name of the path.
+func CleanWindowsPathBase(p string) string {
+	return path.Base(strings.ReplaceAll(p, "\\", "/"))
+}
diff --git a/weed/util/http/http_global_client_util.go b/weed/util/http/http_global_client_util.go
index 3a969fdc8..a374c8a2b 100644
--- a/weed/util/http/http_global_client_util.go
+++ b/weed/util/http/http_global_client_util.go
@@ -487,6 +487,12 @@ func RetriedFetchChunkData(ctx context.Context, buffer []byte, urlStrings []stri
 		)
 	}
 
+	// For unencrypted, non-gzipped full chunks, use direct buffer read
+	// This avoids the 64KB intermediate buffer and callback overhead
+	if cipherKey == nil && !isGzipped && isFullChunk {
+		return retriedFetchChunkDataDirect(ctx, buffer, urlStrings, string(jwt))
+	}
+
 	var shouldRetry bool
 
 	for waitTime := time.Second; waitTime < util.RetryWaitTime; waitTime += waitTime / 2 {
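The fast path applies only when no transform is needed (no decryption, no gunzip) and the caller's buffer already spans the whole chunk, so bytes can land directly in place. Conceptually it is io.ReadFull over the response body; a minimal sketch under that assumption, not the actual helper, which adds JWT, retries, and status handling below:

import (
	"context"
	"io"
	"net/http"
)

// fetchInto streams an HTTP body straight into buf. io.ReadFull loops over
// short reads and returns io.ErrUnexpectedEOF on truncation, the same
// contract readUrlDirectToBuffer implements by hand.
func fetchInto(ctx context.Context, url string, buf []byte) (int, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return 0, err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()
	return io.ReadFull(resp.Body, buf)
}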
@@ -551,3 +557,105 @@ func RetriedFetchChunkData(ctx context.Context, buffer []byte, urlStrings []stri
 
 	return n, err
 }
+
+// retriedFetchChunkDataDirect reads chunk data directly into the buffer without
+// intermediate buffering. This reduces memory copies and improves throughput
+// for large chunk reads.
+func retriedFetchChunkDataDirect(ctx context.Context, buffer []byte, urlStrings []string, jwt string) (n int, err error) {
+	var shouldRetry bool
+
+	for waitTime := time.Second; waitTime < util.RetryWaitTime; waitTime += waitTime / 2 {
+		select {
+		case <-ctx.Done():
+			return 0, ctx.Err()
+		default:
+		}
+
+		for _, urlString := range urlStrings {
+			select {
+			case <-ctx.Done():
+				return 0, ctx.Err()
+			default:
+			}
+
+			n, shouldRetry, err = readUrlDirectToBuffer(ctx, urlString+"?readDeleted=true", jwt, buffer)
+			if err == nil {
+				return n, nil
+			}
+			if !shouldRetry {
+				break
+			}
+			glog.V(0).InfofCtx(ctx, "read %s failed, err: %v", urlString, err)
+		}
+
+		if err != nil && shouldRetry {
+			glog.V(0).InfofCtx(ctx, "retry reading in %v", waitTime)
+			timer := time.NewTimer(waitTime)
+			select {
+			case <-ctx.Done():
+				timer.Stop()
+				return 0, ctx.Err()
+			case <-timer.C:
+			}
+		} else {
+			break
+		}
+	}
+
+	return n, err
+}
+
+// readUrlDirectToBuffer reads HTTP response directly into the provided buffer,
+// avoiding intermediate buffer allocations and copies.
+func readUrlDirectToBuffer(ctx context.Context, fileUrl, jwt string, buffer []byte) (n int, retryable bool, err error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, fileUrl, nil)
+	if err != nil {
+		return 0, false, err
+	}
+	maybeAddAuth(req, jwt)
+	request_id.InjectToRequest(ctx, req)
+
+	r, err := GetGlobalHttpClient().Do(req)
+	if err != nil {
+		return 0, true, err
+	}
+	defer CloseResponse(r)
+
+	if r.StatusCode >= 400 {
+		if r.StatusCode == http.StatusNotFound {
+			return 0, true, fmt.Errorf("%s: %s: %w", fileUrl, r.Status, ErrNotFound)
+		}
+		if r.StatusCode == http.StatusTooManyRequests {
+			return 0, false, fmt.Errorf("%s: %s: %w", fileUrl, r.Status, ErrTooManyRequests)
+		}
+		retryable = r.StatusCode >= 499
+		return 0, retryable, fmt.Errorf("%s: %s", fileUrl, r.Status)
+	}
+
+	// Read directly into the buffer without intermediate copying
+	// This is significantly faster for large chunks (16MB+)
+	var totalRead int
+	for totalRead < len(buffer) {
+		select {
+		case <-ctx.Done():
+			return totalRead, false, ctx.Err()
+		default:
+		}
+
+		m, readErr := r.Body.Read(buffer[totalRead:])
+		totalRead += m
+		if readErr != nil {
+			if readErr == io.EOF {
+				// Return io.ErrUnexpectedEOF if we haven't filled the buffer
+				// This prevents silent data corruption from truncated responses
+				if totalRead < len(buffer) {
+					return totalRead, true, io.ErrUnexpectedEOF
+				}
+				return totalRead, false, nil
+			}
+			return totalRead, true, readErr
+		}
+	}
+
+	return totalRead, false, nil
+}
diff --git a/weed/util/net_timeout.go b/weed/util/net_timeout.go
index 75e475f6b..9aeb5cd48 100644
--- a/weed/util/net_timeout.go
+++ b/weed/util/net_timeout.go
@@ -9,22 +9,11 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/stats"
 )
 
-const (
-	// minThroughputBytesPerSecond defines the minimum expected throughput (4KB/s)
-	// Used to calculate timeout scaling based on data transferred
-	minThroughputBytesPerSecond = 4000
-
-	// graceTimeCapMultiplier caps the grace period for slow clients at 3x base timeout
-	// This prevents indefinite connections while allowing time for server-side chunk fetches
-	graceTimeCapMultiplier = 3
-)
-
 // Listener wraps a net.Listener, and gives a place to store the timeout
 // parameters. On Accept, it will wrap the net.Conn with our own Conn for us.
 type Listener struct {
 	net.Listener
-	ReadTimeout  time.Duration
-	WriteTimeout time.Duration
+	Timeout time.Duration
 }
 
 func (l *Listener) Accept() (net.Conn, error) {
@@ -34,103 +23,50 @@ func (l *Listener) Accept() (net.Conn, error) {
 	}
 	stats.ConnectionOpen()
 	tc := &Conn{
-		Conn:         c,
-		ReadTimeout:  l.ReadTimeout,
-		WriteTimeout: l.WriteTimeout,
+		Conn:    c,
+		Timeout: l.Timeout,
 	}
 	return tc, nil
 }
 
-// Conn wraps a net.Conn, and sets a deadline for every read
-// and write operation.
+// Conn wraps a net.Conn and implements a "no activity timeout".
+// Any activity (read or write) resets the deadline, so the connection
+// only times out when there's no activity in either direction.
 type Conn struct {
 	net.Conn
-	ReadTimeout  time.Duration
-	WriteTimeout time.Duration
-	isClosed     bool
-	bytesRead    int64
-	bytesWritten int64
-	lastWrite    time.Time
+	Timeout  time.Duration
+	isClosed bool
 }
 
-// calculateBytesPerTimeout calculates the expected number of bytes that should
-// be transferred during one timeout period, based on the minimum throughput.
-// Returns at least 1 to prevent division by zero.
-func calculateBytesPerTimeout(timeout time.Duration) int64 {
-	bytesPerTimeout := int64(float64(minThroughputBytesPerSecond) * timeout.Seconds())
-	if bytesPerTimeout <= 0 {
-		return 1 // Prevent division by zero
+// extendDeadline extends the connection deadline from now.
+// This implements "no activity timeout" - any activity keeps the connection alive.
+func (c *Conn) extendDeadline() error {
+	if c.Timeout > 0 {
+		return c.Conn.SetDeadline(time.Now().Add(c.Timeout))
 	}
-	return bytesPerTimeout
+	return nil
 }
 
 func (c *Conn) Read(b []byte) (count int, e error) {
-	if c.ReadTimeout != 0 {
-		// Calculate expected bytes per timeout period based on minimum throughput (4KB/s)
-		// Example: with ReadTimeout=30s, bytesPerTimeout = 4000 * 30 = 120KB
-		// After reading 1MB: multiplier = 1,000,000/120,000 + 1 ≈ 9, deadline = 30s * 9 = 270s
-		bytesPerTimeout := calculateBytesPerTimeout(c.ReadTimeout)
-		timeoutMultiplier := time.Duration(c.bytesRead/bytesPerTimeout + 1)
-		err := c.Conn.SetReadDeadline(time.Now().Add(c.ReadTimeout * timeoutMultiplier))
-		if err != nil {
-			return 0, err
-		}
+	// Extend deadline before reading - any activity keeps connection alive
+	if err := c.extendDeadline(); err != nil {
+		return 0, err
 	}
 	count, e = c.Conn.Read(b)
 	if e == nil {
 		stats.BytesIn(int64(count))
-		c.bytesRead += int64(count)
 	}
 	return
 }
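Before the Write side gets the same treatment below, a toy demo of what the new model buys: the deadline expires only after Timeout of total inactivity, no matter how much or how little data has moved, so the old throughput math is unnecessary. This is illustrative code, not from the patch:

package main

import (
	"fmt"
	"net"
	"time"
)

// idleConn mirrors the patched util.Conn: reset the deadline before each I/O.
type idleConn struct {
	net.Conn
	timeout time.Duration
}

func (c *idleConn) Read(b []byte) (int, error) {
	if c.timeout > 0 {
		c.Conn.SetDeadline(time.Now().Add(c.timeout))
	}
	return c.Conn.Read(b)
}

func main() {
	a, b := net.Pipe()
	defer b.Close()
	r := &idleConn{Conn: a, timeout: 100 * time.Millisecond}
	go func() {
		// A slow writer that trickles one byte every 50ms never times out,
		// because each arrival resets the 100ms idle deadline.
		for i := 0; i < 5; i++ {
			time.Sleep(50 * time.Millisecond)
			b.Write([]byte{0})
		}
		// Then it goes silent; the reader is cut off ~100ms later.
	}()
	buf := make([]byte, 1)
	for {
		if _, err := r.Read(buf); err != nil {
			fmt.Println("closed after idle:", err) // deadline exceeded
			return
		}
	}
}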
 func (c *Conn) Write(b []byte) (count int, e error) {
-	if c.WriteTimeout != 0 {
-		now := time.Now()
-		// Calculate timeout with two components:
-		// 1. Base timeout scaled by cumulative data (minimum throughput of 4KB/s)
-		// 2. Additional grace period if there was a gap since last write (for chunk fetch delays)
-
-		// Calculate expected bytes per timeout period based on minimum throughput (4KB/s)
-		// Example: with WriteTimeout=30s, bytesPerTimeout = 4000 * 30 = 120KB
-		// After writing 1MB: multiplier = 1,000,000/120,000 + 1 ≈ 9, baseTimeout = 30s * 9 = 270s
-		bytesPerTimeout := calculateBytesPerTimeout(c.WriteTimeout)
-		timeoutMultiplier := time.Duration(c.bytesWritten/bytesPerTimeout + 1)
-		baseTimeout := c.WriteTimeout * timeoutMultiplier
-
-		// If it's been a while since last write, add grace time for server-side chunk fetches
-		// But cap it to avoid keeping slow clients connected indefinitely
-		//
-		// The comparison uses unscaled WriteTimeout intentionally: triggers grace when idle time
-		// exceeds base timeout, independent of throughput scaling.
-		if !c.lastWrite.IsZero() {
-			timeSinceLastWrite := now.Sub(c.lastWrite)
-			if timeSinceLastWrite > c.WriteTimeout {
-				// Add grace time capped at graceTimeCapMultiplier * scaled timeout.
-				// This allows total deadline up to 4x scaled timeout for server-side delays.
-				//
-				// Example: WriteTimeout=30s, 1MB written (multiplier≈9), baseTimeout=270s
-				// If 400s gap occurs fetching chunks: graceTime capped at 270s*3=810s
-				// Final deadline: 270s + 810s = 1080s (~18min) to accommodate slow storage
-				// But if only 50s gap: graceTime = 50s, final deadline = 270s + 50s = 320s
-				graceTime := timeSinceLastWrite
-				if graceTime > baseTimeout*graceTimeCapMultiplier {
-					graceTime = baseTimeout * graceTimeCapMultiplier
-				}
-				baseTimeout += graceTime
-			}
-		}
-
-		err := c.Conn.SetWriteDeadline(now.Add(baseTimeout))
-		if err != nil {
-			return 0, err
-		}
+	// Extend deadline before writing - any activity keeps connection alive
+	if err := c.extendDeadline(); err != nil {
+		return 0, err
 	}
 	count, e = c.Conn.Write(b)
 	if e == nil {
 		stats.BytesOut(int64(count))
-		c.bytesWritten += int64(count)
-		c.lastWrite = time.Now()
 	}
 	return
 }
@@ -153,9 +89,8 @@ func NewListener(addr string, timeout time.Duration) (ipListener net.Listener, e
 	}
 
 	ipListener = &Listener{
-		Listener:     listener,
-		ReadTimeout:  timeout,
-		WriteTimeout: timeout,
+		Listener: listener,
+		Timeout:  timeout,
 	}
 
 	return
@@ -168,9 +103,8 @@ func NewIpAndLocalListeners(host string, port int, timeout time.Duration) (ipLis
 	}
 
 	ipListener = &Listener{
-		Listener:     listener,
-		ReadTimeout:  timeout,
-		WriteTimeout: timeout,
+		Listener: listener,
+		Timeout:  timeout,
 	}
 
 	if host != "localhost" && host != "" && host != "0.0.0.0" && host != "127.0.0.1" && host != "[::]" && host != "[::1]" {
@@ -181,9 +115,8 @@ func NewIpAndLocalListeners(host string, port int, timeout time.Duration) (ipLis
 		}
 
 		localListener = &Listener{
-			Listener:     listener,
-			ReadTimeout:  timeout,
-			WriteTimeout: timeout,
+			Listener: listener,
+			Timeout:  timeout,
 		}
 	}
diff --git a/weed/util/network.go b/weed/util/network.go
index 328808dbc..f7dbeebb7 100644
--- a/weed/util/network.go
+++ b/weed/util/network.go
@@ -64,3 +64,14 @@ func JoinHostPort(host string, port int) string {
 	}
 	return net.JoinHostPort(host, portStr)
 }
+
+// GetVolumeServerId returns the volume server ID.
+// If id is provided (non-empty after trimming), use it as the identifier.
+// Otherwise, fall back to ip:port for backward compatibility.
+func GetVolumeServerId(id, ip string, port int) string {
+	volumeServerId := strings.TrimSpace(id)
+	if volumeServerId == "" {
+		volumeServerId = JoinHostPort(ip, port)
+	}
+	return volumeServerId
+}
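GetVolumeServerId is the single source of truth for node identity on both ends: the volume server stores its result in Store.Id for heartbeats, and the master runs the heartbeat id through the same function in GetOrCreateDataNode. Its precedence at a glance (illustrative values):

// illustrative inputs and results for util.GetVolumeServerId
util.GetVolumeServerId("node-1", "10.0.0.1", 8080)     // "node-1"        explicit id wins
util.GetVolumeServerId("  node-1  ", "10.0.0.1", 8080) // "node-1"        surrounding whitespace is trimmed
util.GetVolumeServerId("", "10.0.0.1", 8080)           // "10.0.0.1:8080" legacy fallback to ip:port
util.GetVolumeServerId("", "::1", 8080)                // "[::1]:8080"    JoinHostPort brackets IPv6 hosts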