Fix maintenance worker panic and add EC integration tests (#8068)
* Fix nil pointer panic in maintenance worker when receiving empty task assignment

  When a worker requests a task and none are available, the admin server sends an empty TaskAssignment message. The worker attempted to log the task details without checking whether the TaskId was empty, causing a nil pointer dereference when accessing taskAssign.Params.VolumeId. This fix adds a check for an empty TaskId before processing the assignment, preventing worker crashes and improving stability in production environments (sketched below).

* Add EC integration test for admin-worker maintenance system

  Adds a comprehensive integration test that verifies the end-to-end flow of erasure coding maintenance tasks:
  - Admin server detects volumes needing EC encoding
  - Workers register and receive task assignments
  - EC encoding is executed and verified in the master topology
  - File read-back validation confirms data integrity

  The test uses unique absolute working directories for each worker to prevent ID conflicts and ensure stable worker registration. It includes proper cleanup and process management for reliable test execution.

* Improve maintenance system stability and task deduplication

  - Add cross-type task deduplication to prevent concurrent maintenance operations on the same volume (EC, balance, vacuum)
  - Implement a HasAnyTask check in ActiveTopology for better coordination (sketched below)
  - Increase the RequestTask timeout from 5s to 30s to prevent unnecessary worker reconnections
  - Add a TaskTypeNone sentinel for generic task checks
  - Update all task detectors to use HasAnyTask for conflict prevention
  - Improve config persistence and schema handling

* Add GitHub Actions workflow for EC integration tests

  Adds a CI workflow that runs the EC integration tests on pushes and pull requests to the master branch. The workflow:
  - Triggers on changes to admin, worker, or test files
  - Builds the weed binary
  - Runs the EC integration test suite
  - Uploads test logs as artifacts on failure for debugging

  This ensures the maintenance system remains stable and worker-admin integration is validated in CI.

* go version 1.24
* address comments
* Update maintenance_integration.go
* support seconds
* ec prioritize over balancing in tests
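For context, a minimal sketch of the empty-assignment guard described in the first bullet. The field names (TaskId, Params.VolumeId) follow the commit description, but the message types, handler, and logging here are illustrative assumptions, not the actual code in weed/worker/client.go:

package main

import "log"

// Hypothetical, simplified shapes of the assignment message named in the
// commit; the real definitions live in the SeaweedFS worker protocol.
type TaskParams struct {
	VolumeId uint32
}

type TaskAssignment struct {
	TaskId string
	Params *TaskParams
}

// handleTaskAssignment sketches the guard: an empty TaskId means "no work
// available", and Params may be nil, so dereferencing it would panic.
func handleTaskAssignment(taskAssign *TaskAssignment) {
	if taskAssign == nil || taskAssign.TaskId == "" {
		return // empty assignment: wait for the next poll instead of crashing
	}
	log.Printf("received task %s for volume %d", taskAssign.TaskId, taskAssign.Params.VolumeId)
	// ... execute the task ...
}

func main() {
	handleTaskAssignment(&TaskAssignment{})                                                  // empty: safely ignored
	handleTaskAssignment(&TaskAssignment{TaskId: "ec-1", Params: &TaskParams{VolumeId: 7}}) // real task
}

The cross-type deduplication from the third bullet can be pictured as a HasAnyTask lookup keyed by volume ID, consulted by every detector before proposing new work. Again a hedged sketch under assumed names, not the ActiveTopology implementation in weed/admin/topology:

package main

import "fmt"

// Hypothetical in-memory view: volume ID -> pending/active task types.
type activeTopology struct {
	tasksByVolume map[uint32][]string
}

// HasAnyTask reports whether any maintenance task (EC, balance, vacuum, ...)
// already targets the volume, so detectors can skip it.
func (at *activeTopology) HasAnyTask(volumeId uint32) bool {
	return len(at.tasksByVolume[volumeId]) > 0
}

func main() {
	at := &activeTopology{tasksByVolume: map[uint32][]string{42: {"erasure_coding"}}}
	fmt.Println(at.HasAnyTask(42)) // true: skip scheduling balance/vacuum for volume 42
	fmt.Println(at.HasAnyTask(7))  // false: volume 7 is free for new maintenance work
}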
23 changed files with 823 additions and 52 deletions
 49  .github/workflows/ec-integration.yml
  3  .gitignore
  7  test/erasure_coding/admin_dockertest/Makefile
404  test/erasure_coding/admin_dockertest/ec_integration_test.go
  2  test/fuse_integration/README.md
  7  weed/admin/config/schema.go
 20  weed/admin/dash/config_persistence.go
 17  weed/admin/dash/worker_grpc_server.go
 59  weed/admin/maintenance/maintenance_integration.go
 19  weed/admin/maintenance/maintenance_manager.go
  3  weed/admin/maintenance/maintenance_types.go
 15  weed/admin/topology/capacity.go
 59  weed/admin/topology/task_management.go
  5  weed/admin/topology/types.go
 28  weed/admin/view/components/form_fields.templ
116  weed/admin/view/components/form_fields_templ.go
 12  weed/worker/client.go
  6  weed/worker/tasks/balance/detection.go
  4  weed/worker/tasks/erasure_coding/config.go
 21  weed/worker/tasks/erasure_coding/detection.go
  8  weed/worker/tasks/vacuum/detection.go
  7  weed/worker/types/task_ui.go
  4  weed/worker/worker.go
.github/workflows/ec-integration.yml
@@ -0,0 +1,49 @@
name: EC Integration Tests

on:
  push:
    branches: [ master ]
    paths:
      - 'weed/admin/**'
      - 'weed/worker/**'
      - 'test/erasure_coding/admin_dockertest/**'
      - '.github/workflows/ec-integration.yml'
  pull_request:
    branches: [ master ]
    paths:
      - 'weed/admin/**'
      - 'weed/worker/**'
      - 'test/erasure_coding/admin_dockertest/**'
      - '.github/workflows/ec-integration.yml'

jobs:
  ec-integration-test:
    runs-on: ubuntu-latest
    timeout-minutes: 15

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24'

      - name: Build weed binary
        run: |
          cd weed
          go build -o ../weed_bin

      - name: Run EC integration tests
        run: |
          cd test/erasure_coding/admin_dockertest
          go test -v -timeout 15m ec_integration_test.go

      - name: Upload test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: ec-test-logs
          path: test/erasure_coding/admin_dockertest/tmp/logs/
          retention-days: 7
test/erasure_coding/admin_dockertest/Makefile
@@ -0,0 +1,7 @@
.PHONY: test clean

test:
	go test -v .

clean:
	rm -rf tmp
test/erasure_coding/admin_dockertest/ec_integration_test.go
@@ -0,0 +1,404 @@
package admin_dockertest

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strings"
	"testing"
	"time"
)

const (
	AdminUrl  = "http://localhost:23646"
	MasterUrl = "http://localhost:9333"
	FilerUrl  = "http://localhost:8888"
)

// Helper to run commands in background and track PIDs for cleanup
var runningCmds []*exec.Cmd

func cleanup() {
	for _, cmd := range runningCmds {
		if cmd.Process != nil {
			cmd.Process.Kill()
		}
	}
}

func startWeed(t *testing.T, name string, args ...string) *exec.Cmd {
	cmd := exec.Command("./weed_bin", args...)

	// Create logs dir in local ./tmp
	wd, _ := os.Getwd()
	logDir := filepath.Join(wd, "tmp", "logs")
	os.MkdirAll(logDir, 0755)

	logFile, err := os.Create(filepath.Join(logDir, name+".log"))
	if err != nil {
		t.Fatalf("Failed to create log file: %v", err)
	}

	cmd.Stdout = logFile
	cmd.Stderr = logFile
	// Set Cwd to test directory so it finds local ./tmp
	cmd.Dir = wd

	// assume "weed_bin" binary is in project root.
	rootDir := filepath.Dir(filepath.Dir(filepath.Dir(wd)))
	cmd.Path = filepath.Join(rootDir, "weed_bin")

	err = cmd.Start()
	if err != nil {
		t.Fatalf("Failed to start weed %v: %v", args, err)
	}
	runningCmds = append(runningCmds, cmd)
	return cmd
}

func stopWeed(t *testing.T, cmd *exec.Cmd) {
	if cmd != nil && cmd.Process != nil {
		t.Logf("Stopping process %d", cmd.Process.Pid)
		cmd.Process.Kill()
		cmd.Wait()

		// Remove from runningCmds to avoid double kill in cleanup
		for i, c := range runningCmds {
			if c == cmd {
				runningCmds = append(runningCmds[:i], runningCmds[i+1:]...)
				break
			}
		}
	}
}

func ensureEnvironment(t *testing.T) {
	// 1. Build weed binary
	wd, _ := os.Getwd()
	rootDir := filepath.Dir(filepath.Dir(filepath.Dir(wd))) // Up 3 levels

	buildCmd := exec.Command("go", "build", "-o", "weed_bin", "./weed")
	buildCmd.Dir = rootDir
	buildCmd.Stdout = os.Stdout
	buildCmd.Stderr = os.Stderr
	if err := buildCmd.Run(); err != nil {
		t.Fatalf("Failed to build weed: %v", err)
	}
	t.Log("Successfully built weed binary")

	// 2. Start Master
	// Use local ./tmp/master
	os.RemoveAll("tmp")
	err := os.MkdirAll(filepath.Join("tmp", "master"), 0755)
	if err != nil {
		t.Fatalf("Failed to create tmp dir: %v", err)
	}

	startWeed(t, "master", "master", "-mdir=./tmp/master", "-port=9333", "-ip=localhost", "-peers=none", "-volumeSizeLimitMB=100")

	// Wait for master
	waitForUrl(t, MasterUrl+"/cluster/status", 10)

	// 3. Start Volume Server (Worker)
	// Start 14 volume servers to verify RS(10,4) default EC
	for i := 1; i <= 14; i++ {
		volName := fmt.Sprintf("volume%d", i)
		port := 8080 + i - 1
		dir := filepath.Join("tmp", volName)
		os.MkdirAll(dir, 0755)
		startWeed(t, volName, "volume", "-dir="+dir, "-mserver=localhost:9333", fmt.Sprintf("-port=%d", port), "-ip=localhost")
	}

	// 4. Start Filer
	os.MkdirAll(filepath.Join("tmp", "filer"), 0755)
	startWeed(t, "filer", "filer", "-defaultStoreDir=./tmp/filer", "-master=localhost:9333", "-port=8888", "-ip=localhost")
	waitForUrl(t, FilerUrl+"/", 60)

	// 5. Start Workers (Maintenance)
	// We need workers to execute EC tasks
	for i := 1; i <= 2; i++ {
		workerName := fmt.Sprintf("worker%d", i)
		metricsPort := 9327 + i - 1
		debugPort := 6060 + i
		dir, _ := filepath.Abs(filepath.Join("tmp", workerName))
		os.MkdirAll(dir, 0755)
		startWeed(t, workerName, "worker", "-admin=localhost:23646", "-workingDir="+dir, fmt.Sprintf("-metricsPort=%d", metricsPort), fmt.Sprintf("-debug.port=%d", debugPort))
	}

	// 6. Start Admin
	os.RemoveAll(filepath.Join("tmp", "admin"))
	os.MkdirAll(filepath.Join("tmp", "admin"), 0755)
	startWeed(t, "admin", "admin", "-master=localhost:9333", "-port=23646", "-dataDir=./tmp/admin")
	waitForUrl(t, AdminUrl+"/health", 60)

	t.Log("Environment started successfully")
}

func waitForUrl(t *testing.T, url string, retries int) {
	for i := 0; i < retries; i++ {
		resp, err := http.Get(url)
		if err == nil && resp.StatusCode == 200 {
			resp.Body.Close()
			return
		}
		time.Sleep(1 * time.Second)
	}
	t.Fatalf("Timeout waiting for %s", url)
}

func TestEcEndToEnd(t *testing.T) {
	defer cleanup()
	ensureEnvironment(t)

	client := &http.Client{}

	// 1. Configure Global Maintenance (Scan Interval = 1s) via API
	t.Log("Configuring Global Maintenance via API...")

	// 1.1 Fetch current config
	req, _ := http.NewRequest("GET", AdminUrl+"/api/maintenance/config", nil)
	resp, err := client.Do(req)
	if err != nil {
		t.Fatalf("Failed to get global config: %v", err)
	}
	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		t.Fatalf("Failed to get global config (status %d): %s", resp.StatusCode, string(body))
	}

	var globalConfig map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&globalConfig); err != nil {
		t.Fatalf("Failed to decode global config: %v", err)
	}
	resp.Body.Close()

	// 1.2 Modify config
	globalConfig["enabled"] = true
	globalConfig["scan_interval_seconds"] = 1

	// Ensure policy structure exists
	if globalConfig["policy"] == nil {
		globalConfig["policy"] = map[string]interface{}{}
	}
	policy, _ := globalConfig["policy"].(map[string]interface{})

	// Ensure task_policies structure exists
	if policy["task_policies"] == nil {
		policy["task_policies"] = map[string]interface{}{}
	}
	taskPolicies, _ := policy["task_policies"].(map[string]interface{})

	// Disable balance tasks to avoid interference with EC test
	if taskPolicies["balance"] == nil {
		taskPolicies["balance"] = map[string]interface{}{}
	}
	balancePolicy, _ := taskPolicies["balance"].(map[string]interface{})
	balancePolicy["enabled"] = false

	// Set global max concurrent
	policy["global_max_concurrent"] = 4
	globalConfig["policy"] = policy

	// Explicitly set required fields
	requiredFields := map[string]float64{
		"worker_timeout_seconds":   300,
		"task_timeout_seconds":     7200,
		"retry_delay_seconds":      900,
		"cleanup_interval_seconds": 86400,
		"task_retention_seconds":   604800,
		"max_retries":              3,
	}
	for field, val := range requiredFields {
		if _, ok := globalConfig[field]; !ok || globalConfig[field] == 0 {
			globalConfig[field] = val
		}
	}

	// 1.3 Update config
	jsonBody, _ := json.Marshal(globalConfig)
	req, _ = http.NewRequest("PUT", AdminUrl+"/api/maintenance/config", bytes.NewBuffer(jsonBody))
	req.Header.Set("Content-Type", "application/json")
	resp, err = client.Do(req)
	if err != nil {
		t.Fatalf("Failed to update global config: %v", err)
	}
	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		t.Fatalf("Failed to update global config (status %d): %s", resp.StatusCode, string(body))
	}
	resp.Body.Close()

	// 2. Configure EC Task (Short intervals) via Form API
	t.Log("Configuring EC Task via Form API...")
	formData := url.Values{}
	formData.Set("enabled", "true")
	formData.Set("scan_interval_seconds", "1")
	formData.Set("repeat_interval_seconds", "1")
	formData.Set("check_interval_seconds", "1")
	formData.Set("max_concurrent", "4")
	formData.Set("quiet_for_seconds_value", "1")
	formData.Set("quiet_for_seconds_unit", "seconds")
	formData.Set("min_size_mb", "1")
	formData.Set("fullness_ratio", "0.0001")

	req, _ = http.NewRequest("POST", AdminUrl+"/maintenance/config/erasure_coding", strings.NewReader(formData.Encode()))
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	resp, err = client.Do(req)
	if err != nil {
		t.Fatalf("Failed to update EC config: %v", err)
	}
	if resp.StatusCode != 200 && resp.StatusCode != 303 {
		body, _ := io.ReadAll(resp.Body)
		t.Fatalf("Failed to update EC config (status %d): %s", resp.StatusCode, string(body))
	}
	resp.Body.Close()
	t.Log("EC Task Configuration updated")

	// 3. Restart Admin to pick up Global Config (Scan Interval)
	if len(runningCmds) > 0 {
		adminCmd := runningCmds[len(runningCmds)-1]
		t.Log("Restarting Admin Server to apply configuration...")
		stopWeed(t, adminCmd)
		time.Sleep(10 * time.Second)
		startWeed(t, "admin_restarted", "admin", "-master=localhost:9333", "-port=23646", "-port.grpc=33646", "-dataDir=./tmp/admin")
		waitForUrl(t, AdminUrl+"/health", 60)
	}

	// 4. Upload a file
	fileSize := 5 * 1024 * 1024
	data := make([]byte, fileSize)
	rand.Read(data)
	fileName := fmt.Sprintf("ec_test_file_%d", time.Now().Unix())
	t.Logf("Uploading %d bytes file %s to Filer...", fileSize, fileName)
	uploadUrl := FilerUrl + "/" + fileName

	var uploadErr error
	for i := 0; i < 10; i++ {
		req, _ := http.NewRequest("PUT", uploadUrl, bytes.NewBuffer(data))
		resp, err := client.Do(req)
		if err == nil {
			if resp.StatusCode == 201 {
				resp.Body.Close()
				uploadErr = nil
				break
			}
			body, _ := io.ReadAll(resp.Body)
			resp.Body.Close()
			uploadErr = fmt.Errorf("status %d: %s", resp.StatusCode, string(body))
		} else {
			uploadErr = err
		}
		t.Logf("Upload attempt %d failed: %v", i+1, uploadErr)
		time.Sleep(2 * time.Second)
	}

	if uploadErr != nil {
		t.Fatalf("Failed to upload file after retries: %v", uploadErr)
	}
	t.Log("Upload successful")

	// 5. Verify EC Encoding
	t.Log("Waiting for EC encoding (checking Master topology)...")
	startTime := time.Now()
	ecVerified := false
	var lastBody []byte

	for time.Since(startTime) < 300*time.Second {
		// 5.1 Check Master Topology
		resp, err := http.Get(MasterUrl + "/dir/status")
		if err == nil {
			lastBody, _ = ioutil.ReadAll(resp.Body)
			resp.Body.Close()

			// Check total EC shards
			reShards := regexp.MustCompile(`"EcShards":\s*(\d+)`)
			matches := reShards.FindAllSubmatch(lastBody, -1)
			totalShards := 0
			for _, m := range matches {
				var count int
				fmt.Sscanf(string(m[1]), "%d", &count)
				totalShards += count
			}

			if totalShards > 0 {
				t.Logf("EC encoding verified (found %d total EcShards in topology) after %d seconds", totalShards, int(time.Since(startTime).Seconds()))
				ecVerified = true
				break
			}
		}

		// 5.2 Debug: Check workers and tasks
		wResp, wErr := http.Get(AdminUrl + "/api/maintenance/workers")
		workerCount := 0
		if wErr == nil {
			var workers []interface{}
			json.NewDecoder(wResp.Body).Decode(&workers)
			wResp.Body.Close()
			workerCount = len(workers)
		}

		tResp, tErr := http.Get(AdminUrl + "/api/maintenance/tasks")
		taskCount := 0
		if tErr == nil {
			var tasks []interface{}
			json.NewDecoder(tResp.Body).Decode(&tasks)
			tResp.Body.Close()
			taskCount = len(tasks)
		}
		t.Logf("Waiting for EC... (Workers: %d, Active Tasks: %d)", workerCount, taskCount)

		time.Sleep(10 * time.Second)
	}

	if !ecVerified {
		dumpLogs(t)
		t.Fatalf("Timed out waiting for EC encoding verified in Topology. Last body: %s", string(lastBody))
	}

	// 6. Verification: Read back the file
	t.Log("Reading back file...")
	resp, err = http.Get(uploadUrl)
	if err != nil {
		dumpLogs(t)
		t.Fatalf("Failed to read back file: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		dumpLogs(t)
		t.Fatalf("Read back failed status: %d", resp.StatusCode)
	}
	content, _ := io.ReadAll(resp.Body)
	if len(content) != fileSize {
		dumpLogs(t)
		t.Fatalf("Read back size mismatch: got %d, want %d", len(content), fileSize)
	}

	// Verify byte-wise content equality
	if !bytes.Equal(content, data) {
		dumpLogs(t)
		t.Fatalf("Read back content mismatch: uploaded and downloaded data differ")
	}

	t.Log("Test PASS: EC encoding and read back successful!")
}

func dumpLogs(t *testing.T) {
	wd, _ := os.Getwd()
	logDir := filepath.Join(wd, "tmp", "logs")
	files, _ := os.ReadDir(logDir)
	for _, f := range files {
		if strings.HasSuffix(f.Name(), ".log") {
			content, _ := os.ReadFile(filepath.Join(logDir, f.Name()))
			t.Logf("--- LOG DUMP: %s ---\n%s\n--- END LOG ---", f.Name(), string(content))
		}
	}
}