From d95df76bca58cfaf2ede402feac8779de8588153 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Thu, 26 Mar 2026 19:28:13 -0700 Subject: [PATCH] feat: separate scheduler lanes for iceberg, lifecycle, and volume management (#8787) * feat: introduce scheduler lanes for independent per-workload scheduling Split the single plugin scheduler loop into independent per-lane goroutines so that volume management, iceberg compaction, and lifecycle operations never block each other. Each lane has its own: - Goroutine (laneSchedulerLoop) - Wake channel for immediate scheduling - Admin lock scope (e.g. "plugin scheduler:default") - Configurable idle sleep duration - Loop state tracking Three lanes are defined: - default: vacuum, volume_balance, ec_balance, erasure_coding, admin_script - iceberg: iceberg_maintenance - lifecycle: s3_lifecycle (new, handler coming in a later commit) Job types are mapped to lanes via a hardcoded map with LaneDefault as the fallback. The SchedulerJobTypeState and SchedulerStatus types now include a Lane field for API consumers. * feat: per-lane execution reservation pools for resource isolation Each scheduler lane now maintains its own execution reservation map so that a busy volume lane cannot consume execution slots needed by iceberg or lifecycle lanes. The per-lane pool is used by default when dispatching jobs through the lane scheduler; the global pool remains as a fallback for the public DispatchProposals API. 
* feat: add per-lane scheduler status API and lane worker UI pages - GET /api/plugin/lanes returns all lanes with status and job types - GET /api/plugin/workers?lane=X filters workers by lane - GET /api/plugin/scheduler-states?lane=X filters job types by lane - GET /api/plugin/scheduler-status?lane=X returns lane-scoped status - GET /plugin/lanes/{lane}/workers renders per-lane worker page - SchedulerJobTypeState now includes a "lane" field The lane worker pages show scheduler status, job type configuration, and connected workers scoped to a single lane, with links back to the main plugin overview. * feat: add s3_lifecycle worker handler for object store lifecycle management Implements a full plugin worker handler for S3 lifecycle management, assigned to the new "lifecycle" scheduler lane. Detection phase: - Reads filer.conf to find buckets with TTL lifecycle rules - Creates one job proposal per bucket with active lifecycle rules - Supports bucket_filter wildcard pattern from admin config Execution phase: - Walks the bucket directory tree breadth-first - Identifies expired objects by checking TtlSec + Crtime < now - Deletes expired objects in configurable batches - Reports progress with scanned/expired/error counts - Supports dry_run mode for safe testing Configurable via admin UI: - batch_size: entries per filer listing page (default 1000) - max_deletes_per_bucket: safety cap per run (default 10000) - dry_run: detect without deleting - delete_marker_cleanup: clean expired delete markers - abort_mpu_days: abort stale multipart uploads The handler integrates with the existing PutBucketLifecycle flow which sets TtlSec on entries via filer.conf path rules. 
* feat: add per-lane submenu items under Workers sidebar menu Replace the single "Workers" sidebar link with a collapsible submenu containing three lane entries: - Default (volume management + admin scripts) -> /plugin - Iceberg (table compaction) -> /plugin/lanes/iceberg/workers - Lifecycle (S3 object expiration) -> /plugin/lanes/lifecycle/workers The submenu auto-expands when on any /plugin page and highlights the active lane. Icons match each lane's job type descriptor (server, snowflake, hourglass). * feat: scope plugin pages to their scheduler lane The plugin overview, configuration, detection, queue, and execution pages now filter workers, job types, scheduler states, and scheduler status to only show data for their lane. - Plugin() templ function accepts a lane parameter (default: "default") - JavaScript appends ?lane= to /api/plugin/workers, /job-types, /scheduler-states, and /scheduler-status API calls - GET /api/plugin/job-types now supports ?lane= filtering - When ?job= is provided (e.g. ?job=iceberg_maintenance), the lane is auto-derived from the job type so the page scopes correctly This ensures /plugin shows only default-lane workers and /plugin/configuration?job=iceberg_maintenance scopes to the iceberg lane. * fix: remove "Lane" from lane worker page titles and capitalize properly "lifecycle Lane Workers" -> "Lifecycle Workers" "iceberg Lane Workers" -> "Iceberg Workers" * refactor: promote lane items to top-level sidebar menu entries Move Default, Iceberg, and Lifecycle from a collapsible submenu to direct top-level items under the WORKERS heading. Removes the intermediate "Workers" parent link and collapse toggle. * admin: unify plugin lane routes and handlers * admin: filter plugin jobs and activities by lane * admin: reuse plugin UI for worker lane pages * fix: use ServerAddress.ToGrpcAddress() for filer connections in lifecycle handler ClusterContext addresses use ServerAddress format (host:httpPort.grpcPort). 
Convert to the actual gRPC address via ToGrpcAddress() before dialing, and add a Ping verification after connecting. Fixes: "dial tcp: lookup tcp/8888.18888: unknown port" * fix: resolve ServerAddress gRPC port in iceberg and lifecycle filer connections ClusterContext addresses use ServerAddress format (host:httpPort.grpcPort). Both the iceberg and lifecycle handlers now detect the compound format and extract the gRPC port via ToGrpcAddress() before dialing. Plain host:port addresses (e.g. from tests) are passed through unchanged. Fixes: "dial tcp: lookup tcp/8888.18888: unknown port" * align url * Potential fix for code scanning alert no. 335: Incorrect conversion between integer types Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> * fix: address PR review findings across scheduler lanes and lifecycle handler - Fix variable shadowing: rename loop var `w` to `worker` in GetPluginWorkersAPI to avoid shadowing the http.ResponseWriter param - Fix stale GetSchedulerStatus: aggregate loop states across all lanes instead of reading never-updated legacy schedulerLoopState - Scope InProcessJobs to lane in GetLaneSchedulerStatus - Fix AbortMPUDays=0 treated as unset: change <= 0 to < 0 so 0 disables - Propagate listing errors in lifecycle bucket walk instead of swallowing - Implement DeleteMarkerCleanup: scan for S3 delete marker entries and remove them - Implement AbortMPUDays: scan .uploads directory and remove stale multipart uploads older than the configured threshold - Fix success determination: mark job failed when result.errors > 0 even if no fatal error occurred - Add regression test for jobTypeLaneMap to catch drift from handler registrations * fix: guard against nil result in lifecycle completion and trim filer addresses - Guard result dereference in completion summary: use local vars defaulting to 0 when result is nil to prevent panic - Append trimmed filer addresses instead of originals so 
whitespace is not passed to the gRPC dialer * fix: propagate ctx cancellation from deleteExpiredObjects and add config logging - deleteExpiredObjects now returns a third error value when the context is canceled mid-batch; the caller stops processing further batches and returns the cancellation error to the job completion handler - readBoolConfig and readInt64Config now log unexpected ConfigValue types at V(1) for debugging, consistent with readStringConfig * fix: propagate errors in lifecycle cleanup helpers and use correct delete marker key - cleanupDeleteMarkers: return error on ctx cancellation and SeaweedList failures instead of silently continuing - abortIncompleteMPUs: log SeaweedList errors instead of discarding - isDeleteMarker: use ExtDeleteMarkerKey ("Seaweed-X-Amz-Delete-Marker") instead of ExtLatestVersionIsDeleteMarker which is for the parent entry - batchSize cap: use math.MaxInt instead of math.MaxInt32 * fix: propagate ctx cancellation from abortIncompleteMPUs and log unrecognized bool strings - abortIncompleteMPUs now returns (aborted, errors, ctxErr) matching cleanupDeleteMarkers; caller stops on cancellation or listing failure - readBoolConfig logs unrecognized string values before falling back * fix: shared per-bucket budget across lifecycle phases and allow cleanup without expired objects - Thread a shared remaining counter through TTL deletion, delete marker cleanup, and MPU abort so the total operations per bucket never exceed MaxDeletesPerBucket - Remove early return when no TTL-expired objects found so delete marker cleanup and MPU abort still run - Add NOTE on cleanupDeleteMarkers about version-safety limitation --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- weed/admin/dash/plugin_api.go | 125 +++++- weed/admin/dash/plugin_api_test.go | 36 ++ weed/admin/handlers/admin_handlers.go | 8 + .../handlers/admin_handlers_routes_test.go | 17 + 
weed/admin/handlers/plugin_handlers.go | 68 +++- weed/admin/plugin/plugin.go | 14 +- weed/admin/plugin/plugin_scheduler.go | 221 +++++++++- weed/admin/plugin/scheduler_lane.go | 109 +++++ weed/admin/plugin/scheduler_lane_test.go | 47 +++ weed/admin/plugin/scheduler_status.go | 182 ++++++++- weed/admin/plugin/types.go | 1 + weed/admin/view/app/plugin.templ | 41 +- weed/admin/view/app/plugin_templ.go | 53 ++- weed/admin/view/app/template_helpers.go | 37 ++ weed/admin/view/layout/layout.templ | 46 ++- weed/admin/view/layout/layout_templ.go | 364 ++++++++++------- weed/plugin/worker/iceberg/exec_test.go | 12 +- weed/plugin/worker/iceberg/handler.go | 2 +- weed/plugin/worker/lifecycle/config.go | 131 ++++++ weed/plugin/worker/lifecycle/detection.go | 204 ++++++++++ weed/plugin/worker/lifecycle/execution.go | 328 +++++++++++++++ weed/plugin/worker/lifecycle/handler.go | 380 ++++++++++++++++++ 22 files changed, 2230 insertions(+), 196 deletions(-) create mode 100644 weed/admin/plugin/scheduler_lane.go create mode 100644 weed/admin/plugin/scheduler_lane_test.go create mode 100644 weed/plugin/worker/lifecycle/config.go create mode 100644 weed/plugin/worker/lifecycle/detection.go create mode 100644 weed/plugin/worker/lifecycle/execution.go create mode 100644 weed/plugin/worker/lifecycle/handler.go diff --git a/weed/admin/dash/plugin_api.go b/weed/admin/dash/plugin_api.go index 897fb88a7..d8173e0f6 100644 --- a/weed/admin/dash/plugin_api.go +++ b/weed/admin/dash/plugin_api.go @@ -32,6 +32,42 @@ const ( maxPluginRunTimeout = 30 * time.Minute ) +func matchesPluginLane(jobType, laneFilter string) bool { + laneFilter = strings.TrimSpace(laneFilter) + if laneFilter == "" { + return true + } + return plugin.JobTypeLane(jobType) == plugin.SchedulerLane(laneFilter) +} + +func filterTrackedJobsByLane(jobs []plugin.TrackedJob, laneFilter string) []plugin.TrackedJob { + if strings.TrimSpace(laneFilter) == "" { + return jobs + } + + filtered := make([]plugin.TrackedJob, 0, len(jobs)) + 
for _, job := range jobs { + if matchesPluginLane(job.JobType, laneFilter) { + filtered = append(filtered, job) + } + } + return filtered +} + +func filterActivitiesByLane(activities []plugin.JobActivity, laneFilter string) []plugin.JobActivity { + if strings.TrimSpace(laneFilter) == "" { + return activities + } + + filtered := make([]plugin.JobActivity, 0, len(activities)) + for _, activity := range activities { + if matchesPluginLane(activity.JobType, laneFilter) { + filtered = append(filtered, activity) + } + } + return filtered +} + // GetPluginStatusAPI returns plugin status. func (s *AdminServer) GetPluginStatusAPI(w http.ResponseWriter, r *http.Request) { plugin := s.GetPlugin() @@ -53,16 +89,35 @@ func (s *AdminServer) GetPluginStatusAPI(w http.ResponseWriter, r *http.Request) } // GetPluginWorkersAPI returns currently connected plugin workers. +// Accepts an optional ?lane= query parameter to filter by scheduler lane. func (s *AdminServer) GetPluginWorkersAPI(w http.ResponseWriter, r *http.Request) { workers := s.GetPluginWorkers() if workers == nil { writeJSON(w, http.StatusOK, []interface{}{}) return } + + laneFilter := strings.TrimSpace(r.URL.Query().Get("lane")) + if laneFilter != "" { + lane := plugin.SchedulerLane(laneFilter) + filtered := make([]*plugin.WorkerSession, 0, len(workers)) + for _, worker := range workers { + for jobType := range worker.Capabilities { + if plugin.JobTypeLane(jobType) == lane { + filtered = append(filtered, worker) + break + } + } + } + writeJSON(w, http.StatusOK, filtered) + return + } + writeJSON(w, http.StatusOK, workers) } // GetPluginJobTypesAPI returns known plugin job types from workers and persisted data. +// Accepts an optional ?lane= query parameter to filter by scheduler lane. 
func (s *AdminServer) GetPluginJobTypesAPI(w http.ResponseWriter, r *http.Request) { jobTypes, err := s.ListPluginJobTypes() if err != nil { @@ -73,6 +128,20 @@ func (s *AdminServer) GetPluginJobTypesAPI(w http.ResponseWriter, r *http.Reques writeJSON(w, http.StatusOK, []interface{}{}) return } + + laneFilter := strings.TrimSpace(r.URL.Query().Get("lane")) + if laneFilter != "" { + lane := plugin.SchedulerLane(laneFilter) + filtered := make([]plugin.JobTypeInfo, 0, len(jobTypes)) + for _, jt := range jobTypes { + if plugin.JobTypeLane(jt.JobType) == lane { + filtered = append(filtered, jt) + } + } + writeJSON(w, http.StatusOK, filtered) + return + } + writeJSON(w, http.StatusOK, jobTypes) } @@ -81,13 +150,14 @@ func (s *AdminServer) GetPluginJobsAPI(w http.ResponseWriter, r *http.Request) { query := r.URL.Query() jobType := strings.TrimSpace(query.Get("job_type")) state := strings.TrimSpace(query.Get("state")) + laneFilter := strings.TrimSpace(query.Get("lane")) limit := parsePositiveInt(query.Get("limit"), 200) jobs := s.ListPluginJobs(jobType, state, limit) if jobs == nil { writeJSON(w, http.StatusOK, []interface{}{}) return } - writeJSON(w, http.StatusOK, jobs) + writeJSON(w, http.StatusOK, filterTrackedJobsByLane(jobs, laneFilter)) } // GetPluginJobAPI returns one tracked job. 
@@ -176,18 +246,21 @@ func (s *AdminServer) ExpirePluginJobAPI(w http.ResponseWriter, r *http.Request) func (s *AdminServer) GetPluginActivitiesAPI(w http.ResponseWriter, r *http.Request) { query := r.URL.Query() jobType := strings.TrimSpace(query.Get("job_type")) + laneFilter := strings.TrimSpace(query.Get("lane")) limit := parsePositiveInt(query.Get("limit"), 500) activities := s.ListPluginActivities(jobType, limit) if activities == nil { writeJSON(w, http.StatusOK, []interface{}{}) return } - writeJSON(w, http.StatusOK, activities) + writeJSON(w, http.StatusOK, filterActivitiesByLane(activities, laneFilter)) } // GetPluginSchedulerStatesAPI returns per-job-type scheduler status for monitoring. +// Accepts optional ?job_type= and ?lane= query parameters. func (s *AdminServer) GetPluginSchedulerStatesAPI(w http.ResponseWriter, r *http.Request) { jobTypeFilter := strings.TrimSpace(r.URL.Query().Get("job_type")) + laneFilter := strings.TrimSpace(r.URL.Query().Get("lane")) states, err := s.ListPluginSchedulerStates() if err != nil { @@ -195,12 +268,16 @@ func (s *AdminServer) GetPluginSchedulerStatesAPI(w http.ResponseWriter, r *http return } - if jobTypeFilter != "" { + if jobTypeFilter != "" || laneFilter != "" { filtered := make([]interface{}, 0, len(states)) for _, state := range states { - if state.JobType == jobTypeFilter { - filtered = append(filtered, state) + if jobTypeFilter != "" && state.JobType != jobTypeFilter { + continue } + if laneFilter != "" && state.Lane != laneFilter { + continue + } + filtered = append(filtered, state) } writeJSON(w, http.StatusOK, filtered) return @@ -215,6 +292,7 @@ func (s *AdminServer) GetPluginSchedulerStatesAPI(w http.ResponseWriter, r *http } // GetPluginSchedulerStatusAPI returns scheduler status including in-process jobs and lock state. +// Accepts optional ?lane= query parameter to scope to a specific lane. 
func (s *AdminServer) GetPluginSchedulerStatusAPI(w http.ResponseWriter, r *http.Request) { pluginSvc := s.GetPlugin() if pluginSvc == nil { @@ -224,6 +302,21 @@ func (s *AdminServer) GetPluginSchedulerStatusAPI(w http.ResponseWriter, r *http return } + laneFilter := strings.TrimSpace(r.URL.Query().Get("lane")) + + if laneFilter != "" { + lane := plugin.SchedulerLane(laneFilter) + response := map[string]interface{}{ + "enabled": true, + "scheduler": pluginSvc.GetLaneSchedulerStatus(lane), + } + if s.pluginLock != nil { + response["lock"] = s.pluginLock.Status() + } + writeJSON(w, http.StatusOK, response) + return + } + response := map[string]interface{}{ "enabled": true, "scheduler": pluginSvc.GetSchedulerStatus(), @@ -235,6 +328,28 @@ func (s *AdminServer) GetPluginSchedulerStatusAPI(w http.ResponseWriter, r *http writeJSON(w, http.StatusOK, response) } +// GetPluginLanesAPI returns all scheduler lanes and their current status. +func (s *AdminServer) GetPluginLanesAPI(w http.ResponseWriter, r *http.Request) { + pluginSvc := s.GetPlugin() + if pluginSvc == nil { + writeJSON(w, http.StatusOK, []interface{}{}) + return + } + + lanes := plugin.AllLanes() + result := make([]map[string]interface{}, 0, len(lanes)) + for _, lane := range lanes { + laneStatus := pluginSvc.GetLaneSchedulerStatus(lane) + result = append(result, map[string]interface{}{ + "lane": string(lane), + "idle_sleep_sec": int(plugin.LaneIdleSleep(lane) / time.Second), + "job_types": plugin.LaneJobTypes(lane), + "status": laneStatus, + }) + } + writeJSON(w, http.StatusOK, result) +} + // RequestPluginJobTypeSchemaAPI asks a worker for one job type schema. 
func (s *AdminServer) RequestPluginJobTypeSchemaAPI(w http.ResponseWriter, r *http.Request) { jobType := strings.TrimSpace(mux.Vars(r)["jobType"]) diff --git a/weed/admin/dash/plugin_api_test.go b/weed/admin/dash/plugin_api_test.go index 5e535382a..3637f3dab 100644 --- a/weed/admin/dash/plugin_api_test.go +++ b/weed/admin/dash/plugin_api_test.go @@ -220,3 +220,39 @@ func TestApplyDescriptorDefaultsToPersistedConfigReplacesBlankAdminScript(t *tes t.Fatalf("expected blank script to be replaced by default, got=%q", scriptKind.StringValue) } } + +func TestFilterTrackedJobsByLane(t *testing.T) { + t.Parallel() + + jobs := []plugin.TrackedJob{ + {JobID: "vacuum-1", JobType: "vacuum"}, + {JobID: "iceberg-1", JobType: "iceberg_maintenance"}, + {JobID: "lifecycle-1", JobType: "s3_lifecycle"}, + } + + filtered := filterTrackedJobsByLane(jobs, "iceberg") + if len(filtered) != 1 { + t.Fatalf("expected 1 iceberg job, got %d", len(filtered)) + } + if filtered[0].JobID != "iceberg-1" { + t.Fatalf("expected iceberg job to be retained, got %+v", filtered[0]) + } +} + +func TestFilterActivitiesByLane(t *testing.T) { + t.Parallel() + + activities := []plugin.JobActivity{ + {JobID: "vacuum-1", JobType: "vacuum"}, + {JobID: "iceberg-1", JobType: "iceberg_maintenance"}, + {JobID: "lifecycle-1", JobType: "s3_lifecycle"}, + } + + filtered := filterActivitiesByLane(activities, "lifecycle") + if len(filtered) != 1 { + t.Fatalf("expected 1 lifecycle activity, got %d", len(filtered)) + } + if filtered[0].JobID != "lifecycle-1" { + t.Fatalf("expected lifecycle activity to be retained, got %+v", filtered[0]) + } +} diff --git a/weed/admin/handlers/admin_handlers.go b/weed/admin/handlers/admin_handlers.go index 6594a0e0f..650df45cc 100644 --- a/weed/admin/handlers/admin_handlers.go +++ b/weed/admin/handlers/admin_handlers.go @@ -147,6 +147,13 @@ func (h *AdminHandlers) registerUIRoutes(r *mux.Router) { r.HandleFunc("/plugin/detection", 
h.pluginHandlers.ShowPluginDetection).Methods(http.MethodGet) r.HandleFunc("/plugin/execution", h.pluginHandlers.ShowPluginExecution).Methods(http.MethodGet) r.HandleFunc("/plugin/monitoring", h.pluginHandlers.ShowPluginMonitoring).Methods(http.MethodGet) + r.HandleFunc("/plugin/lanes/{lane}", h.pluginHandlers.ShowPluginLane).Methods(http.MethodGet) + r.HandleFunc("/plugin/lanes/{lane}/configuration", h.pluginHandlers.ShowPluginLaneConfiguration).Methods(http.MethodGet) + r.HandleFunc("/plugin/lanes/{lane}/queue", h.pluginHandlers.ShowPluginLaneQueue).Methods(http.MethodGet) + r.HandleFunc("/plugin/lanes/{lane}/detection", h.pluginHandlers.ShowPluginLaneDetection).Methods(http.MethodGet) + r.HandleFunc("/plugin/lanes/{lane}/execution", h.pluginHandlers.ShowPluginLaneExecution).Methods(http.MethodGet) + r.HandleFunc("/plugin/lanes/{lane}/monitoring", h.pluginHandlers.ShowPluginLaneMonitoring).Methods(http.MethodGet) + r.HandleFunc("/plugin/lanes/{lane}/workers", h.pluginHandlers.ShowPluginLaneWorkers).Methods(http.MethodGet) } func (h *AdminHandlers) registerAPIRoutes(api *mux.Router, enforceWrite bool) { @@ -245,6 +252,7 @@ func (h *AdminHandlers) registerAPIRoutes(api *mux.Router, enforceWrite bool) { pluginApi := api.PathPrefix("/plugin").Subrouter() pluginApi.HandleFunc("/status", h.adminServer.GetPluginStatusAPI).Methods(http.MethodGet) + pluginApi.HandleFunc("/lanes", h.adminServer.GetPluginLanesAPI).Methods(http.MethodGet) pluginApi.HandleFunc("/workers", h.adminServer.GetPluginWorkersAPI).Methods(http.MethodGet) pluginApi.HandleFunc("/job-types", h.adminServer.GetPluginJobTypesAPI).Methods(http.MethodGet) pluginApi.HandleFunc("/jobs", h.adminServer.GetPluginJobsAPI).Methods(http.MethodGet) diff --git a/weed/admin/handlers/admin_handlers_routes_test.go b/weed/admin/handlers/admin_handlers_routes_test.go index ff4632bf6..a34102d7e 100644 --- a/weed/admin/handlers/admin_handlers_routes_test.go +++ b/weed/admin/handlers/admin_handlers_routes_test.go @@ -53,6 
+53,16 @@ func TestSetupRoutes_RegistersPluginPages_NoAuth(t *testing.T) { assertHasRoute(t, router, http.MethodGet, "/plugin/detection") assertHasRoute(t, router, http.MethodGet, "/plugin/execution") assertHasRoute(t, router, http.MethodGet, "/plugin/monitoring") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/default") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/default/configuration") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/default/queue") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/default/detection") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/default/execution") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/default/monitoring") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/default/workers") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/lifecycle") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/lifecycle/configuration") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/lifecycle/workers") } func TestSetupRoutes_RegistersPluginPages_WithAuth(t *testing.T) { @@ -66,6 +76,13 @@ func TestSetupRoutes_RegistersPluginPages_WithAuth(t *testing.T) { assertHasRoute(t, router, http.MethodGet, "/plugin/detection") assertHasRoute(t, router, http.MethodGet, "/plugin/execution") assertHasRoute(t, router, http.MethodGet, "/plugin/monitoring") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/iceberg") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/iceberg/configuration") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/iceberg/queue") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/iceberg/detection") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/iceberg/execution") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/iceberg/monitoring") + assertHasRoute(t, router, http.MethodGet, "/plugin/lanes/iceberg/workers") } func newRouteTestAdminHandlers() *AdminHandlers { diff --git 
a/weed/admin/handlers/plugin_handlers.go b/weed/admin/handlers/plugin_handlers.go index b4d0ec74b..1c6964abe 100644 --- a/weed/admin/handlers/plugin_handlers.go +++ b/weed/admin/handlers/plugin_handlers.go @@ -4,7 +4,9 @@ import ( "bytes" "net/http" + "github.com/gorilla/mux" "github.com/seaweedfs/seaweedfs/weed/admin/dash" + adminplugin "github.com/seaweedfs/seaweedfs/weed/admin/plugin" "github.com/seaweedfs/seaweedfs/weed/admin/view/app" "github.com/seaweedfs/seaweedfs/weed/admin/view/layout" ) @@ -52,9 +54,73 @@ func (h *PluginHandlers) ShowPluginMonitoring(w http.ResponseWriter, r *http.Req h.renderPluginPage(w, r, "detection") } +// ShowPluginLane displays a lane overview page using the shared plugin UI. +func (h *PluginHandlers) ShowPluginLane(w http.ResponseWriter, r *http.Request) { + h.renderPluginPageWithLane(w, r, "overview") +} + +// ShowPluginLaneConfiguration displays a lane-specific configuration page. +func (h *PluginHandlers) ShowPluginLaneConfiguration(w http.ResponseWriter, r *http.Request) { + h.renderPluginPageWithLane(w, r, "configuration") +} + +// ShowPluginLaneQueue displays a lane-specific queue page. +func (h *PluginHandlers) ShowPluginLaneQueue(w http.ResponseWriter, r *http.Request) { + h.renderPluginPageWithLane(w, r, "queue") +} + +// ShowPluginLaneDetection displays a lane-specific detection page. +func (h *PluginHandlers) ShowPluginLaneDetection(w http.ResponseWriter, r *http.Request) { + h.renderPluginPageWithLane(w, r, "detection") +} + +// ShowPluginLaneExecution displays a lane-specific execution page. +func (h *PluginHandlers) ShowPluginLaneExecution(w http.ResponseWriter, r *http.Request) { + h.renderPluginPageWithLane(w, r, "execution") +} + +// ShowPluginLaneMonitoring displays a lane-specific monitoring page. +func (h *PluginHandlers) ShowPluginLaneMonitoring(w http.ResponseWriter, r *http.Request) { + // Backward-compatible alias for the old monitoring URL. 
+ h.renderPluginPageWithLane(w, r, "detection") +} + +// ShowPluginLaneWorkers displays workers filtered to a specific scheduler lane. +func (h *PluginHandlers) ShowPluginLaneWorkers(w http.ResponseWriter, r *http.Request) { + // Backward-compatible alias for the old lane overview URL. + h.renderPluginPageWithLane(w, r, "overview") +} + +func (h *PluginHandlers) renderPluginPageWithLane(w http.ResponseWriter, r *http.Request, page string) { + initialJob := r.URL.Query().Get("job") + lane := mux.Vars(r)["lane"] + component := app.Plugin(page, initialJob, lane) + viewCtx := layout.NewViewContext(r, dash.UsernameFromContext(r.Context()), dash.CSRFTokenFromContext(r.Context())) + layoutComponent := layout.Layout(viewCtx, component) + + var buf bytes.Buffer + if err := layoutComponent.Render(r.Context(), &buf); err != nil { + writeJSONError(w, http.StatusInternalServerError, "Failed to render template: "+err.Error()) + return + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + w.WriteHeader(http.StatusOK) + _, _ = w.Write(buf.Bytes()) +} + func (h *PluginHandlers) renderPluginPage(w http.ResponseWriter, r *http.Request, page string) { initialJob := r.URL.Query().Get("job") - component := app.Plugin(page, initialJob) + lane := r.URL.Query().Get("lane") + if lane == "" && initialJob != "" { + // Derive lane from job type so that e.g. ?job=iceberg_maintenance + // scopes the page to the iceberg lane automatically. 
+ lane = string(adminplugin.JobTypeLane(initialJob)) + } + if lane == "" { + lane = "default" + } + component := app.Plugin(page, initialJob, lane) viewCtx := layout.NewViewContext(r, dash.UsernameFromContext(r.Context()), dash.CSRFTokenFromContext(r.Context())) layoutComponent := layout.Layout(viewCtx, component) diff --git a/weed/admin/plugin/plugin.go b/weed/admin/plugin/plugin.go index 4351c2172..e14e7ae41 100644 --- a/weed/admin/plugin/plugin.go +++ b/weed/admin/plugin/plugin.go @@ -69,6 +69,8 @@ type Plugin struct { detectorLeaseMu sync.Mutex detectorLeases map[string]string + lanes map[SchedulerLane]*schedulerLaneState + schedulerExecMu sync.Mutex schedulerExecReservations map[string]int adminScriptRunMu sync.RWMutex @@ -158,6 +160,11 @@ func New(options Options) (*Plugin, error) { schedulerTick = defaultSchedulerTick } + lanes := make(map[SchedulerLane]*schedulerLaneState, len(AllLanes())) + for _, lane := range AllLanes() { + lanes[lane] = newLaneState(lane) + } + plugin := &Plugin{ store: store, registry: NewRegistry(), @@ -167,6 +174,7 @@ func New(options Options) (*Plugin, error) { clusterContextProvider: options.ClusterContextProvider, configDefaultsProvider: options.ConfigDefaultsProvider, lockManager: options.LockManager, + lanes: lanes, sessions: make(map[string]*streamSession), pendingSchema: make(map[string]chan *plugin_pb.ConfigSchemaResponse), pendingDetection: make(map[string]*pendingDetectionState), @@ -191,8 +199,10 @@ func New(options Options) (*Plugin, error) { } if plugin.clusterContextProvider != nil { - plugin.wg.Add(1) - go plugin.schedulerLoop() + for _, ls := range plugin.lanes { + plugin.wg.Add(1) + go plugin.laneSchedulerLoop(ls) + } } plugin.wg.Add(1) go plugin.persistenceLoop() diff --git a/weed/admin/plugin/plugin_scheduler.go b/weed/admin/plugin/plugin_scheduler.go index 06bab7e83..27a16f7b9 100644 --- a/weed/admin/plugin/plugin_scheduler.go +++ b/weed/admin/plugin/plugin_scheduler.go @@ -48,7 +48,9 @@ type schedulerPolicy 
struct { ExecutorReserveBackoff time.Duration } -func (r *Plugin) schedulerLoop() { +// laneSchedulerLoop is the main scheduling goroutine for a single lane. +// Each lane runs independently with its own timing, lock scope, and wake channel. +func (r *Plugin) laneSchedulerLoop(ls *schedulerLaneState) { defer r.wg.Done() for { select { @@ -57,16 +59,16 @@ func (r *Plugin) schedulerLoop() { default: } - hadJobs := r.runSchedulerIteration() - r.recordSchedulerIterationComplete(hadJobs) + hadJobs := r.runLaneSchedulerIteration(ls) + r.recordLaneIterationComplete(ls, hadJobs) if hadJobs { continue } - r.setSchedulerLoopState("", "sleeping") - idleSleep := defaultSchedulerIdleSleep - if nextRun := r.earliestNextDetectionAt(); !nextRun.IsZero() { + r.setLaneLoopState(ls, "", "sleeping") + idleSleep := LaneIdleSleep(ls.lane) + if nextRun := r.earliestLaneDetectionAt(ls.lane); !nextRun.IsZero() { if until := time.Until(nextRun); until <= 0 { idleSleep = 0 } else if until < idleSleep { @@ -82,7 +84,7 @@ func (r *Plugin) schedulerLoop() { case <-r.shutdownCh: timer.Stop() return - case <-r.schedulerWakeCh: + case <-ls.wakeCh: if !timer.Stop() { <-timer.C } @@ -92,7 +94,90 @@ func (r *Plugin) schedulerLoop() { } } +// schedulerLoop is kept for backward compatibility; it delegates to +// laneSchedulerLoop with the default lane. New code should not call this. +func (r *Plugin) schedulerLoop() { + ls := r.lanes[LaneDefault] + if ls == nil { + ls = newLaneState(LaneDefault) + } + r.laneSchedulerLoop(ls) +} + +// runLaneSchedulerIteration runs one scheduling pass for a single lane, +// processing only the job types assigned to that lane. +func (r *Plugin) runLaneSchedulerIteration(ls *schedulerLaneState) bool { + r.expireStaleJobs(time.Now().UTC()) + + allJobTypes := r.registry.DetectableJobTypes() + // Filter to only job types belonging to this lane. 
+ var jobTypes []string + for _, jt := range allJobTypes { + if JobTypeLane(jt) == ls.lane { + jobTypes = append(jobTypes, jt) + } + } + if len(jobTypes) == 0 { + r.setLaneLoopState(ls, "", "idle") + return false + } + + r.setLaneLoopState(ls, "", "waiting_for_lock") + lockName := fmt.Sprintf("plugin scheduler:%s", ls.lane) + releaseLock, err := r.acquireAdminLock(lockName) + if err != nil { + glog.Warningf("Plugin scheduler [%s] failed to acquire lock: %v", ls.lane, err) + r.setLaneLoopState(ls, "", "idle") + return false + } + if releaseLock != nil { + defer releaseLock() + } + + active := make(map[string]struct{}, len(jobTypes)) + hadJobs := false + + for _, jobType := range jobTypes { + active[jobType] = struct{}{} + + policy, enabled, err := r.loadSchedulerPolicy(jobType) + if err != nil { + glog.Warningf("Plugin scheduler [%s] failed to load policy for %s: %v", ls.lane, jobType, err) + continue + } + if !enabled { + r.clearSchedulerJobType(jobType) + continue + } + initialDelay := time.Duration(0) + if runInfo := r.snapshotSchedulerRun(jobType); runInfo.lastRunStartedAt.IsZero() { + initialDelay = 5 * time.Second + } + if !r.markDetectionDue(jobType, policy.DetectionInterval, initialDelay) { + continue + } + + detected := r.runJobTypeIteration(jobType, policy) + if detected { + hadJobs = true + } + } + + r.pruneSchedulerState(active) + r.pruneDetectorLeases(active) + r.setLaneLoopState(ls, "", "idle") + return hadJobs +} + +// runSchedulerIteration is kept for backward compatibility. It runs a +// single iteration across ALL job types (equivalent to the old single-loop +// behavior). schedulerLoop() no longer calls it; it delegates to laneSchedulerLoop. func (r *Plugin) runSchedulerIteration() bool { + ls := r.lanes[LaneDefault] + if ls == nil { + ls = newLaneState(LaneDefault) + } + // For backward compat, the old function processes all job types.
r.expireStaleJobs(time.Now().UTC()) jobTypes := r.registry.DetectableJobTypes() @@ -147,16 +232,38 @@ func (r *Plugin) runSchedulerIteration() bool { return hadJobs } -func (r *Plugin) wakeScheduler() { +// wakeLane wakes the scheduler goroutine for a specific lane. +func (r *Plugin) wakeLane(lane SchedulerLane) { if r == nil { return } - select { - case r.schedulerWakeCh <- struct{}{}: - default: + if ls, ok := r.lanes[lane]; ok { + select { + case ls.wakeCh <- struct{}{}: + default: + } + } +} + +// wakeAllLanes wakes all lane scheduler goroutines. +func (r *Plugin) wakeAllLanes() { + if r == nil { + return + } + for _, ls := range r.lanes { + select { + case ls.wakeCh <- struct{}{}: + default: + } } } +// wakeScheduler wakes every lane scheduler goroutine; it takes no job type and +// cannot target a single lane (use wakeLane). Kept for backward compatibility. +func (r *Plugin) wakeScheduler() { + r.wakeAllLanes() +} + func (r *Plugin) runJobTypeIteration(jobType string, policy schedulerPolicy) bool { r.recordSchedulerRunStart(jobType) r.clearWaitingJobQueue(jobType) @@ -454,6 +561,7 @@ func (r *Plugin) ListSchedulerStates() ([]SchedulerJobTypeState, error) { jobType := jobTypeInfo.JobType state := SchedulerJobTypeState{ JobType: jobType, + Lane: string(JobTypeLane(jobType)), DetectionInFlight: detectionInFlight[jobType], } @@ -573,6 +681,35 @@ func (r *Plugin) markDetectionDue(jobType string, interval, initialDelay time.Du return true } +// earliestLaneDetectionAt returns the earliest next detection time among +// job types that belong to the given lane.
+func (r *Plugin) earliestLaneDetectionAt(lane SchedulerLane) time.Time { + if r == nil { + return time.Time{} + } + + r.schedulerMu.Lock() + defer r.schedulerMu.Unlock() + + var earliest time.Time + for jobType, nextRun := range r.nextDetectionAt { + if JobTypeLane(jobType) != lane { + continue + } + if nextRun.IsZero() { + continue + } + if earliest.IsZero() || nextRun.Before(earliest) { + earliest = nextRun + } + } + + return earliest +} + +// earliestNextDetectionAt returns the earliest next detection time across +// all job types regardless of lane. Kept for backward compatibility and +// the global scheduler status API. func (r *Plugin) earliestNextDetectionAt() time.Time { if r == nil { return time.Time{} @@ -868,6 +1005,17 @@ func (r *Plugin) tryReserveExecutorCapacity( executor *WorkerSession, jobType string, policy schedulerPolicy, +) (func(), bool) { + return r.tryReserveExecutorCapacityForLane(executor, jobType, policy, JobTypeLane(jobType)) +} + +// tryReserveExecutorCapacityForLane reserves an execution slot on the +// per-lane reservation pool so that lanes cannot starve each other. +func (r *Plugin) tryReserveExecutorCapacityForLane( + executor *WorkerSession, + jobType string, + policy schedulerPolicy, + lane SchedulerLane, ) (func(), bool) { if executor == nil || strings.TrimSpace(executor.WorkerID) == "" { return nil, false @@ -884,21 +1032,60 @@ func (r *Plugin) tryReserveExecutorCapacity( workerID := strings.TrimSpace(executor.WorkerID) - r.schedulerExecMu.Lock() - reserved := r.schedulerExecReservations[workerID] - if heartbeatUsed+reserved >= limit { + ls := r.lanes[lane] + if ls == nil { + // Fallback to global reservations if lane state is missing. 
+ r.schedulerExecMu.Lock() + reserved := r.schedulerExecReservations[workerID] + if heartbeatUsed+reserved >= limit { + r.schedulerExecMu.Unlock() + return nil, false + } + r.schedulerExecReservations[workerID] = reserved + 1 r.schedulerExecMu.Unlock() + release := func() { r.releaseExecutorCapacity(workerID) } + return release, true + } + + ls.execMu.Lock() + reserved := ls.execRes[workerID] + if heartbeatUsed+reserved >= limit { + ls.execMu.Unlock() return nil, false } - r.schedulerExecReservations[workerID] = reserved + 1 - r.schedulerExecMu.Unlock() + ls.execRes[workerID] = reserved + 1 + ls.execMu.Unlock() release := func() { - r.releaseExecutorCapacity(workerID) + r.releaseExecutorCapacityForLane(workerID, lane) } return release, true } +// releaseExecutorCapacityForLane releases a reservation from the per-lane pool. +func (r *Plugin) releaseExecutorCapacityForLane(workerID string, lane SchedulerLane) { + workerID = strings.TrimSpace(workerID) + if workerID == "" { + return + } + + ls := r.lanes[lane] + if ls == nil { + r.releaseExecutorCapacity(workerID) + return + } + + ls.execMu.Lock() + defer ls.execMu.Unlock() + + current := ls.execRes[workerID] + if current <= 1 { + delete(ls.execRes, workerID) + return + } + ls.execRes[workerID] = current - 1 +} + func (r *Plugin) releaseExecutorCapacity(workerID string) { workerID = strings.TrimSpace(workerID) if workerID == "" { diff --git a/weed/admin/plugin/scheduler_lane.go b/weed/admin/plugin/scheduler_lane.go new file mode 100644 index 000000000..efa808e73 --- /dev/null +++ b/weed/admin/plugin/scheduler_lane.go @@ -0,0 +1,109 @@ +package plugin + +import ( + "sync" + "time" +) + +// SchedulerLane identifies an independent scheduling track. Each lane runs +// its own goroutine, maintains its own detection timing, and acquires its +// own admin lock so that workloads in different lanes never block each other. 
+type SchedulerLane string + +const ( + // LaneDefault handles volume management operations (vacuum, balance, + // erasure coding) and admin scripts. It is the fallback lane for any + // job type that is not explicitly mapped elsewhere. + LaneDefault SchedulerLane = "default" + + // LaneIceberg handles table-bucket Iceberg compaction and maintenance. + LaneIceberg SchedulerLane = "iceberg" + + // LaneLifecycle handles S3 object store lifecycle management + // (expiration, transition, abort incomplete multipart uploads). + LaneLifecycle SchedulerLane = "lifecycle" +) + +// AllLanes returns every defined scheduler lane in a stable order. +func AllLanes() []SchedulerLane { + return []SchedulerLane{LaneDefault, LaneIceberg, LaneLifecycle} +} + +// laneIdleSleep maps each lane to its default idle sleep duration. +// Each lane can sleep for a different amount when no work is detected, +// independent of the per-job-type DetectionInterval. +var laneIdleSleep = map[SchedulerLane]time.Duration{ + LaneDefault: 61 * time.Second, + LaneIceberg: 61 * time.Second, + LaneLifecycle: 5 * time.Minute, +} + +// LaneIdleSleep returns the idle sleep duration for the given lane, +// falling back to defaultSchedulerIdleSleep if the lane is unknown. +func LaneIdleSleep(lane SchedulerLane) time.Duration { + if d, ok := laneIdleSleep[lane]; ok { + return d + } + return defaultSchedulerIdleSleep +} + +// jobTypeLaneMap is the hardcoded mapping from job type to scheduler lane. +// Job types not present here are assigned to LaneDefault. +var jobTypeLaneMap = map[string]SchedulerLane{ + // Volume management (default lane) + "vacuum": LaneDefault, + "volume_balance": LaneDefault, + "ec_balance": LaneDefault, + "erasure_coding": LaneDefault, + "admin_script": LaneDefault, + + // Iceberg table maintenance + "iceberg_maintenance": LaneIceberg, + + // S3 lifecycle management + "s3_lifecycle": LaneLifecycle, +} + +// JobTypeLane returns the scheduler lane for the given job type. 
+// Unknown job types are assigned to LaneDefault. +func JobTypeLane(jobType string) SchedulerLane { + if lane, ok := jobTypeLaneMap[jobType]; ok { + return lane + } + return LaneDefault +} + +// LaneJobTypes returns the known job types assigned to the given lane, in unspecified (map iteration) order. +func LaneJobTypes(lane SchedulerLane) []string { + var result []string + for jobType, l := range jobTypeLaneMap { + if l == lane { + result = append(result, jobType) + } + } + return result +} + +// schedulerLaneState holds the per-lane runtime state used by the scheduler. +type schedulerLaneState struct { + lane SchedulerLane + wakeCh chan struct{} + + loopMu sync.Mutex + loop schedulerLoopState + + // Per-lane execution reservation pool. Each lane tracks how many + // execution slots it has reserved on each worker independently, + // so lanes cannot starve each other. + execMu sync.Mutex + execRes map[string]int +} + +// newLaneState creates a schedulerLaneState for the given lane. +func newLaneState(lane SchedulerLane) *schedulerLaneState { + return &schedulerLaneState{ + lane: lane, + wakeCh: make(chan struct{}, 1), + execRes: make(map[string]int), + } +} diff --git a/weed/admin/plugin/scheduler_lane_test.go b/weed/admin/plugin/scheduler_lane_test.go new file mode 100644 index 000000000..6217e7f22 --- /dev/null +++ b/weed/admin/plugin/scheduler_lane_test.go @@ -0,0 +1,47 @@ +package plugin + +import ( + "testing" +) + +func TestJobTypeLaneMapCoversKnownTypes(t *testing.T) { + // Every job type in the map must resolve to a valid lane.
+ for jobType, lane := range jobTypeLaneMap { + if lane != LaneDefault && lane != LaneIceberg && lane != LaneLifecycle { + t.Errorf("jobTypeLaneMap[%q] = %q, want a known lane", jobType, lane) + } + } +} + +func TestJobTypeLaneFallsBackToDefault(t *testing.T) { + if got := JobTypeLane("unknown_job_type"); got != LaneDefault { + t.Errorf("JobTypeLane(unknown) = %q, want %q", got, LaneDefault) + } +} + +func TestAllLanesHaveIdleSleep(t *testing.T) { + for _, lane := range AllLanes() { + if d := LaneIdleSleep(lane); d <= 0 { + t.Errorf("LaneIdleSleep(%q) = %v, want > 0", lane, d) + } + } +} + +func TestKnownJobTypesInMap(t *testing.T) { + // Ensure the well-known job types are mapped. This catches drift + // if a handler's job type string changes without updating the map. + expected := map[string]SchedulerLane{ + "vacuum": LaneDefault, + "volume_balance": LaneDefault, + "ec_balance": LaneDefault, + "erasure_coding": LaneDefault, + "admin_script": LaneDefault, + "iceberg_maintenance": LaneIceberg, + "s3_lifecycle": LaneLifecycle, + } + for jobType, wantLane := range expected { + if got := JobTypeLane(jobType); got != wantLane { + t.Errorf("JobTypeLane(%q) = %q, want %q", jobType, got, wantLane) + } + } +} diff --git a/weed/admin/plugin/scheduler_status.go b/weed/admin/plugin/scheduler_status.go index 75ae55aa9..e448a2800 100644 --- a/weed/admin/plugin/scheduler_status.go +++ b/weed/admin/plugin/scheduler_status.go @@ -8,6 +8,7 @@ import ( type SchedulerStatus struct { Now time.Time `json:"now"` + Lane string `json:"lane,omitempty"` SchedulerTickSeconds int `json:"scheduler_tick_seconds"` IdleSleepSeconds int `json:"idle_sleep_seconds,omitempty"` NextDetectionAt *time.Time `json:"next_detection_at,omitempty"` @@ -213,21 +214,104 @@ func (r *Plugin) snapshotSchedulerLoopState() schedulerLoopState { return r.schedulerLoopState } -func (r *Plugin) GetSchedulerStatus() SchedulerStatus { +// aggregateLaneLoopStates merges per-lane loop states into a single +// 
schedulerLoopState for the aggregate GetSchedulerStatus API. It picks +// the most recent iteration completion, any currently-active job type, +// and a phase that reflects whether any lane is actively working. +func (r *Plugin) aggregateLaneLoopStates() schedulerLoopState { + if r == nil || len(r.lanes) == 0 { + return r.snapshotSchedulerLoopState() + } + + var agg schedulerLoopState + for _, ls := range r.lanes { + snap := r.snapshotLaneLoopState(ls) + if snap.lastIterationCompleted.After(agg.lastIterationCompleted) { + agg.lastIterationCompleted = snap.lastIterationCompleted + } + if snap.lastIterationHadJobs { + agg.lastIterationHadJobs = true + } + // Prefer showing an active phase over idle/sleeping. + if snap.currentJobType != "" { + agg.currentJobType = snap.currentJobType + agg.currentPhase = snap.currentPhase + } + } + // If no lane is actively processing, show the most interesting phase. + if agg.currentPhase == "" { + for _, ls := range r.lanes { + snap := r.snapshotLaneLoopState(ls) + if snap.currentPhase != "" { + agg.currentPhase = snap.currentPhase + break + } + } + } + return agg +} + +// --- Per-lane loop state helpers --- + +func (r *Plugin) setLaneLoopState(ls *schedulerLaneState, jobType, phase string) { + if r == nil || ls == nil { + return + } + ls.loopMu.Lock() + ls.loop.currentJobType = jobType + ls.loop.currentPhase = phase + ls.loopMu.Unlock() +} + +func (r *Plugin) recordLaneIterationComplete(ls *schedulerLaneState, hadJobs bool) { + if r == nil || ls == nil { + return + } + ls.loopMu.Lock() + ls.loop.lastIterationHadJobs = hadJobs + ls.loop.lastIterationCompleted = time.Now().UTC() + ls.loopMu.Unlock() +} + +func (r *Plugin) snapshotLaneLoopState(ls *schedulerLaneState) schedulerLoopState { + if r == nil || ls == nil { + return schedulerLoopState{} + } + ls.loopMu.Lock() + defer ls.loopMu.Unlock() + return ls.loop +} + +// GetLaneSchedulerStatus returns scheduler status scoped to a single lane. 
+func (r *Plugin) GetLaneSchedulerStatus(lane SchedulerLane) SchedulerStatus { + ls := r.lanes[lane] + if ls == nil { + return SchedulerStatus{Now: time.Now().UTC()} + } now := time.Now().UTC() - loopState := r.snapshotSchedulerLoopState() + loopState := r.snapshotLaneLoopState(ls) + idleSleep := LaneIdleSleep(lane) + allInProcess := r.listInProcessJobs(now) + laneInProcess := make([]SchedulerJobStatus, 0, len(allInProcess)) + for _, job := range allInProcess { + if JobTypeLane(job.JobType) == lane { + laneInProcess = append(laneInProcess, job) + } + } + status := SchedulerStatus{ Now: now, + Lane: string(lane), SchedulerTickSeconds: int(secondsFromDuration(r.schedulerTick)), - InProcessJobs: r.listInProcessJobs(now), - IdleSleepSeconds: int(defaultSchedulerIdleSleep / time.Second), + InProcessJobs: laneInProcess, + IdleSleepSeconds: int(idleSleep / time.Second), CurrentJobType: loopState.currentJobType, CurrentPhase: loopState.currentPhase, LastIterationHadJobs: loopState.lastIterationHadJobs, } - nextDetectionAt := r.earliestNextDetectionAt() + nextDetectionAt := r.earliestLaneDetectionAt(lane) if nextDetectionAt.IsZero() && loopState.currentPhase == "sleeping" && !loopState.lastIterationCompleted.IsZero() { - nextDetectionAt = loopState.lastIterationCompleted.Add(defaultSchedulerIdleSleep) + nextDetectionAt = loopState.lastIterationCompleted.Add(idleSleep) } if !nextDetectionAt.IsZero() { at := nextDetectionAt @@ -243,6 +327,92 @@ func (r *Plugin) GetSchedulerStatus() SchedulerStatus { return status } + waiting := make([]SchedulerWaitingStatus, 0) + jobTypes := make([]SchedulerJobTypeStatus, 0) + + for _, state := range states { + if JobTypeLane(state.JobType) != lane { + continue + } + jobType := state.JobType + info := r.snapshotSchedulerDetection(jobType) + + jobStatus := SchedulerJobTypeStatus{ + JobType: jobType, + Enabled: state.Enabled, + DetectionInFlight: state.DetectionInFlight, + NextDetectionAt: state.NextDetectionAt, + DetectionIntervalSeconds: 
state.DetectionIntervalSeconds, + } + if !info.lastDetectedAt.IsZero() { + jobStatus.LastDetectedAt = timeToPtr(info.lastDetectedAt) + jobStatus.LastDetectedCount = info.lastDetectedCount + } + if info.lastError != "" { + jobStatus.LastDetectionError = info.lastError + } + if info.lastSkippedReason != "" { + jobStatus.LastDetectionSkipped = info.lastSkippedReason + } + jobTypes = append(jobTypes, jobStatus) + + if state.DetectionInFlight { + waiting = append(waiting, SchedulerWaitingStatus{ + Reason: "detection_in_flight", + JobType: jobType, + }) + } else if state.Enabled && state.NextDetectionAt != nil && now.Before(*state.NextDetectionAt) { + waiting = append(waiting, SchedulerWaitingStatus{ + Reason: "next_detection_at", + JobType: jobType, + Until: state.NextDetectionAt, + }) + } + } + + sort.Slice(jobTypes, func(i, j int) bool { + return jobTypes[i].JobType < jobTypes[j].JobType + }) + + status.Waiting = waiting + status.JobTypes = jobTypes + return status +} + +func (r *Plugin) GetSchedulerStatus() SchedulerStatus { + now := time.Now().UTC() + + // Aggregate loop state across all lanes instead of reading the + // legacy single-loop state which is no longer updated. 
+ aggregated := r.aggregateLaneLoopStates() + + status := SchedulerStatus{ + Now: now, + SchedulerTickSeconds: int(secondsFromDuration(r.schedulerTick)), + InProcessJobs: r.listInProcessJobs(now), + IdleSleepSeconds: int(defaultSchedulerIdleSleep / time.Second), + CurrentJobType: aggregated.currentJobType, + CurrentPhase: aggregated.currentPhase, + LastIterationHadJobs: aggregated.lastIterationHadJobs, + } + nextDetectionAt := r.earliestNextDetectionAt() + if nextDetectionAt.IsZero() && aggregated.currentPhase == "sleeping" && !aggregated.lastIterationCompleted.IsZero() { + nextDetectionAt = aggregated.lastIterationCompleted.Add(defaultSchedulerIdleSleep) + } + if !nextDetectionAt.IsZero() { + at := nextDetectionAt + status.NextDetectionAt = &at + } + if !aggregated.lastIterationCompleted.IsZero() { + at := aggregated.lastIterationCompleted + status.LastIterationDoneAt = &at + } + + states, err := r.ListSchedulerStates() + if err != nil { + return status + } + waiting := make([]SchedulerWaitingStatus, 0) jobTypes := make([]SchedulerJobTypeStatus, 0, len(states)) diff --git a/weed/admin/plugin/types.go b/weed/admin/plugin/types.go index fcac0b2c6..d4804ac95 100644 --- a/weed/admin/plugin/types.go +++ b/weed/admin/plugin/types.go @@ -83,6 +83,7 @@ type JobDetail struct { type SchedulerJobTypeState struct { JobType string `json:"job_type"` + Lane string `json:"lane"` Enabled bool `json:"enabled"` PolicyError string `json:"policy_error,omitempty"` DetectionInFlight bool `json:"detection_in_flight"` diff --git a/weed/admin/view/app/plugin.templ b/weed/admin/view/app/plugin.templ index e1b897057..25aa883f7 100644 --- a/weed/admin/view/app/plugin.templ +++ b/weed/admin/view/app/plugin.templ @@ -1,19 +1,25 @@ package app -templ Plugin(page string, initialJob string) { +templ Plugin(page string, initialJob string, lane string) { {{ currentPage := page if currentPage == "" { currentPage = "overview" } + currentLane := lane + if currentLane == "" { + currentLane = "default" + 
} + laneTitle := pluginLaneTitle(currentLane) + laneDescription := pluginLaneDescription(currentLane) }} -
+
-

Workers

-

Cluster-wide worker status, per-job configuration, detection, queue, and execution workflows.

+

{ laneTitle }

+

{ laneDescription }

Workers

0

Active Jobs

0

Activities (recent)

0

Next Run

-

Per Job Type Summary
Job TypeActive JobsRecent Activities
Loading...
Scheduler State
Sequential scheduler with per-job runtime limits
Job TypeEnabledDetectorIn FlightMax RuntimeExec GlobalExec/WorkerExecutor WorkersEffective ExecLast Run
Loading...
Workers
WorkerAddressCapabilitiesLoad
Loading...
Job Type Configuration
Not loaded
Selected Job Type
-
Descriptor
Select a job type to load schema and config.
Admin Config Form
No admin form loaded.
Worker Config Form
No worker form loaded.
Job Scheduling Settings
How often to check for new work.
Next Run
Scheduler
-
Not scheduled
Run History
Keep last 10 success + last 10 errors
Successful Runs
TimeJob IDWorkerDuration
No data
Error Runs
TimeJob IDWorkerError
No data
Detection Results
Run detection to see proposals.
Job Queue
States: pending/assigned/running
Job IDTypeStateProgressWorkerUpdatedMessage
Loading...
Detection Jobs
Detection activities for selected job type
TimeJob TypeRequest IDWorkerStageSourceMessage
Loading...
Execution Jobs
Job IDTypeStateProgressWorkerUpdatedMessage
Loading...
Execution Activities
Non-detection events only
TimeJob TypeJob IDSourceStageMessage
Loading...
Job Detail
Select a job to view details.
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "\" data-plugin-lane=\"") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var4 string + templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(currentLane) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/app/plugin.templ`, Line: 16, Col: 144} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "\">

") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var5 string + templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(laneTitle) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/app/plugin.templ`, Line: 21, Col: 84} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "

") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var6 string + templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(laneDescription) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/app/plugin.templ`, Line: 22, Col: 68} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "

Workers

0

Active Jobs

0

Activities (recent)

0

Next Run

-

Per Job Type Summary
Job TypeActive JobsRecent Activities
Loading...
Scheduler State
Sequential scheduler with per-job runtime limits
Job TypeEnabledDetectorIn FlightMax RuntimeExec GlobalExec/WorkerExecutor WorkersEffective ExecLast Run
Loading...
Workers
WorkerAddressCapabilitiesLoad
Loading...
Job Type Configuration
Not loaded
Selected Job Type
-
Descriptor
Select a job type to load schema and config.
Admin Config Form
No admin form loaded.
Worker Config Form
No worker form loaded.
Job Scheduling Settings
How often to check for new work.
Next Run
Scheduler
-
Not scheduled
Run History
Keep last 10 success + last 10 errors
Successful Runs
TimeJob IDWorkerDuration
No data
Error Runs
TimeJob IDWorkerError
No data
Detection Results
Run detection to see proposals.
Job Queue
States: pending/assigned/running
Job IDTypeStateProgressWorkerUpdatedMessage
Loading...
Detection Jobs
Detection activities for selected job type
TimeJob TypeRequest IDWorkerStageSourceMessage
Loading...
Execution Jobs
Job IDTypeStateProgressWorkerUpdatedMessage
Loading...
Execution Activities
Non-detection events only
TimeJob TypeJob IDSourceStageMessage
Loading...
Job Detail
Select a job to view details.
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } diff --git a/weed/admin/view/app/template_helpers.go b/weed/admin/view/app/template_helpers.go index eca78f136..14814a9bd 100644 --- a/weed/admin/view/app/template_helpers.go +++ b/weed/admin/view/app/template_helpers.go @@ -3,6 +3,7 @@ package app import ( "fmt" "strconv" + "strings" ) // getStatusColor returns Bootstrap color class for status @@ -66,3 +67,39 @@ func calculatePercent(current, max int) int { } return (current * 100) / max } + +func pluginLaneDisplayName(lane string) string { + normalized := strings.TrimSpace(strings.ToLower(lane)) + if normalized == "" || normalized == "default" { + return "Default" + } + + parts := strings.Fields(strings.ReplaceAll(normalized, "_", " ")) + if len(parts) == 0 { + return "Default" + } + + for i, part := range parts { + if part == "" { + continue + } + parts[i] = strings.ToUpper(part[:1]) + part[1:] + } + + return strings.Join(parts, " ") +} + +func pluginLaneTitle(lane string) string { + return pluginLaneDisplayName(lane) + " Workers" +} + +func pluginLaneDescription(lane string) string { + switch strings.TrimSpace(strings.ToLower(lane)) { + case "iceberg": + return "Iceberg maintenance workers, scheduler state, queue, and execution flows." + case "lifecycle": + return "Lifecycle workers, scheduler state, queue, and execution flows." + default: + return "Default workers, scheduler state, queue, and execution flows." + } +} diff --git a/weed/admin/view/layout/layout.templ b/weed/admin/view/layout/layout.templ index b3e847d79..d013ed099 100644 --- a/weed/admin/view/layout/layout.templ +++ b/weed/admin/view/layout/layout.templ @@ -28,14 +28,22 @@ templ Layout(view ViewContext, content templ.Component) { // Detect if we're on a message queue page to keep submenu expanded isMQPage := strings.HasPrefix(currentPath, "/mq/") - // Detect if we're on plugin page. 
- isPluginPage := strings.HasPrefix(currentPath, "/plugin") - // Detect if we're on a storage page to keep submenu expanded isStoragePage := strings.HasPrefix(currentPath, "/storage/volumes") || strings.HasPrefix(currentPath, "/storage/ec-shards") || strings.HasPrefix(currentPath, "/storage/collections") // Detect if we're on a cluster page (but not storage page) to keep submenu expanded isClusterPage := (strings.HasPrefix(currentPath, "/cluster/masters") || strings.HasPrefix(currentPath, "/cluster/volume-servers") || strings.HasPrefix(currentPath, "/cluster/filers")) + + isDefaultWorkerPage := currentPath == "/plugin" || + strings.HasPrefix(currentPath, "/plugin/configuration") || + strings.HasPrefix(currentPath, "/plugin/detection") || + strings.HasPrefix(currentPath, "/plugin/queue") || + strings.HasPrefix(currentPath, "/plugin/execution") || + strings.HasPrefix(currentPath, "/plugin/monitoring") || + currentPath == "/plugin/lanes/default" || + strings.HasPrefix(currentPath, "/plugin/lanes/default/") + isIcebergWorkerPage := currentPath == "/plugin/lanes/iceberg" || strings.HasPrefix(currentPath, "/plugin/lanes/iceberg/") + isLifecycleWorkerPage := currentPath == "/plugin/lanes/lifecycle" || strings.HasPrefix(currentPath, "/plugin/lanes/lifecycle/") }} @@ -266,13 +274,35 @@ templ Layout(view ViewContext, content templ.Component) {
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 67, "
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -673,127 +763,127 @@ func Layout(view ViewContext, content templ.Component) templ.Component { if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 58, "
© ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 68, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 79, "\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -817,140 +907,140 @@ func LoginForm(title string, errorMessage string, csrfToken string) templ.Compon }() } ctx = templ.InitializeContext(ctx) - templ_7745c5c3_Var51 := templ.GetChildren(ctx) - if templ_7745c5c3_Var51 == nil { - templ_7745c5c3_Var51 = templ.NopComponent + templ_7745c5c3_Var55 := templ.GetChildren(ctx) + if templ_7745c5c3_Var55 == nil { + templ_7745c5c3_Var55 = templ.NopComponent } ctx = templ.ClearChildren(ctx) prefix := dash.URLPrefixFromContext(ctx) - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 70, "") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 80, "<!doctype html><html lang=\"en\"><head><meta charset=\"UTF-8\"><title>") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - var templ_7745c5c3_Var52 string - templ_7745c5c3_Var52, templ_7745c5c3_Err = templ.JoinStringErrs(title) + var templ_7745c5c3_Var56 string + templ_7745c5c3_Var56, templ_7745c5c3_Err = templ.JoinStringErrs(title) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/layout/layout.templ`, Line: 333, Col: 17} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/layout/layout.templ`, Line: 363, Col: 17} } - _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var52)) + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var56)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 71, " - Login

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 84, "\" rel=\"stylesheet\">

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - var templ_7745c5c3_Var56 string - templ_7745c5c3_Var56, templ_7745c5c3_Err = templ.JoinStringErrs(title) + var templ_7745c5c3_Var60 string + templ_7745c5c3_Var60, templ_7745c5c3_Err = templ.JoinStringErrs(title) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/layout/layout.templ`, Line: 347, Col: 57} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/layout/layout.templ`, Line: 377, Col: 57} } - _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var56)) + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var60)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 75, "

Please sign in to continue

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 85, "

Please sign in to continue

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } if errorMessage != "" { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 76, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 86, "
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - var templ_7745c5c3_Var57 string - templ_7745c5c3_Var57, templ_7745c5c3_Err = templ.JoinStringErrs(errorMessage) + var templ_7745c5c3_Var61 string + templ_7745c5c3_Var61, templ_7745c5c3_Err = templ.JoinStringErrs(errorMessage) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/layout/layout.templ`, Line: 354, Col: 45} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `view/layout/layout.templ`, Line: 384, Col: 45} } - _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var57)) + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var61)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 77, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 87, "
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 78, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 91, "\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } diff --git a/weed/plugin/worker/iceberg/exec_test.go b/weed/plugin/worker/iceberg/exec_test.go index 38a83c4a6..93c4e3455 100644 --- a/weed/plugin/worker/iceberg/exec_test.go +++ b/weed/plugin/worker/iceberg/exec_test.go @@ -245,7 +245,11 @@ func startFakeFilerWithAddress(t *testing.T) (*fakeFilerServer, filer_pb.Seaweed time.Sleep(10 * time.Millisecond) } - return fakeServer, client, listener.Addr().String() + // Return the address in ServerAddress format (host:httpPort.grpcPort) + // so that dialFiler resolves it correctly via ToGrpcAddress(). + _, portStr, _ := net.SplitHostPort(listener.Addr().String()) + serverAddr := fmt.Sprintf("127.0.0.1:0.%s", portStr) + return fakeServer, client, serverAddr } // --------------------------------------------------------------------------- @@ -1001,8 +1005,9 @@ func TestConnectToFilerSkipsUnreachableAddresses(t *testing.T) { if err != nil { t.Fatalf("listen for dead address: %v", err) } - deadAddr := deadListener.Addr().String() + _, deadPortStr, _ := net.SplitHostPort(deadListener.Addr().String()) _ = deadListener.Close() + deadAddr := fmt.Sprintf("127.0.0.1:0.%s", deadPortStr) addr, conn, err := handler.connectToFiler(context.Background(), []string{deadAddr, liveAddr}) if err != nil { @@ -1022,8 +1027,9 @@ func TestConnectToFilerFailsWhenAllAddressesAreUnreachable(t *testing.T) { if err != nil { t.Fatalf("listen for dead address: %v", err) } - deadAddr := deadListener.Addr().String() + _, deadPortStr, _ := net.SplitHostPort(deadListener.Addr().String()) _ = deadListener.Close() + deadAddr := fmt.Sprintf("127.0.0.1:0.%s", deadPortStr) _, _, err = handler.connectToFiler(context.Background(), []string{deadAddr}) if err == nil { diff --git a/weed/plugin/worker/iceberg/handler.go b/weed/plugin/worker/iceberg/handler.go index f0be05d89..8dd121895 100644 --- 
a/weed/plugin/worker/iceberg/handler.go +++ b/weed/plugin/worker/iceberg/handler.go @@ -643,7 +643,7 @@ func (h *Handler) dialFiler(ctx context.Context, address string) (*grpc.ClientCo opCtx, opCancel := context.WithTimeout(ctx, filerConnectTimeout) defer opCancel() - conn, err := pb.GrpcDial(opCtx, address, false, h.grpcDialOption) + conn, err := pb.GrpcDial(opCtx, pb.ServerAddress(address).ToGrpcAddress(), false, h.grpcDialOption) if err != nil { return nil, err } diff --git a/weed/plugin/worker/lifecycle/config.go b/weed/plugin/worker/lifecycle/config.go new file mode 100644 index 000000000..62e0b4dbf --- /dev/null +++ b/weed/plugin/worker/lifecycle/config.go @@ -0,0 +1,131 @@ +package lifecycle + +import ( + "strconv" + "strings" + + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb" +) + +const ( + jobType = "s3_lifecycle" + + defaultBatchSize = 1000 + defaultMaxDeletesPerBucket = 10000 + defaultDryRun = false + defaultDeleteMarkerCleanup = true + defaultAbortMPUDaysDefault = 7 + + MetricObjectsExpired = "objects_expired" + MetricObjectsScanned = "objects_scanned" + MetricBucketsScanned = "buckets_scanned" + MetricBucketsWithRules = "buckets_with_rules" + MetricDeleteMarkersClean = "delete_markers_cleaned" + MetricMPUAborted = "mpu_aborted" + MetricErrors = "errors" + MetricDurationMs = "duration_ms" +) + +// Config holds parsed worker config values for lifecycle management. +type Config struct { + BatchSize int64 + MaxDeletesPerBucket int64 + DryRun bool + DeleteMarkerCleanup bool + AbortMPUDays int64 +} + +// ParseConfig extracts a lifecycle Config from plugin config values. 
+// Non-positive batch_size and max_deletes_per_bucket values, and a negative
+// abort_mpu_days, are replaced by the package defaults.
+func ParseConfig(values map[string]*plugin_pb.ConfigValue) Config {
+	cfg := Config{
+		BatchSize:           readInt64Config(values, "batch_size", defaultBatchSize),
+		MaxDeletesPerBucket: readInt64Config(values, "max_deletes_per_bucket", defaultMaxDeletesPerBucket),
+		DryRun:              readBoolConfig(values, "dry_run", defaultDryRun),
+		DeleteMarkerCleanup: readBoolConfig(values, "delete_marker_cleanup", defaultDeleteMarkerCleanup),
+		AbortMPUDays:        readInt64Config(values, "abort_mpu_days", defaultAbortMPUDaysDefault),
+	}
+
+	// Clamp out-of-range values back to the defaults. AbortMPUDays == 0 is
+	// kept as-is; only negative values are reset.
+	if cfg.BatchSize <= 0 {
+		cfg.BatchSize = defaultBatchSize
+	}
+	if cfg.MaxDeletesPerBucket <= 0 {
+		cfg.MaxDeletesPerBucket = defaultMaxDeletesPerBucket
+	}
+	if cfg.AbortMPUDays < 0 {
+		cfg.AbortMPUDays = defaultAbortMPUDaysDefault
+	}
+
+	return cfg
+}
+
+// readStringConfig returns the value of field as a string. String values are
+// returned unchanged and int64 values are formatted in base 10. A nil map, a
+// missing field, or any other value kind yields fallback.
+func readStringConfig(values map[string]*plugin_pb.ConfigValue, field string, fallback string) string {
+	if values == nil {
+		return fallback
+	}
+	value := values[field]
+	if value == nil {
+		return fallback
+	}
+	switch kind := value.Kind.(type) {
+	case *plugin_pb.ConfigValue_StringValue:
+		return kind.StringValue
+	case *plugin_pb.ConfigValue_Int64Value:
+		return strconv.FormatInt(kind.Int64Value, 10)
+	default:
+		glog.V(1).Infof("readStringConfig: unexpected type %T for field %q", value.Kind, field)
+	}
+	return fallback
+}
+
+// readBoolConfig returns the value of field as a bool. Bool values are
+// returned unchanged, int64 values are true when non-zero, and string values
+// are matched case-insensitively against true/1/yes and false/0/no. A nil
+// map, a missing field, an unrecognized string, or any other value kind
+// yields fallback.
+func readBoolConfig(values map[string]*plugin_pb.ConfigValue, field string, fallback bool) bool {
+	if values == nil {
+		return fallback
+	}
+	value := values[field]
+	if value == nil {
+		return fallback
+	}
+	switch kind := value.Kind.(type) {
+	case *plugin_pb.ConfigValue_BoolValue:
+		return kind.BoolValue
+	case *plugin_pb.ConfigValue_StringValue:
+		s := strings.TrimSpace(strings.ToLower(kind.StringValue))
+		if s == "true" || s == "1" || s == "yes" {
+			return true
+		}
+		if s == "false" || s == "0" || s == "no" {
+			return false
+		}
+		glog.V(1).Infof("readBoolConfig: unrecognized string value %q for field %q, using fallback %v", kind.StringValue, field, fallback)
+	case *plugin_pb.ConfigValue_Int64Value:
+		return kind.Int64Value != 0
+	default:
+		glog.V(1).Infof("readBoolConfig: unexpected config value type %T for field %q, using fallback %v", value.Kind, field, fallback)
+	}
+	return fallback
+}
+
+// readInt64Config returns the value of field as an int64. Int64 values are
+// returned unchanged, double values are converted with int64(), and string
+// values are trimmed and parsed as base-10 integers. A nil map, a missing
+// field, an unparsable string, or any other value kind yields fallback.
+func readInt64Config(values map[string]*plugin_pb.ConfigValue, field string, fallback int64) int64 {
+	if values == nil {
+		return fallback
+	}
+	value := values[field]
+	if value == nil {
+		return fallback
+	}
+	switch kind := value.Kind.(type) {
+	case *plugin_pb.ConfigValue_Int64Value:
+		return kind.Int64Value
+	case *plugin_pb.ConfigValue_DoubleValue:
+		return int64(kind.DoubleValue)
+	case *plugin_pb.ConfigValue_StringValue:
+		parsed, err := strconv.ParseInt(strings.TrimSpace(kind.StringValue), 10, 64)
+		if err == nil {
+			return parsed
+		}
+		// Log the rejected value, as readBoolConfig does, so a mistyped
+		// setting is visible instead of silently becoming the fallback.
+		glog.V(1).Infof("readInt64Config: cannot parse string value %q for field %q, using fallback %d: %v", kind.StringValue, field, fallback, err)
+	default:
+		glog.V(1).Infof("readInt64Config: unexpected config value type %T for field %q, using fallback %d", value.Kind, field, fallback)
+	}
+	return fallback
+}
diff --git a/weed/plugin/worker/lifecycle/detection.go b/weed/plugin/worker/lifecycle/detection.go
new file mode 100644
index 000000000..e88e680ca
--- /dev/null
+++ b/weed/plugin/worker/lifecycle/detection.go
@@ -0,0 +1,204 @@
+package lifecycle
+
+import (
+	"context"
+	"fmt"
+	"path"
+	"strings"
+
+	"github.com/seaweedfs/seaweedfs/weed/filer"
+	"github.com/seaweedfs/seaweedfs/weed/glog"
+	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
+	"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
+	"github.com/seaweedfs/seaweedfs/weed/util/wildcard"
+)
+
+// detectBucketsWithLifecycleRules scans all S3 buckets to find those
+// with lifecycle (TTL) rules configured in filer.conf.
+//
+// It lists the directories under defaultBucketsPath, keeps those whose name
+// matches bucketFilter, and emits one job proposal per bucket for which
+// filer.conf reports at least one TTL for the collection named after the
+// bucket. When maxResults > 0 the scan stops after that many proposals.
+// The config parameter is currently unused.
+//
+// If ctx is canceled mid-scan, the proposals collected so far are returned
+// together with ctx.Err().
+func (h *Handler) detectBucketsWithLifecycleRules(
+	ctx context.Context,
+	filerClient filer_pb.SeaweedFilerClient,
+	config Config,
+	bucketFilter string,
+	maxResults int,
+) ([]*plugin_pb.JobProposal, error) {
+	// Load filer configuration to find TTL rules.
+	fc, err := loadFilerConf(ctx, filerClient)
+	if err != nil {
+		return nil, fmt.Errorf("load filer conf: %w", err)
+	}
+
+	bucketsPath := defaultBucketsPath
+	bucketMatchers := wildcard.CompileWildcardMatchers(bucketFilter)
+
+	// List all buckets.
+	bucketEntries, err := listFilerEntries(ctx, filerClient, bucketsPath, "")
+	if err != nil {
+		return nil, fmt.Errorf("list buckets at %s: %w", bucketsPath, err)
+	}
+
+	var proposals []*plugin_pb.JobProposal
+	for _, entry := range bucketEntries {
+		// Non-blocking cancellation check once per bucket.
+		select {
+		case <-ctx.Done():
+			return proposals, ctx.Err()
+		default:
+		}
+
+		// Only directories under the buckets path are treated as buckets.
+		if !entry.IsDirectory {
+			continue
+		}
+		bucketName := entry.Name
+		if !wildcard.MatchesAnyWildcard(bucketMatchers, bucketName) {
+			continue
+		}
+
+		// Derive the collection name for this bucket.
+		collection := bucketName
+		ttls := fc.GetCollectionTtls(collection)
+		if len(ttls) == 0 {
+			continue
+		}
+
+		glog.V(2).Infof("s3_lifecycle: bucket %s has %d lifecycle rule(s)", bucketName, len(ttls))
+
+		// ProposalId and DedupeKey are both derived from the bucket name, so
+		// repeated detections of the same bucket produce the same keys.
+		proposal := &plugin_pb.JobProposal{
+			ProposalId: fmt.Sprintf("s3_lifecycle:%s", bucketName),
+			JobType:    jobType,
+			Summary:    fmt.Sprintf("Lifecycle management for bucket %s (%d rules)", bucketName, len(ttls)),
+			DedupeKey:  fmt.Sprintf("s3_lifecycle:%s", bucketName),
+			Parameters: map[string]*plugin_pb.ConfigValue{
+				"bucket":       {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: bucketName}},
+				"buckets_path": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: bucketsPath}},
+				"collection":   {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: collection}},
+				"rule_count":   {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(len(ttls))}},
+			},
+			Labels: map[string]string{
+				"bucket": bucketName,
+			},
+		}
+
+		proposals = append(proposals, proposal)
+		if maxResults > 0 && len(proposals) >= maxResults {
+			break
+		}
+	}
+
+	return proposals, nil
+}
+
+// defaultBucketsPath is the filer directory whose sub-directories are scanned
+// as buckets.
+const defaultBucketsPath = "/buckets"
+
+// loadFilerConf reads the filer configuration from the filer.
+// A read failure is logged and treated as "no filer.conf yet": an empty
+// configuration is returned with a nil error. A parse failure is returned
+// as an error.
+func loadFilerConf(ctx context.Context, client filer_pb.SeaweedFilerClient) (*filer.FilerConf, error) {
+	fc := filer.NewFilerConf()
+
+	content, err := filer.ReadInsideFiler(ctx, client, filer.DirectoryEtcSeaweedFS, filer.FilerConfName)
+	if err != nil {
+		// filer.conf may not exist yet - return empty config.
+		glog.V(1).Infof("s3_lifecycle: filer.conf not found or unreadable: %v (using empty config)", err)
+		return fc, nil
+	}
+	if err := fc.LoadFromBytes(content); err != nil {
+		return nil, fmt.Errorf("parse filer.conf: %w", err)
+	}
+
+	return fc, nil
+}
+
+// listFilerEntries lists directory entries from the filer.
+// Entries of dir are collected into a slice, starting from startFrom.
+// NOTE(review): the trailing 10000 passed to SeaweedList looks like a
+// per-call entry limit, so larger directories may be truncated - confirm.
+func listFilerEntries(ctx context.Context, client filer_pb.SeaweedFilerClient, dir, startFrom string) ([]*filer_pb.Entry, error) {
+	var entries []*filer_pb.Entry
+	err := filer_pb.SeaweedList(ctx, client, dir, "", func(entry *filer_pb.Entry, isLast bool) error {
+		entries = append(entries, entry)
+		return nil
+	}, startFrom, false, 10000)
+	return entries, err
+}
+
+// expiredObject identifies one expired entry by its parent directory and name.
+type expiredObject struct {
+	dir  string
+	name string
+}
+
+// listExpiredObjects scans a bucket directory tree for objects whose TTL
+// has expired based on their TtlSec attribute set by PutBucketLifecycle.
+//
+// It returns the expired objects found and the number of non-directory
+// entries scanned. When limit > 0 the walk stops as soon as limit expired
+// objects have been collected. On a listing error or context cancellation
+// the partial results are returned together with the error.
+func listExpiredObjects(
+	ctx context.Context,
+	client filer_pb.SeaweedFilerClient,
+	bucketsPath, bucket string,
+	limit int64,
+) ([]expiredObject, int64, error) {
+	var expired []expiredObject
+	var scanned int64
+
+	bucketPath := path.Join(bucketsPath, bucket)
+
+	// Walk the bucket directory tree using breadth-first traversal.
+	dirsToProcess := []string{bucketPath}
+	for len(dirsToProcess) > 0 {
+		select {
+		case <-ctx.Done():
+			return expired, scanned, ctx.Err()
+		default:
+		}
+
+		dir := dirsToProcess[0]
+		dirsToProcess = dirsToProcess[1:]
+
+		limitReached := false
+		err := filer_pb.SeaweedList(ctx, client, dir, "", func(entry *filer_pb.Entry, isLast bool) error {
+			if entry.IsDirectory {
+				dirsToProcess = append(dirsToProcess, path.Join(dir, entry.Name))
+				return nil
+			}
+			scanned++
+
+			if isExpiredByTTL(entry) {
+				expired = append(expired, expiredObject{
+					dir:  dir,
+					name: entry.Name,
+				})
+			}
+
+			if limit > 0 && int64(len(expired)) >= limit {
+				// Returning an error is how the callback stops the listing.
+				limitReached = true
+				return fmt.Errorf("limit reached")
+			}
+			return nil
+		}, "", false, 10000)
+
+		// Treat the error as our own stop signal only when this callback
+		// actually raised it. Matching on the text alone would also swallow
+		// a genuine listing error that happens to mention "limit reached".
+		stoppedByLimit := limitReached && err != nil && strings.Contains(err.Error(), "limit reached")
+		if err != nil && !stoppedByLimit {
+			return expired, scanned, fmt.Errorf("list %s: %w", dir, err)
+		}
+
+		if limitReached || (limit > 0 && int64(len(expired)) >= limit) {
+			break
+		}
+	}
+
+	return expired, scanned, nil
+}
+
+// isExpiredByTTL checks if an entry is expired based on its TTL attribute.
+// SeaweedFS sets TtlSec on entries when lifecycle rules are applied via
+// PutBucketLifecycleConfiguration. An entry is expired when
+// creation_time + TTL < now.
+// Entries that are nil, have no attributes, or have a non-positive TtlSec or
+// Crtime are never considered expired.
+func isExpiredByTTL(entry *filer_pb.Entry) bool {
+	if entry == nil || entry.Attributes == nil {
+		return false
+	}
+
+	ttlSec := entry.Attributes.TtlSec
+	if ttlSec <= 0 {
+		return false
+	}
+
+	crTime := entry.Attributes.Crtime
+	if crTime <= 0 {
+		return false
+	}
+
+	expirationUnix := crTime + int64(ttlSec)
+	return expirationUnix < nowUnix()
+}
diff --git a/weed/plugin/worker/lifecycle/execution.go b/weed/plugin/worker/lifecycle/execution.go
new file mode 100644
index 000000000..628623195
--- /dev/null
+++ b/weed/plugin/worker/lifecycle/execution.go
@@ -0,0 +1,328 @@
+package lifecycle
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"path"
+	"strings"
+	"time"
+
+	"github.com/seaweedfs/seaweedfs/weed/glog"
+	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
+	"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
+	pluginworker "github.com/seaweedfs/seaweedfs/weed/plugin/worker"
+	"github.com/seaweedfs/seaweedfs/weed/s3api/s3_constants"
+)
+
+// executionResult accumulates the counters reported for one bucket run.
+type executionResult struct {
+	objectsExpired     int64
+	objectsScanned     int64
+	deleteMarkersClean int64
+	mpuAborted         int64
+	errors             int64
+}
+
+// executeLifecycleForBucket processes lifecycle rules for a single bucket:
+//  1. Reads filer.conf to get TTL rules for the bucket's collection
+//  2. Walks the bucket directory tree to find expired objects
+//  3. Deletes expired objects in batches
+//  4. Optionally cleans up delete markers and aborts old multipart uploads
+//
+// config.MaxDeletesPerBucket is a budget shared by steps 3 and 4; failed
+// removals count against it too. When config.DryRun is set, the function
+// reports how many objects would be deleted and returns after step 2
+// without removing anything. The returned result is always non-nil and
+// holds the counters gathered so far, even when an error is returned.
+func (h *Handler) executeLifecycleForBucket(
+	ctx context.Context,
+	filerClient filer_pb.SeaweedFilerClient,
+	config Config,
+	bucket, bucketsPath string,
+	sender pluginworker.ExecutionSender,
+	jobID string,
+) (*executionResult, error) {
+	result := &executionResult{}
+
+	// Load filer.conf to verify TTL rules still exist.
+	fc, err := loadFilerConf(ctx, filerClient)
+	if err != nil {
+		return result, fmt.Errorf("load filer conf: %w", err)
+	}
+
+	collection := bucket
+	ttlRules := fc.GetCollectionTtls(collection)
+	if len(ttlRules) == 0 {
+		glog.V(1).Infof("s3_lifecycle: bucket %s has no lifecycle rules, skipping", bucket)
+		return result, nil
+	}
+
+	// Progress updates are best-effort throughout: a failed send must not
+	// abort the job, so SendProgress errors are deliberately ignored.
+	_ = sender.SendProgress(&plugin_pb.JobProgressUpdate{
+		JobId:           jobID,
+		JobType:         jobType,
+		State:           plugin_pb.JobState_JOB_STATE_RUNNING,
+		ProgressPercent: 10,
+		Stage:           "scanning",
+		Message:         fmt.Sprintf("scanning bucket %s for expired objects (%d rules)", bucket, len(ttlRules)),
+	})
+
+	// Shared budget across all phases so we don't exceed MaxDeletesPerBucket.
+	remaining := config.MaxDeletesPerBucket
+
+	// Find expired objects.
+	expired, scanned, err := listExpiredObjects(ctx, filerClient, bucketsPath, bucket, remaining)
+	result.objectsScanned = scanned
+	if err != nil {
+		return result, fmt.Errorf("list expired objects: %w", err)
+	}
+
+	if len(expired) > 0 {
+		glog.V(1).Infof("s3_lifecycle: bucket %s: found %d expired objects out of %d scanned", bucket, len(expired), scanned)
+	} else {
+		glog.V(1).Infof("s3_lifecycle: bucket %s: scanned %d objects, none expired", bucket, scanned)
+	}
+
+	// A dry run must stop here regardless of how many expired objects were
+	// found: every later phase (object deletion, delete marker cleanup, MPU
+	// abort) removes entries from the filer.
+	if config.DryRun {
+		result.objectsExpired = int64(len(expired))
+		_ = sender.SendProgress(&plugin_pb.JobProgressUpdate{
+			JobId:           jobID,
+			JobType:         jobType,
+			State:           plugin_pb.JobState_JOB_STATE_RUNNING,
+			ProgressPercent: 100,
+			Stage:           "dry_run",
+			Message:         fmt.Sprintf("dry run: would delete %d expired objects", len(expired)),
+		})
+		return result, nil
+	}
+
+	// Delete expired objects in batches.
+	if len(expired) > 0 {
+		_ = sender.SendProgress(&plugin_pb.JobProgressUpdate{
+			JobId:           jobID,
+			JobType:         jobType,
+			State:           plugin_pb.JobState_JOB_STATE_RUNNING,
+			ProgressPercent: 50,
+			Stage:           "deleting",
+			Message:         fmt.Sprintf("deleting %d expired objects", len(expired)),
+		})
+
+		// Clamp the configured batch size into the int range used for
+		// slicing.
+		var batchSize int
+		if config.BatchSize <= 0 {
+			batchSize = defaultBatchSize
+		} else if config.BatchSize > math.MaxInt {
+			batchSize = math.MaxInt
+		} else {
+			batchSize = int(config.BatchSize)
+		}
+
+		for i := 0; i < len(expired); i += batchSize {
+			select {
+			case <-ctx.Done():
+				return result, ctx.Err()
+			default:
+			}
+
+			end := i + batchSize
+			if end > len(expired) {
+				end = len(expired)
+			}
+			batch := expired[i:end]
+
+			deleted, errs, batchErr := deleteExpiredObjects(ctx, filerClient, batch)
+			result.objectsExpired += int64(deleted)
+			result.errors += int64(errs)
+
+			if batchErr != nil {
+				return result, batchErr
+			}
+
+			progress := float64(end)/float64(len(expired))*50 + 50 // 50-100%
+			_ = sender.SendProgress(&plugin_pb.JobProgressUpdate{
+				JobId:           jobID,
+				JobType:         jobType,
+				State:           plugin_pb.JobState_JOB_STATE_RUNNING,
+				ProgressPercent: progress,
+				Stage:           "deleting",
+				Message:         fmt.Sprintf("deleted %d/%d expired objects", result.objectsExpired, len(expired)),
+			})
+		}
+
+		// Both successful and failed deletions consume the shared budget.
+		remaining -= result.objectsExpired + result.errors
+		if remaining < 0 {
+			remaining = 0
+		}
+	}
+
+	// Delete marker cleanup.
+	if config.DeleteMarkerCleanup && remaining > 0 {
+		_ = sender.SendProgress(&plugin_pb.JobProgressUpdate{
+			JobId: jobID, JobType: jobType,
+			State: plugin_pb.JobState_JOB_STATE_RUNNING,
+			Stage: "cleaning_delete_markers", Message: "cleaning expired delete markers",
+		})
+		cleaned, cleanErrs, cleanCtxErr := cleanupDeleteMarkers(ctx, filerClient, bucketsPath, bucket, remaining)
+		result.deleteMarkersClean = int64(cleaned)
+		result.errors += int64(cleanErrs)
+		if cleanCtxErr != nil {
+			return result, cleanCtxErr
+		}
+		remaining -= int64(cleaned + cleanErrs)
+		if remaining < 0 {
+			remaining = 0
+		}
+	}
+
+	// Abort incomplete multipart uploads.
+	if config.AbortMPUDays > 0 && remaining > 0 {
+		_ = sender.SendProgress(&plugin_pb.JobProgressUpdate{
+			JobId: jobID, JobType: jobType,
+			State: plugin_pb.JobState_JOB_STATE_RUNNING,
+			Stage: "aborting_mpus", Message: fmt.Sprintf("aborting multipart uploads older than %d days", config.AbortMPUDays),
+		})
+		aborted, abortErrs, abortCtxErr := abortIncompleteMPUs(ctx, filerClient, bucketsPath, bucket, config.AbortMPUDays, remaining)
+		result.mpuAborted = int64(aborted)
+		result.errors += int64(abortErrs)
+		if abortCtxErr != nil {
+			return result, abortCtxErr
+		}
+	}
+
+	return result, nil
+}
+
+// cleanupDeleteMarkers scans the bucket for entries marked as delete markers
+// (via the S3 versioning extended attribute) and removes them.
+//
+// NOTE: This currently removes delete markers unconditionally without checking
+// whether prior non-expired versions exist. In versioned buckets, removing a
+// delete marker can resurface an older version. A future enhancement should
+// query version metadata before removal to match AWS ExpiredObjectDeleteMarker
+// semantics (only remove when no non-current versions remain).
+//
+// The bucket tree is walked breadth-first, skipping directories named
+// ".uploads". When limit > 0 the walk stops once cleaned+errors reaches
+// limit. Removal failures are logged and counted in errors. The error result
+// is non-nil when ctx is canceled between directories or a listing fails.
+func cleanupDeleteMarkers(
+	ctx context.Context,
+	client filer_pb.SeaweedFilerClient,
+	bucketsPath, bucket string,
+	limit int64,
+) (cleaned, errors int, ctxErr error) {
+	bucketPath := path.Join(bucketsPath, bucket)
+
+	dirsToProcess := []string{bucketPath}
+	for len(dirsToProcess) > 0 {
+		if ctx.Err() != nil {
+			return cleaned, errors, ctx.Err()
+		}
+
+		dir := dirsToProcess[0]
+		dirsToProcess = dirsToProcess[1:]
+
+		// limitHit records that the callback below stopped the listing on
+		// purpose, so that error is not confused with a listing failure.
+		limitHit := false
+		listErr := filer_pb.SeaweedList(ctx, client, dir, "", func(entry *filer_pb.Entry, isLast bool) error {
+			if entry.IsDirectory {
+				// Skip .uploads directories.
+				if entry.Name != ".uploads" {
+					dirsToProcess = append(dirsToProcess, path.Join(dir, entry.Name))
+				}
+				return nil
+			}
+
+			if isDeleteMarker(entry) {
+				if err := filer_pb.DoRemove(ctx, client, dir, entry.Name, true, false, false, false, nil); err != nil {
+					glog.V(1).Infof("s3_lifecycle: failed to remove delete marker %s/%s: %v", dir, entry.Name, err)
+					errors++
+				} else {
+					cleaned++
+				}
+			}
+
+			if limit > 0 && int64(cleaned+errors) >= limit {
+				// Returning an error is how the callback stops the listing.
+				limitHit = true
+				return fmt.Errorf("limit reached")
+			}
+			return nil
+		}, "", false, 10000)
+
+		// Treat the error as our own stop signal only when this callback
+		// actually raised it. Matching on the text alone would also swallow
+		// a genuine listing error that happens to mention "limit reached".
+		stoppedByLimit := limitHit && listErr != nil && strings.Contains(listErr.Error(), "limit reached")
+		if listErr != nil && !stoppedByLimit {
+			return cleaned, errors, fmt.Errorf("list %s: %w", dir, listErr)
+		}
+
+		if limit > 0 && int64(cleaned+errors) >= limit {
+			break
+		}
+	}
+	return cleaned, errors, nil
+}
+
+// isDeleteMarker checks if an entry is an S3 delete marker.
+// An entry qualifies when its extended attribute
+// s3_constants.ExtDeleteMarkerKey holds the string "true".
+func isDeleteMarker(entry *filer_pb.Entry) bool {
+	if entry == nil || entry.Extended == nil {
+		return false
+	}
+	return string(entry.Extended[s3_constants.ExtDeleteMarkerKey]) == "true"
+}
+
+// abortIncompleteMPUs scans the .uploads directory under a bucket and
+// removes multipart upload entries older than the specified number of days.
+//
+// Only sub-directories with a positive Crtime earlier than the cutoff are
+// removed. When limit > 0 the scan stops once aborted+errors reaches limit.
+// Removal failures are logged and counted in errors. The error result is
+// non-nil when the listing fails or ctx is canceled during the scan.
+func abortIncompleteMPUs(
+	ctx context.Context,
+	client filer_pb.SeaweedFilerClient,
+	bucketsPath, bucket string,
+	olderThanDays, limit int64,
+) (aborted, errors int, ctxErr error) {
+	uploadsDir := path.Join(bucketsPath, bucket, ".uploads")
+	cutoff := time.Now().Add(-time.Duration(olderThanDays) * 24 * time.Hour)
+
+	// limitHit records that the callback below stopped the listing on
+	// purpose, so that error is not confused with a listing failure.
+	limitHit := false
+	listErr := filer_pb.SeaweedList(ctx, client, uploadsDir, "", func(entry *filer_pb.Entry, isLast bool) error {
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+
+		if !entry.IsDirectory {
+			return nil
+		}
+
+		// Each subdirectory under .uploads is one multipart upload.
+		// Check the directory creation time.
+		if entry.Attributes != nil && entry.Attributes.Crtime > 0 {
+			created := time.Unix(entry.Attributes.Crtime, 0)
+			if created.Before(cutoff) {
+				uploadPath := path.Join(uploadsDir, entry.Name)
+				if err := filer_pb.DoRemove(ctx, client, uploadsDir, entry.Name, true, true, true, false, nil); err != nil {
+					glog.V(1).Infof("s3_lifecycle: failed to abort MPU %s: %v", uploadPath, err)
+					errors++
+				} else {
+					aborted++
+				}
+			}
+		}
+
+		if limit > 0 && int64(aborted+errors) >= limit {
+			// Returning an error is how the callback stops the listing.
+			limitHit = true
+			return fmt.Errorf("limit reached")
+		}
+		return nil
+	}, "", false, 10000)
+
+	// Treat the error as our own stop signal only when this callback
+	// actually raised it. Matching on the text alone would also swallow a
+	// genuine listing error that happens to mention "limit reached".
+	stoppedByLimit := limitHit && listErr != nil && strings.Contains(listErr.Error(), "limit reached")
+	if listErr != nil && !stoppedByLimit {
+		return aborted, errors, fmt.Errorf("list uploads in %s: %w", uploadsDir, listErr)
+	}
+
+	return aborted, errors, nil
+}
+
+// deleteExpiredObjects deletes a batch of expired objects from the filer.
+// Returns a non-nil error when the context is canceled mid-batch.
+// A failed removal is logged and counted in errors; it does not stop the
+// batch.
+func deleteExpiredObjects(
+	ctx context.Context,
+	client filer_pb.SeaweedFilerClient,
+	objects []expiredObject,
+) (deleted, errors int, ctxErr error) {
+	for i := range objects {
+		// Stop before the next removal once the context is done.
+		if cause := ctx.Err(); cause != nil {
+			return deleted, errors, cause
+		}
+
+		target := objects[i]
+		if removeErr := filer_pb.DoRemove(ctx, client, target.dir, target.name, true, false, false, false, nil); removeErr != nil {
+			glog.V(1).Infof("s3_lifecycle: failed to delete %s/%s: %v", target.dir, target.name, removeErr)
+			errors++
+		} else {
+			deleted++
+		}
+	}
+	return deleted, errors, nil
+}
+
+// nowUnix reports the current wall-clock time in seconds since the Unix epoch.
+func nowUnix() int64 {
+	now := time.Now()
+	return now.Unix()
+}
diff --git a/weed/plugin/worker/lifecycle/handler.go b/weed/plugin/worker/lifecycle/handler.go
new file mode 100644
index 000000000..22ab4d1ff
--- /dev/null
+++ b/weed/plugin/worker/lifecycle/handler.go
@@ -0,0 +1,380 @@
+package lifecycle
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/seaweedfs/seaweedfs/weed/glog"
+	"github.com/seaweedfs/seaweedfs/weed/pb"
+	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
+	"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
+	pluginworker "github.com/seaweedfs/seaweedfs/weed/plugin/worker"
+	"google.golang.org/grpc"
+	"google.golang.org/protobuf/types/known/timestamppb"
+)
+
+// init registers the s3_lifecycle handler factory with the plugin worker
+// registry under its job type and aliases.
+func init() {
+	build := func(opts pluginworker.HandlerBuildOptions) (pluginworker.JobHandler, error) {
+		return NewHandler(opts.GrpcDialOption), nil
+	}
+	factory := pluginworker.HandlerFactory{
+		JobType:  jobType,
+		Category: pluginworker.CategoryHeavy,
+		Aliases:  []string{"lifecycle", "s3-lifecycle", "s3.lifecycle"},
+		Build:    build,
+	}
+	pluginworker.RegisterHandler(factory)
+}
+
+// Handler is the JobHandler for S3 lifecycle management. It covers object
+// expiration, delete marker cleanup, and aborting incomplete multipart
+// uploads.
+type Handler struct {
+	// grpcDialOption is the dial option supplied to NewHandler.
+	grpcDialOption grpc.DialOption
+}
+
+// filerConnectTimeout is the timeout constant for filer connection attempts.
+const filerConnectTimeout = 5 * time.Second
+
+// NewHandler creates a new handler for S3 lifecycle management.
+func NewHandler(grpcDialOption grpc.DialOption) *Handler { + return &Handler{grpcDialOption: grpcDialOption} +} + +func (h *Handler) Capability() *plugin_pb.JobTypeCapability { + return &plugin_pb.JobTypeCapability{ + JobType: jobType, + CanDetect: true, + CanExecute: true, + MaxDetectionConcurrency: 1, + MaxExecutionConcurrency: 4, + DisplayName: "S3 Lifecycle", + Description: "Manages S3 object lifecycle: expiration of objects based on TTL rules, delete marker cleanup, and abort of incomplete multipart uploads", + Weight: 40, + } +} + +func (h *Handler) Descriptor() *plugin_pb.JobTypeDescriptor { + return &plugin_pb.JobTypeDescriptor{ + JobType: jobType, + DisplayName: "S3 Lifecycle Management", + Description: "Automated S3 object lifecycle management: expire objects by TTL rules, clean up expired delete markers, and abort stale multipart uploads", + Icon: "fas fa-hourglass-half", + DescriptorVersion: 1, + AdminConfigForm: &plugin_pb.ConfigForm{ + FormId: "s3-lifecycle-admin", + Title: "S3 Lifecycle Admin Config", + Description: "Admin-side controls for S3 lifecycle management scope.", + Sections: []*plugin_pb.ConfigSection{ + { + SectionId: "scope", + Title: "Scope", + Description: "Which buckets to include in lifecycle management.", + Fields: []*plugin_pb.ConfigField{ + { + Name: "bucket_filter", + Label: "Bucket Filter", + Description: "Wildcard pattern for bucket names to include (e.g. \"prod-*\"). 
Empty means all buckets.", + FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_STRING, + Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TEXT, + }, + }, + }, + }, + }, + WorkerConfigForm: &plugin_pb.ConfigForm{ + FormId: "s3-lifecycle-worker", + Title: "S3 Lifecycle Worker Config", + Description: "Worker-side controls for lifecycle execution behavior.", + Sections: []*plugin_pb.ConfigSection{ + { + SectionId: "execution", + Title: "Execution", + Description: "Controls for lifecycle rule execution.", + Fields: []*plugin_pb.ConfigField{ + { + Name: "batch_size", + Label: "Batch Size", + Description: "Number of entries to process per filer listing page.", + FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_INT64, + Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_NUMBER, + MinValue: configInt64(100), + MaxValue: configInt64(10000), + }, + { + Name: "max_deletes_per_bucket", + Label: "Max Deletes Per Bucket", + Description: "Maximum number of expired objects to delete per bucket in one execution run.", + FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_INT64, + Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_NUMBER, + MinValue: configInt64(100), + MaxValue: configInt64(1000000), + }, + { + Name: "dry_run", + Label: "Dry Run", + Description: "When enabled, detect expired objects but do not delete them.", + FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_BOOL, + Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TOGGLE, + }, + { + Name: "delete_marker_cleanup", + Label: "Delete Marker Cleanup", + Description: "Remove expired delete markers that have no non-current versions.", + FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_BOOL, + Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TOGGLE, + }, + { + Name: "abort_mpu_days", + Label: "Abort Incomplete MPU (days)", + Description: "Abort incomplete multipart uploads older than this many days. 
0 disables.",
							FieldType:   plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_INT64,
							Widget:      plugin_pb.ConfigWidget_CONFIG_WIDGET_NUMBER,
							MinValue:    configInt64(0),
							MaxValue:    configInt64(365),
						},
					},
				},
			},
		},
		AdminRuntimeDefaults: &plugin_pb.AdminRuntimeDefaults{
			Enabled:                       true,
			DetectionIntervalSeconds:      300, // 5 minutes
			DetectionTimeoutSeconds:       60,
			MaxJobsPerDetection:           100,
			GlobalExecutionConcurrency:    2,
			PerWorkerExecutionConcurrency: 2,
			RetryLimit:                    1,
			RetryBackoffSeconds:           10,
		},
		WorkerDefaultValues: map[string]*plugin_pb.ConfigValue{
			"batch_size":             {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultBatchSize}},
			"max_deletes_per_bucket": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMaxDeletesPerBucket}},
			"dry_run":                {Kind: &plugin_pb.ConfigValue_BoolValue{BoolValue: defaultDryRun}},
			"delete_marker_cleanup":  {Kind: &plugin_pb.ConfigValue_BoolValue{BoolValue: defaultDeleteMarkerCleanup}},
			"abort_mpu_days":         {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultAbortMPUDaysDefault}},
		},
	}
}

// Detect scans the filer for buckets that have lifecycle rules and sends the
// resulting job proposals, followed by a completion message, through sender.
//
// When the cluster context carries no filer addresses the run is reported as
// an empty but successful detection. An error is returned when req is nil, no
// filer can be reached, the bucket scan fails, or sending the proposals or the
// completion message fails. Activity messages are best effort: a failure to
// send one never aborts detection.
func (h *Handler) Detect(ctx context.Context, req *plugin_pb.RunDetectionRequest, sender pluginworker.DetectionSender) error {
	if req == nil {
		return fmt.Errorf("nil detection request")
	}

	config := ParseConfig(req.WorkerConfigValues)
	bucketFilter := readStringConfig(req.AdminConfigValues, "bucket_filter", "")

	filerAddresses := filerAddressesFromCluster(req.ClusterContext)
	if len(filerAddresses) == 0 {
		_ = sender.SendActivity(pluginworker.BuildDetectorActivity("skipped", "no filer addresses in cluster context", nil))
		return sendEmptyDetection(sender)
	}

	_ = sender.SendActivity(pluginworker.BuildDetectorActivity("connecting", "connecting to filer", nil))

	filerClient, filerConn, err := connectToFiler(ctx, filerAddresses, h.grpcDialOption)
	if err != nil {
		// %w keeps the underlying error inspectable with errors.Is/As, matching
		// how the scan error below is wrapped.
		return fmt.Errorf("failed to connect to any filer: %w", err)
	}
	defer filerConn.Close()

	// A non-positive limit falls back to 100, the same value the descriptor
	// uses for MaxJobsPerDetection.
	maxResults := int(req.MaxResults)
	if maxResults <= 0 {
		maxResults = 100
	}

	_ = sender.SendActivity(pluginworker.BuildDetectorActivity("scanning", "scanning buckets for lifecycle rules", nil))
	proposals, err := h.detectBucketsWithLifecycleRules(ctx, filerClient, config, bucketFilter, maxResults)
	if err != nil {
		_ = sender.SendActivity(pluginworker.BuildDetectorActivity("scan_error", fmt.Sprintf("error scanning buckets: %v", err), nil))
		return fmt.Errorf("detect lifecycle rules: %w", err)
	}

	_ = sender.SendActivity(pluginworker.BuildDetectorActivity("scan_complete",
		fmt.Sprintf("found %d bucket(s) with lifecycle rules", len(proposals)),
		map[string]*plugin_pb.ConfigValue{
			"buckets_found": configInt64(int64(len(proposals))),
		}))

	if err := sender.SendProposals(&plugin_pb.DetectionProposals{
		JobType:   jobType,
		Proposals: proposals,
		// A full batch is reported as "has more": the scan was capped at
		// maxResults, so further candidates may exist.
		HasMore: len(proposals) >= maxResults,
	}); err != nil {
		return err
	}

	return sender.SendComplete(&plugin_pb.DetectionComplete{
		JobType:        jobType,
		Success:        true,
		TotalProposals: int32(len(proposals)),
	})
}

// Execute runs the lifecycle pass for the single bucket named by the job's
// "bucket" and "buckets_path" parameters and reports the outcome via sender.
//
// An error is returned when the request is nil, a required parameter is
// missing, or no filer can be reached. A failure inside the lifecycle pass
// itself is not returned as an error: it is delivered through SendCompleted
// with Success=false, and only a failure to deliver that message is returned.
func (h *Handler) Execute(ctx context.Context, req *plugin_pb.ExecuteJobRequest, sender pluginworker.ExecutionSender) error {
	if req == nil || req.Job == nil {
		return fmt.Errorf("nil execution request")
	}

	job := req.Job
	config := ParseConfig(req.WorkerConfigValues)

	bucket := readParamString(job.Parameters, "bucket")
	bucketsPath := readParamString(job.Parameters, "buckets_path")
	if bucket == "" || bucketsPath == "" {
		return fmt.Errorf("missing bucket or buckets_path parameter")
	}

	filerAddresses := filerAddressesFromCluster(req.ClusterContext)
	if len(filerAddresses) == 0 {
		return fmt.Errorf("no filer addresses in cluster context")
	}

	filerClient, filerConn, err := connectToFiler(ctx, filerAddresses, h.grpcDialOption)
	if err != nil {
		return fmt.Errorf("failed to connect to any filer: %w", err)
	}
	defer filerConn.Close()

	// Progress updates are best effort; a failed send must not abort the job.
	_ = sender.SendProgress(&plugin_pb.JobProgressUpdate{
		JobId:           job.JobId,
		JobType:         jobType,
		State:           plugin_pb.JobState_JOB_STATE_ASSIGNED,
		ProgressPercent: 0,
		Stage:           "starting",
		Message:         fmt.Sprintf("executing lifecycle rules for bucket %s", bucket),
	})

	start := time.Now()
	result, execErr := h.executeLifecycleForBucket(ctx, filerClient, config, bucket, bucketsPath, sender, job.JobId)
	elapsed := time.Since(start)

	// The duration is always reported; the counters only when a result exists.
	metrics := map[string]*plugin_pb.ConfigValue{
		MetricDurationMs: configInt64(elapsed.Milliseconds()),
	}
	var scanned, expired, markersCleaned, mpuAborted, failed int64
	if result != nil {
		scanned = result.objectsScanned
		expired = result.objectsExpired
		markersCleaned = result.deleteMarkersClean
		mpuAborted = result.mpuAborted
		failed = result.errors

		metrics[MetricObjectsExpired] = configInt64(expired)
		metrics[MetricObjectsScanned] = configInt64(scanned)
		metrics[MetricDeleteMarkersClean] = configInt64(markersCleaned)
		metrics[MetricMPUAborted] = configInt64(mpuAborted)
		metrics[MetricErrors] = configInt64(failed)
	}

	// The job succeeds only if the pass returned no error and no per-object
	// failures were counted.
	success := execErr == nil && failed == 0

	var summary, errMsg string
	if execErr != nil {
		// A hard failure replaces the counter summary entirely.
		summary = fmt.Sprintf("lifecycle execution failed for bucket %s: %v", bucket, execErr)
		errMsg = execErr.Error()
	} else {
		summary = fmt.Sprintf("bucket %s: scanned %d objects, expired %d", bucket, scanned, expired)
		if markersCleaned > 0 {
			summary += fmt.Sprintf(", delete markers cleaned %d", markersCleaned)
		}
		if mpuAborted > 0 {
			summary += fmt.Sprintf(", MPUs aborted %d", mpuAborted)
		}
		if config.DryRun {
			summary += " (dry run)"
		}
		if failed > 0 {
			summary += fmt.Sprintf(" (%d errors)", failed)
			errMsg = fmt.Sprintf("%d objects failed to process", failed)
		}
	}

	return sender.SendCompleted(&plugin_pb.JobCompleted{
		JobId:        job.JobId,
		JobType:      jobType,
		Success:      success,
		ErrorMessage: errMsg,
		Result: &plugin_pb.JobResult{
			Summary:      summary,
			OutputValues: metrics,
		},
		CompletedAt: timestamppb.Now(),
	})
}

// connectToFiler tries each address in order and returns a client together
// with its connection for the first filer that answers a Ping. The caller owns
// the returned connection and must close it.
//
// It returns an error when addresses is empty, when ctx is already done, or
// when every address fails; in the last case the error is the one from the
// final attempt. A nil client/conn is never returned alongside a nil error.
//
// NOTE(review): both callers pass ClusterContext.FilerGrpcAddresses, yet each
// entry is still run through ToGrpcAddress here. Confirm the entries are in
// ServerAddress form rather than already-resolved gRPC endpoints.
func connectToFiler(ctx context.Context, addresses []string, dialOption grpc.DialOption) (filer_pb.SeaweedFilerClient, *grpc.ClientConn, error) {
	if len(addresses) == 0 {
		// Without this guard the loop below never runs and (nil, nil, nil)
		// would be returned, leaving callers to defer Close on a nil conn.
		return nil, nil, fmt.Errorf("no filer addresses provided")
	}

	var lastErr error
	for _, addr := range addresses {
		// Once the caller's context is done, further attempts cannot succeed.
		if err := ctx.Err(); err != nil {
			return nil, nil, err
		}

		grpcAddr := pb.ServerAddress(addr).ToGrpcAddress()
		connCtx, cancel := context.WithTimeout(ctx, filerConnectTimeout)
		conn, err := pb.GrpcDial(connCtx, grpcAddr, false, dialOption)
		cancel()
		if err != nil {
			lastErr = err
			glog.V(1).Infof("s3_lifecycle: failed to connect to filer %s (grpc %s): %v", addr, grpcAddr, err)
			continue
		}

		// Verify the connection with a ping.
		client := filer_pb.NewSeaweedFilerClient(conn)
		pingCtx, pingCancel := context.WithTimeout(ctx, filerConnectTimeout)
		_, pingErr := client.Ping(pingCtx, &filer_pb.PingRequest{})
		pingCancel()
		if pingErr != nil {
			// Release the unusable connection before moving on.
			_ = conn.Close()
			lastErr = pingErr
			glog.V(1).Infof("s3_lifecycle: filer %s ping failed: %v", grpcAddr, pingErr)
			continue
		}
		return client, conn, nil
	}
	return nil, nil, lastErr
}

// sendEmptyDetection reports a successful detection run that produced no
// proposals: an empty proposal batch followed by a completion message.
func sendEmptyDetection(sender pluginworker.DetectionSender) error {
	if err := sender.SendProposals(&plugin_pb.DetectionProposals{
		JobType:   jobType,
		Proposals: []*plugin_pb.JobProposal{},
		HasMore:   false,
	}); err != nil {
		return err
	}
	return sender.SendComplete(&plugin_pb.DetectionComplete{
		JobType:        jobType,
		Success:        true,
		TotalProposals: 0,
	})
}

// filerAddressesFromCluster returns the whitespace-trimmed, non-empty entries
// of cc.FilerGrpcAddresses in their original order. The result is nil when cc
// is nil or no usable entry remains.
func filerAddressesFromCluster(cc *plugin_pb.ClusterContext) []string {
	if cc == nil {
		return nil
	}
	var addrs []string
	for _, addr := range cc.FilerGrpcAddresses {
		if trimmed := strings.TrimSpace(addr); trimmed != "" {
			addrs = append(addrs, trimmed)
		}
	}
	return addrs
}

// readParamString returns the string stored under key in params. It returns
// "" when the key is missing, its value is nil, or the value is not a string.
func readParamString(params map[string]*plugin_pb.ConfigValue, key string) string {
	// Indexing a nil map is safe and yields a nil value, so no separate
	// nil-map check is needed.
	v := params[key]
	if v == nil {
		return ""
	}
	if sv, ok := v.Kind.(*plugin_pb.ConfigValue_StringValue); ok {
		return sv.StringValue
	}
	return ""
}

// configInt64 wraps v in a ConfigValue holding an int64.
func configInt64(v int64) *plugin_pb.ConfigValue {
	return &plugin_pb.ConfigValue{Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: v}}
}