Browse Source

admin/plugin: migrate inline job details asynchronously to avoid slow startup (#8721)

loadPersistedMonitorState performed a backward-compatibility migration that
wrote every job with inline rich detail fields to a dedicated per-job detail
file synchronously during startup. On deployments with many historical jobs
(e.g. 1000+) stored on distributed block storage (e.g. Longhorn), each
individual file write requires an fsync round-trip, making startup
disproportionately slow and causing readiness/liveness probe failures.

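To fix this, jobs that still carry inline details are collected during load
and their detail files are written by a background goroutine instead. The
in-memory state is still populated correctly before the goroutine is started
because stripTrackedJobDetailFields is still called in-place; only the disk
writes are deferred. Once the background migration finishes, a completion log
message at V(1) is emitted if at least one job was migrated successfully.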

Co-authored-by: Anton Ustyugov <anton@devops>
master
Anton 19 hours ago
committed by GitHub
parent
commit
90277ceed5
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 32
      weed/admin/plugin/plugin_monitor.go

32
weed/admin/plugin/plugin_monitor.go

@ -63,6 +63,12 @@ func (r *Plugin) loadPersistedMonitorState() error {
} }
if len(trackedJobs) > 0 { if len(trackedJobs) > 0 {
// Collect jobs that still carry inline rich details (old format) so we
// can migrate them to dedicated per-job detail files asynchronously.
// Writing many files synchronously during startup can be extremely slow
// on distributed storage (e.g. Longhorn), so we defer those writes.
var jobsToMigrate []TrackedJob
r.jobsMu.Lock() r.jobsMu.Lock()
for i := range trackedJobs { for i := range trackedJobs {
job := trackedJobs[i] job := trackedJobs[i]
@ -78,10 +84,9 @@ func (r *Plugin) loadPersistedMonitorState() error {
} }
// Backward compatibility: migrate older inline detail payloads // Backward compatibility: migrate older inline detail payloads
// out of tracked_jobs.json into dedicated per-job detail files. // out of tracked_jobs.json into dedicated per-job detail files.
// Collect for async migration instead of writing synchronously.
if hasTrackedJobRichDetails(job) { if hasTrackedJobRichDetails(job) {
if err := r.store.SaveJobDetail(job); err != nil {
glog.Warningf("Plugin failed to migrate detail snapshot for job %s: %v", job.JobID, err)
}
jobsToMigrate = append(jobsToMigrate, job)
} }
stripTrackedJobDetailFields(&job) stripTrackedJobDetailFields(&job)
jobCopy := job jobCopy := job
@ -89,6 +94,10 @@ func (r *Plugin) loadPersistedMonitorState() error {
} }
r.pruneTrackedJobsLocked() r.pruneTrackedJobsLocked()
r.jobsMu.Unlock() r.jobsMu.Unlock()
if len(jobsToMigrate) > 0 {
go r.migrateInlineJobDetails(jobsToMigrate)
}
} }
if len(activities) > maxActivityRecords { if len(activities) > maxActivityRecords {
@ -103,6 +112,23 @@ func (r *Plugin) loadPersistedMonitorState() error {
return nil return nil
} }
// migrateInlineJobDetails saves each of the given jobs through the store's
// SaveJobDetail so that detail payloads previously kept inline in
// tracked_jobs.json end up in dedicated per-job detail files.
//
// loadPersistedMonitorState launches it in its own goroutine so these writes
// do not block startup. A failed save is logged as a warning and does not stop
// the remaining jobs from being processed; a summary line is logged at V(1)
// only when at least one job was saved successfully.
func (r *Plugin) migrateInlineJobDetails(jobs []TrackedJob) {
	var succeeded int
	for i := range jobs {
		err := r.store.SaveJobDetail(jobs[i])
		if err != nil {
			// Best effort: report the failure and keep going with the rest.
			glog.Warningf("Plugin failed to migrate detail snapshot for job %s: %v", jobs[i].JobID, err)
			continue
		}
		succeeded++
	}
	if succeeded == 0 {
		return
	}
	glog.V(1).Infof("Plugin migrated %d inline job details to dedicated files", succeeded)
}
// ExpireJob marks an active job as failed so it no longer blocks scheduling. // ExpireJob marks an active job as failed so it no longer blocks scheduling.
func (r *Plugin) ExpireJob(jobID, reason string) (*TrackedJob, bool, error) { func (r *Plugin) ExpireJob(jobID, reason string) (*TrackedJob, bool, error) {
normalizedJobID := strings.TrimSpace(jobID) normalizedJobID := strings.TrimSpace(jobID)

Loading…
Cancel
Save