package main

import (
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"time"
)
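
// Monitor periodically probes the SeaweedFS master, admin, and filer
// endpoints and accumulates the results in MonitorStats.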
type Monitor struct {
	masterAddr string
	adminAddr  string
	filerAddr  string
	interval   time.Duration
	startTime  time.Time
	stats      MonitorStats
}
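
// MonitorStats holds cumulative counters and timestamps collected by the
// monitoring loop. The *Healthy fields count successful checks rather than
// representing a boolean state.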
type MonitorStats struct {
	TotalChecks     int64
	MasterHealthy   int64
	AdminHealthy    int64
	FilerHealthy    int64
	VolumeCount     int64
	LastVolumeCheck time.Time
	ECTasksDetected int64
	WorkersActive   int64
	LastWorkerCheck time.Time
}
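
// ClusterStatus models the JSON returned by the master's /cluster/status
// endpoint (only the fields used by this monitor).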
type ClusterStatus struct {
	IsLeader bool     `json:"IsLeader"`
	Leader   string   `json:"Leader"`
	Peers    []string `json:"Peers"`
}
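
// VolumeStatus models the portion of the master's /vol/status response that
// this monitor consumes.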
type VolumeStatus struct {
	Volumes []VolumeInfo `json:"Volumes"`
}
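
// VolumeInfo describes a single volume entry in the volume status response.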
type VolumeInfo struct {
	Id               uint32 `json:"Id"`
	Size             uint64 `json:"Size"`
	Collection       string `json:"Collection"`
	FileCount        int64  `json:"FileCount"`
	DeleteCount      int64  `json:"DeleteCount"`
	DeletedByteCount uint64 `json:"DeletedByteCount"`
	ReadOnly         bool   `json:"ReadOnly"`
	CompactRevision  uint32 `json:"CompactRevision"`
	Version          uint32 `json:"Version"`
}
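
// AdminStatus models the JSON expected from the admin server's /health
// endpoint (status, uptime, and task/worker counts).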
type AdminStatus struct {
	Status  string `json:"status"`
	Uptime  string `json:"uptime"`
	Tasks   int    `json:"tasks"`
	Workers int    `json:"workers"`
}
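
// NOTE: the checks below use http.Get, i.e. Go's default HTTP client, which
// sets no request timeout; an unresponsive endpoint can stall the monitoring
// loop until the underlying connection fails.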

// checkMasterHealth checks the master server health
func (m *Monitor) checkMasterHealth() bool {
	resp, err := http.Get(fmt.Sprintf("http://%s/cluster/status", m.masterAddr))
	if err != nil {
		log.Printf("ERROR: Cannot reach master %s: %v", m.masterAddr, err)
		return false
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("ERROR: Master returned status %d", resp.StatusCode)
		return false
	}

	var status ClusterStatus
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Printf("ERROR: Cannot read master response: %v", err)
		return false
	}

	if err := json.Unmarshal(body, &status); err != nil {
		log.Printf("WARNING: Cannot parse master status: %v", err)
		// Still consider it healthy if we got a response
		return true
	}

	log.Printf("Master status: Leader=%s, IsLeader=%t, Peers=%d",
		status.Leader, status.IsLeader, len(status.Peers))

	m.stats.MasterHealthy++
	return true
}

// checkAdminHealth checks the admin server health
func (m *Monitor) checkAdminHealth() bool {
	resp, err := http.Get(fmt.Sprintf("http://%s/health", m.adminAddr))
	if err != nil {
		log.Printf("ERROR: Cannot reach admin %s: %v", m.adminAddr, err)
		return false
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("ERROR: Admin returned status %d", resp.StatusCode)
		return false
	}

	var status AdminStatus
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Printf("ERROR: Cannot read admin response: %v", err)
		return false
	}

	if err := json.Unmarshal(body, &status); err != nil {
		log.Printf("WARNING: Cannot parse admin status: %v", err)
		return true
	}

	log.Printf("Admin status: %s, Uptime=%s, Tasks=%d, Workers=%d",
		status.Status, status.Uptime, status.Tasks, status.Workers)

	m.stats.AdminHealthy++
	m.stats.ECTasksDetected += int64(status.Tasks)
	m.stats.WorkersActive = int64(status.Workers)
	m.stats.LastWorkerCheck = time.Now()

	return true
}

// checkFilerHealth checks the filer health
func (m *Monitor) checkFilerHealth() bool {
	resp, err := http.Get(fmt.Sprintf("http://%s/", m.filerAddr))
	if err != nil {
		log.Printf("ERROR: Cannot reach filer %s: %v", m.filerAddr, err)
		return false
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("ERROR: Filer returned status %d", resp.StatusCode)
		return false
	}

	m.stats.FilerHealthy++
	return true
}

// checkVolumeStatus checks volume information from master
func (m *Monitor) checkVolumeStatus() {
	resp, err := http.Get(fmt.Sprintf("http://%s/vol/status", m.masterAddr))
	if err != nil {
		log.Printf("ERROR: Cannot get volume status: %v", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("ERROR: Volume status returned status %d", resp.StatusCode)
		return
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Printf("ERROR: Cannot read volume status: %v", err)
		return
	}

	var volumeStatus VolumeStatus
	if err := json.Unmarshal(body, &volumeStatus); err != nil {
		log.Printf("WARNING: Cannot parse volume status: %v", err)
		return
	}

	m.stats.VolumeCount = int64(len(volumeStatus.Volumes))
	m.stats.LastVolumeCheck = time.Now()

	// Analyze volumes
	var readOnlyCount, fullVolumeCount, ecCandidates int
	var totalSize, totalFiles uint64

	for _, vol := range volumeStatus.Volumes {
		totalSize += vol.Size
		totalFiles += uint64(vol.FileCount)

		if vol.ReadOnly {
			readOnlyCount++
		}

		// Volume is close to full (>40MB for 50MB limit)
		if vol.Size > 40*1024*1024 {
			fullVolumeCount++
			if !vol.ReadOnly {
				ecCandidates++
			}
		}
	}

	log.Printf("Volume analysis: Total=%d, ReadOnly=%d, Full=%d, EC_Candidates=%d",
		len(volumeStatus.Volumes), readOnlyCount, fullVolumeCount, ecCandidates)
	log.Printf("Storage stats: Total_Size=%.2fMB, Total_Files=%d",
		float64(totalSize)/(1024*1024), totalFiles)

	if ecCandidates > 0 {
		log.Printf("⚠️ DETECTED %d volumes that should be EC'd!", ecCandidates)
	}
}

// healthHandler provides a health endpoint for the monitor itself
func (m *Monitor) healthHandler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"status":     "healthy",
		"uptime":     time.Since(m.startTime).String(),
		"checks":     m.stats.TotalChecks,
		"last_check": m.stats.LastVolumeCheck.Format(time.RFC3339),
	})
}

// statusHandler provides detailed monitoring status
func (m *Monitor) statusHandler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"monitor": map[string]interface{}{
			"uptime":      time.Since(m.startTime).String(),
			"master_addr": m.masterAddr,
			"admin_addr":  m.adminAddr,
			"filer_addr":  m.filerAddr,
			"interval":    m.interval.String(),
		},
		"stats": m.stats,
		"health": map[string]interface{}{
			"master_healthy": m.stats.MasterHealthy > 0 && time.Since(m.stats.LastVolumeCheck) < 2*m.interval,
			"admin_healthy":  m.stats.AdminHealthy > 0 && time.Since(m.stats.LastWorkerCheck) < 2*m.interval,
			"filer_healthy":  m.stats.FilerHealthy > 0,
		},
	})
}

// runMonitoring runs the main monitoring loop
func (m *Monitor) runMonitoring() {
	ticker := time.NewTicker(m.interval)
	defer ticker.Stop()

	log.Printf("Starting monitoring loop every %v", m.interval)

	for {
		m.stats.TotalChecks++

		log.Printf("=== Monitoring Check #%d ===", m.stats.TotalChecks)

		// Check master health
		if m.checkMasterHealth() {
			// If master is healthy, check volumes
			m.checkVolumeStatus()
		}

		// Check admin health
		m.checkAdminHealth()

		// Check filer health
		m.checkFilerHealth()

		// Print summary
		log.Printf("Health Summary: Master=%t, Admin=%t, Filer=%t, Volumes=%d, Workers=%d",
			m.stats.MasterHealthy > 0,
			m.stats.AdminHealthy > 0,
			m.stats.FilerHealthy > 0,
			m.stats.VolumeCount,
			m.stats.WorkersActive)

		log.Printf("=== End Check #%d ===", m.stats.TotalChecks)

		<-ticker.C
	}
}
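
// main reads its configuration from environment variables, falling back to
// service-name style defaults (as used in e.g. docker-compose) when unset:
//
//	MASTER_ADDRESS   (default "master:9333")
//	ADMIN_ADDRESS    (default "admin:9900")
//	FILER_ADDRESS    (default "filer:8888")
//	MONITOR_INTERVAL (default "10s")
//
// Example invocation (assuming the compiled binary is named "monitor" and
// runs next to the cluster):
//
//	MASTER_ADDRESS=localhost:9333 MONITOR_INTERVAL=30s ./monitor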
func main() {
	masterAddr := os.Getenv("MASTER_ADDRESS")
	if masterAddr == "" {
		masterAddr = "master:9333"
	}

	adminAddr := os.Getenv("ADMIN_ADDRESS")
	if adminAddr == "" {
		adminAddr = "admin:9900"
	}

	filerAddr := os.Getenv("FILER_ADDRESS")
	if filerAddr == "" {
		filerAddr = "filer:8888"
	}

	intervalStr := os.Getenv("MONITOR_INTERVAL")
	interval, err := time.ParseDuration(intervalStr)
	if err != nil {
		interval = 10 * time.Second
	}

	monitor := &Monitor{
		masterAddr: masterAddr,
		adminAddr:  adminAddr,
		filerAddr:  filerAddr,
		interval:   interval,
		startTime:  time.Now(),
		stats:      MonitorStats{},
	}

	log.Printf("Starting SeaweedFS Cluster Monitor")
	log.Printf("Master: %s", masterAddr)
	log.Printf("Admin: %s", adminAddr)
	log.Printf("Filer: %s", filerAddr)
	log.Printf("Interval: %v", interval)

	// Setup HTTP endpoints
	http.HandleFunc("/health", monitor.healthHandler)
	http.HandleFunc("/status", monitor.statusHandler)

	// Start HTTP server in background
	go func() {
		log.Println("Monitor HTTP server starting on :9999")
		if err := http.ListenAndServe(":9999", nil); err != nil {
			log.Printf("Monitor HTTP server error: %v", err)
		}
	}()

	// Wait for services to be ready
	log.Println("Waiting for services to be ready...")
	for {
		masterOK := false
		adminOK := false
		filerOK := false

		// Close each response body whenever the request succeeds, even on a
		// non-200 status, so the readiness loop does not leak connections.
		if resp, err := http.Get(fmt.Sprintf("http://%s/cluster/status", masterAddr)); err == nil {
			masterOK = resp.StatusCode == http.StatusOK
			resp.Body.Close()
		}

		if resp, err := http.Get(fmt.Sprintf("http://%s/health", adminAddr)); err == nil {
			adminOK = resp.StatusCode == http.StatusOK
			resp.Body.Close()
		}

		if resp, err := http.Get(fmt.Sprintf("http://%s/", filerAddr)); err == nil {
			filerOK = resp.StatusCode == http.StatusOK
			resp.Body.Close()
		}

		if masterOK && adminOK && filerOK {
			log.Println("All services are ready!")
			break
		}

		log.Printf("Services ready: Master=%t, Admin=%t, Filer=%t", masterOK, adminOK, filerOK)
		time.Sleep(5 * time.Second)
	}

	// Start monitoring
	monitor.runMonitoring()
}