package metrics

import (
	"fmt"
	"io"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Collector handles metrics collection for the load test
type Collector struct {
	// Atomic counters for thread-safe operations
	messagesProduced int64
	messagesConsumed int64
	bytesProduced    int64
	bytesConsumed    int64
	producerErrors   int64
	consumerErrors   int64

	// Latency tracking
	latencies    []time.Duration
	latencyMutex sync.RWMutex

	// Consumer lag tracking
	consumerLag      map[string]int64
	consumerLagMutex sync.RWMutex

	// Test timing
	startTime time.Time

	// Prometheus metrics
	prometheusMetrics *PrometheusMetrics
}
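
// Concurrency note: the hot-path counters above are updated with
// sync/atomic operations, while the latency slice and the lag map are
// guarded by their respective RWMutexes, so all Collector methods are
// safe to call concurrently from producer and consumer goroutines.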

// PrometheusMetrics holds all Prometheus metric definitions
type PrometheusMetrics struct {
	MessagesProducedTotal prometheus.Counter
	MessagesConsumedTotal prometheus.Counter
	BytesProducedTotal    prometheus.Counter
	BytesConsumedTotal    prometheus.Counter
	ProducerErrorsTotal   prometheus.Counter
	ConsumerErrorsTotal   prometheus.Counter

	MessageLatencyHistogram prometheus.Histogram
	ProducerThroughput      prometheus.Gauge
	ConsumerThroughput      prometheus.Gauge
	ConsumerLagGauge        *prometheus.GaugeVec

	ActiveProducers prometheus.Gauge
	ActiveConsumers prometheus.Gauge
}

// NewCollector creates a new metrics collector
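//
// Note: the metrics below are registered via promauto on the default
// Prometheus registry, which panics on duplicate registration, so this
// should be called at most once per process.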
func NewCollector() *Collector {
	return &Collector{
		startTime:   time.Now(),
		consumerLag: make(map[string]int64),
		prometheusMetrics: &PrometheusMetrics{
			MessagesProducedTotal: promauto.NewCounter(prometheus.CounterOpts{
				Name: "kafka_loadtest_messages_produced_total",
				Help: "Total number of messages produced",
			}),
			MessagesConsumedTotal: promauto.NewCounter(prometheus.CounterOpts{
				Name: "kafka_loadtest_messages_consumed_total",
				Help: "Total number of messages consumed",
			}),
			BytesProducedTotal: promauto.NewCounter(prometheus.CounterOpts{
				Name: "kafka_loadtest_bytes_produced_total",
				Help: "Total bytes produced",
			}),
			BytesConsumedTotal: promauto.NewCounter(prometheus.CounterOpts{
				Name: "kafka_loadtest_bytes_consumed_total",
				Help: "Total bytes consumed",
			}),
			ProducerErrorsTotal: promauto.NewCounter(prometheus.CounterOpts{
				Name: "kafka_loadtest_producer_errors_total",
				Help: "Total number of producer errors",
			}),
			ConsumerErrorsTotal: promauto.NewCounter(prometheus.CounterOpts{
				Name: "kafka_loadtest_consumer_errors_total",
				Help: "Total number of consumer errors",
			}),
			MessageLatencyHistogram: promauto.NewHistogram(prometheus.HistogramOpts{
				Name:    "kafka_loadtest_message_latency_seconds",
				Help:    "Message end-to-end latency in seconds",
				Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), // 1ms to ~16s
			}),
			ProducerThroughput: promauto.NewGauge(prometheus.GaugeOpts{
				Name: "kafka_loadtest_producer_throughput_msgs_per_sec",
				Help: "Current producer throughput in messages per second",
			}),
			ConsumerThroughput: promauto.NewGauge(prometheus.GaugeOpts{
				Name: "kafka_loadtest_consumer_throughput_msgs_per_sec",
				Help: "Current consumer throughput in messages per second",
			}),
			ConsumerLagGauge: promauto.NewGaugeVec(prometheus.GaugeOpts{
				Name: "kafka_loadtest_consumer_lag_messages",
				Help: "Consumer lag in messages",
			}, []string{"consumer_group", "topic", "partition"}),
			ActiveProducers: promauto.NewGauge(prometheus.GaugeOpts{
				Name: "kafka_loadtest_active_producers",
				Help: "Number of active producers",
			}),
			ActiveConsumers: promauto.NewGauge(prometheus.GaugeOpts{
				Name: "kafka_loadtest_active_consumers",
				Help: "Number of active consumers",
			}),
		},
	}
}
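
// Minimal usage sketch (hypothetical; promhttp, the route, and the port
// are illustrative and not used elsewhere in this package):
//
//	c := metrics.NewCollector()
//	c.RecordProducedMessage(512, 3*time.Millisecond)
//	http.Handle("/metrics", promhttp.Handler())
//	log.Fatal(http.ListenAndServe(":2112", nil))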

// RecordProducedMessage records a successfully produced message
func (c *Collector) RecordProducedMessage(size int, latency time.Duration) {
	atomic.AddInt64(&c.messagesProduced, 1)
	atomic.AddInt64(&c.bytesProduced, int64(size))

	c.prometheusMetrics.MessagesProducedTotal.Inc()
	c.prometheusMetrics.BytesProducedTotal.Add(float64(size))
	c.prometheusMetrics.MessageLatencyHistogram.Observe(latency.Seconds())

	// Store latency for percentile calculations
	c.latencyMutex.Lock()
	c.latencies = append(c.latencies, latency)
	// Keep only recent latencies to avoid memory bloat. Copy into a fresh
	// slice so the old backing array can actually be garbage collected.
	if len(c.latencies) > 100000 {
		recent := make([]time.Duration, 50000)
		copy(recent, c.latencies[len(c.latencies)-50000:])
		c.latencies = recent
	}
	c.latencyMutex.Unlock()
}

// RecordConsumedMessage records a successfully consumed message
func (c *Collector) RecordConsumedMessage(size int) {
	atomic.AddInt64(&c.messagesConsumed, 1)
	atomic.AddInt64(&c.bytesConsumed, int64(size))

	c.prometheusMetrics.MessagesConsumedTotal.Inc()
	c.prometheusMetrics.BytesConsumedTotal.Add(float64(size))
}

// RecordProducerError records a producer error
func (c *Collector) RecordProducerError() {
	atomic.AddInt64(&c.producerErrors, 1)
	c.prometheusMetrics.ProducerErrorsTotal.Inc()
}

// RecordConsumerError records a consumer error
func (c *Collector) RecordConsumerError() {
	atomic.AddInt64(&c.consumerErrors, 1)
	c.prometheusMetrics.ConsumerErrorsTotal.Inc()
}

// UpdateConsumerLag updates consumer lag metrics
func (c *Collector) UpdateConsumerLag(consumerGroup, topic string, partition int32, lag int64) {
	key := fmt.Sprintf("%s-%s-%d", consumerGroup, topic, partition)

	c.consumerLagMutex.Lock()
	c.consumerLag[key] = lag
	c.consumerLagMutex.Unlock()

	c.prometheusMetrics.ConsumerLagGauge.WithLabelValues(
		consumerGroup, topic, fmt.Sprintf("%d", partition),
	).Set(float64(lag))
}
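
// The lag value is supplied by the caller; for Kafka it is typically the
// partition's log-end offset minus the consumer group's committed offset.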

// UpdateThroughput updates throughput gauges
func (c *Collector) UpdateThroughput(producerRate, consumerRate float64) {
	c.prometheusMetrics.ProducerThroughput.Set(producerRate)
	c.prometheusMetrics.ConsumerThroughput.Set(consumerRate)
}

// UpdateActiveClients updates active client counts
func (c *Collector) UpdateActiveClients(producers, consumers int) {
	c.prometheusMetrics.ActiveProducers.Set(float64(producers))
	c.prometheusMetrics.ActiveConsumers.Set(float64(consumers))
}
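
// These gauges are presumably refreshed by a periodic reporting loop in the
// load test driver; this package only stores the latest values, whereas
// GetStats below derives cumulative averages over the whole test run.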

// GetStats returns current statistics
func (c *Collector) GetStats() Stats {
	produced := atomic.LoadInt64(&c.messagesProduced)
	consumed := atomic.LoadInt64(&c.messagesConsumed)
	bytesProduced := atomic.LoadInt64(&c.bytesProduced)
	bytesConsumed := atomic.LoadInt64(&c.bytesConsumed)
	producerErrors := atomic.LoadInt64(&c.producerErrors)
	consumerErrors := atomic.LoadInt64(&c.consumerErrors)

	duration := time.Since(c.startTime)

	// Calculate average throughput over the whole test duration
	producerThroughput := float64(produced) / duration.Seconds()
	consumerThroughput := float64(consumed) / duration.Seconds()

	// Calculate latency percentiles
	var latencyPercentiles map[float64]time.Duration
	c.latencyMutex.RLock()
	if len(c.latencies) > 0 {
		latencyPercentiles = c.calculatePercentiles(c.latencies)
	}
	c.latencyMutex.RUnlock()

	// Get consumer lag summary
	c.consumerLagMutex.RLock()
	totalLag := int64(0)
	maxLag := int64(0)
	for _, lag := range c.consumerLag {
		totalLag += lag
		if lag > maxLag {
			maxLag = lag
		}
	}
	avgLag := float64(0)
	if len(c.consumerLag) > 0 {
		avgLag = float64(totalLag) / float64(len(c.consumerLag))
	}
	c.consumerLagMutex.RUnlock()

	return Stats{
		Duration:           duration,
		MessagesProduced:   produced,
		MessagesConsumed:   consumed,
		BytesProduced:      bytesProduced,
		BytesConsumed:      bytesConsumed,
		ProducerErrors:     producerErrors,
		ConsumerErrors:     consumerErrors,
		ProducerThroughput: producerThroughput,
		ConsumerThroughput: consumerThroughput,
		LatencyPercentiles: latencyPercentiles,
		TotalConsumerLag:   totalLag,
		MaxConsumerLag:     maxLag,
		AvgConsumerLag:     avgLag,
	}
}

// PrintSummary prints a summary of the test statistics
func (c *Collector) PrintSummary() {
	stats := c.GetStats()

	fmt.Printf("\n=== Load Test Summary ===\n")
	fmt.Printf("Test Duration: %v\n", stats.Duration)
	fmt.Printf("\nMessages:\n")
	fmt.Printf("  Produced: %d (%.2f MB)\n", stats.MessagesProduced, float64(stats.BytesProduced)/1024/1024)
	fmt.Printf("  Consumed: %d (%.2f MB)\n", stats.MessagesConsumed, float64(stats.BytesConsumed)/1024/1024)
	fmt.Printf("  Producer Errors: %d\n", stats.ProducerErrors)
	fmt.Printf("  Consumer Errors: %d\n", stats.ConsumerErrors)

	fmt.Printf("\nThroughput:\n")
	fmt.Printf("  Producer: %.2f msgs/sec\n", stats.ProducerThroughput)
	fmt.Printf("  Consumer: %.2f msgs/sec\n", stats.ConsumerThroughput)

	if stats.LatencyPercentiles != nil {
		fmt.Printf("\nLatency Percentiles:\n")
		percentiles := []float64{50, 90, 95, 99, 99.9}
		for _, p := range percentiles {
			if latency, exists := stats.LatencyPercentiles[p]; exists {
				fmt.Printf("  p%.1f: %v\n", p, latency)
			}
		}
	}

	fmt.Printf("\nConsumer Lag:\n")
	fmt.Printf("  Total: %d messages\n", stats.TotalConsumerLag)
	fmt.Printf("  Max: %d messages\n", stats.MaxConsumerLag)
	fmt.Printf("  Average: %.2f messages\n", stats.AvgConsumerLag)
	fmt.Printf("=========================\n")
}

// WriteStats writes statistics to a writer (for HTTP endpoint)
func (c *Collector) WriteStats(w io.Writer) {
	stats := c.GetStats()

	fmt.Fprintf(w, "# Load Test Statistics\n")
	fmt.Fprintf(w, "duration_seconds %v\n", stats.Duration.Seconds())
	fmt.Fprintf(w, "messages_produced %d\n", stats.MessagesProduced)
	fmt.Fprintf(w, "messages_consumed %d\n", stats.MessagesConsumed)
	fmt.Fprintf(w, "bytes_produced %d\n", stats.BytesProduced)
	fmt.Fprintf(w, "bytes_consumed %d\n", stats.BytesConsumed)
	fmt.Fprintf(w, "producer_errors %d\n", stats.ProducerErrors)
	fmt.Fprintf(w, "consumer_errors %d\n", stats.ConsumerErrors)
	fmt.Fprintf(w, "producer_throughput_msgs_per_sec %f\n", stats.ProducerThroughput)
	fmt.Fprintf(w, "consumer_throughput_msgs_per_sec %f\n", stats.ConsumerThroughput)
	fmt.Fprintf(w, "total_consumer_lag %d\n", stats.TotalConsumerLag)
	fmt.Fprintf(w, "max_consumer_lag %d\n", stats.MaxConsumerLag)
	fmt.Fprintf(w, "avg_consumer_lag %f\n", stats.AvgConsumerLag)

	if stats.LatencyPercentiles != nil {
		for percentile, latency := range stats.LatencyPercentiles {
			fmt.Fprintf(w, "latency_p%g_seconds %f\n", percentile, latency.Seconds())
		}
	}
}
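
// A minimal sketch of exposing this dump over HTTP (the route and server
// wiring are illustrative, not part of this package):
//
//	http.HandleFunc("/stats", func(w http.ResponseWriter, r *http.Request) {
//		collector.WriteStats(w)
//	})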

// calculatePercentiles calculates latency percentiles
func (c *Collector) calculatePercentiles(latencies []time.Duration) map[float64]time.Duration {
	if len(latencies) == 0 {
		return nil
	}

	// Make a copy and sort
	sorted := make([]time.Duration, len(latencies))
	copy(sorted, latencies)
	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i] < sorted[j]
	})

	percentiles := map[float64]time.Duration{
		50:   calculatePercentile(sorted, 50),
		90:   calculatePercentile(sorted, 90),
		95:   calculatePercentile(sorted, 95),
		99:   calculatePercentile(sorted, 99),
		99.9: calculatePercentile(sorted, 99.9),
	}

	return percentiles
}

// calculatePercentile calculates a specific percentile from sorted data
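// using linear interpolation between the two nearest ranks. For example,
// with sorted = [10ms, 20ms, 30ms, 40ms] and percentile = 90, the rank is
// index = 0.9 * 3 = 2.7, so the result interpolates between sorted[2]
// (30ms) and sorted[3] (40ms) with weight 0.7, giving 37ms.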
func calculatePercentile(sorted []time.Duration, percentile float64) time.Duration {
	if len(sorted) == 0 {
		return 0
	}

	index := percentile / 100.0 * float64(len(sorted)-1)
	if index == float64(int(index)) {
		return sorted[int(index)]
	}

	lower := sorted[int(index)]
	upper := sorted[int(index)+1]
	weight := index - float64(int(index))

	return time.Duration(float64(lower) + weight*float64(upper-lower))
}

// Stats represents the current test statistics
type Stats struct {
	Duration           time.Duration
	MessagesProduced   int64
	MessagesConsumed   int64
	BytesProduced      int64
	BytesConsumed      int64
	ProducerErrors     int64
	ConsumerErrors     int64
	ProducerThroughput float64
	ConsumerThroughput float64
	LatencyPercentiles map[float64]time.Duration
	TotalConsumerLag   int64
	MaxConsumerLag     int64
	AvgConsumerLag     float64
}