package metrics

import (
    "fmt"
    "io"
    "sort"
    "sync"
    "sync/atomic"
    "time"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// Collector handles metrics collection for the load test
type Collector struct {
    // Atomic counters for thread-safe operations
    messagesProduced int64
    messagesConsumed int64
    bytesProduced    int64
    bytesConsumed    int64
    producerErrors   int64
    consumerErrors   int64

    // Latency tracking
    latencies    []time.Duration
    latencyMutex sync.RWMutex

    // Consumer lag tracking
    consumerLag      map[string]int64
    consumerLagMutex sync.RWMutex

    // Test timing
    startTime time.Time

    // Prometheus metrics
    prometheusMetrics *PrometheusMetrics
}

// PrometheusMetrics holds all Prometheus metric definitions
type PrometheusMetrics struct {
    MessagesProducedTotal   prometheus.Counter
    MessagesConsumedTotal   prometheus.Counter
    BytesProducedTotal      prometheus.Counter
    BytesConsumedTotal      prometheus.Counter
    ProducerErrorsTotal     prometheus.Counter
    ConsumerErrorsTotal     prometheus.Counter
    MessageLatencyHistogram prometheus.Histogram
    ProducerThroughput      prometheus.Gauge
    ConsumerThroughput      prometheus.Gauge
    ConsumerLagGauge        *prometheus.GaugeVec
    ActiveProducers         prometheus.Gauge
    ActiveConsumers         prometheus.Gauge
}

// NewCollector creates a new metrics collector
func NewCollector() *Collector {
    return &Collector{
        startTime:   time.Now(),
        consumerLag: make(map[string]int64),
        prometheusMetrics: &PrometheusMetrics{
            MessagesProducedTotal: promauto.NewCounter(prometheus.CounterOpts{
                Name: "kafka_loadtest_messages_produced_total",
                Help: "Total number of messages produced",
            }),
            MessagesConsumedTotal: promauto.NewCounter(prometheus.CounterOpts{
                Name: "kafka_loadtest_messages_consumed_total",
                Help: "Total number of messages consumed",
            }),
            BytesProducedTotal: promauto.NewCounter(prometheus.CounterOpts{
                Name: "kafka_loadtest_bytes_produced_total",
                Help: "Total bytes produced",
            }),
            BytesConsumedTotal: promauto.NewCounter(prometheus.CounterOpts{
                Name: "kafka_loadtest_bytes_consumed_total",
                Help: "Total bytes consumed",
            }),
            ProducerErrorsTotal: promauto.NewCounter(prometheus.CounterOpts{
                Name: "kafka_loadtest_producer_errors_total",
                Help: "Total number of producer errors",
            }),
            ConsumerErrorsTotal: promauto.NewCounter(prometheus.CounterOpts{
                Name: "kafka_loadtest_consumer_errors_total",
                Help: "Total number of consumer errors",
            }),
            MessageLatencyHistogram: promauto.NewHistogram(prometheus.HistogramOpts{
                Name:    "kafka_loadtest_message_latency_seconds",
                Help:    "Message end-to-end latency in seconds",
                Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), // 1ms to ~32s
            }),
            ProducerThroughput: promauto.NewGauge(prometheus.GaugeOpts{
                Name: "kafka_loadtest_producer_throughput_msgs_per_sec",
                Help: "Current producer throughput in messages per second",
            }),
            ConsumerThroughput: promauto.NewGauge(prometheus.GaugeOpts{
                Name: "kafka_loadtest_consumer_throughput_msgs_per_sec",
                Help: "Current consumer throughput in messages per second",
            }),
            ConsumerLagGauge: promauto.NewGaugeVec(prometheus.GaugeOpts{
                Name: "kafka_loadtest_consumer_lag_messages",
                Help: "Consumer lag in messages",
            }, []string{"consumer_group", "topic", "partition"}),
            ActiveProducers: promauto.NewGauge(prometheus.GaugeOpts{
                Name: "kafka_loadtest_active_producers",
                Help: "Number of active producers",
            }),
            ActiveConsumers: promauto.NewGauge(prometheus.GaugeOpts{
                Name: "kafka_loadtest_active_consumers",
                Help: "Number of active consumers",
            }),
        },
    }
}

// RecordProducedMessage records a successfully produced message
func (c *Collector) RecordProducedMessage(size int, latency time.Duration) {
    atomic.AddInt64(&c.messagesProduced, 1)
    atomic.AddInt64(&c.bytesProduced, int64(size))

    c.prometheusMetrics.MessagesProducedTotal.Inc()
    c.prometheusMetrics.BytesProducedTotal.Add(float64(size))
    c.prometheusMetrics.MessageLatencyHistogram.Observe(latency.Seconds())

    // Store latency for percentile calculations
    c.latencyMutex.Lock()
    c.latencies = append(c.latencies, latency)
    // Keep only recent latencies to avoid memory bloat
    if len(c.latencies) > 100000 {
        c.latencies = c.latencies[50000:]
    }
    c.latencyMutex.Unlock()
}

// RecordConsumedMessage records a successfully consumed message
func (c *Collector) RecordConsumedMessage(size int) {
    atomic.AddInt64(&c.messagesConsumed, 1)
    atomic.AddInt64(&c.bytesConsumed, int64(size))

    c.prometheusMetrics.MessagesConsumedTotal.Inc()
    c.prometheusMetrics.BytesConsumedTotal.Add(float64(size))
}

// RecordProducerError records a producer error
func (c *Collector) RecordProducerError() {
    atomic.AddInt64(&c.producerErrors, 1)
    c.prometheusMetrics.ProducerErrorsTotal.Inc()
}

// RecordConsumerError records a consumer error
func (c *Collector) RecordConsumerError() {
    atomic.AddInt64(&c.consumerErrors, 1)
    c.prometheusMetrics.ConsumerErrorsTotal.Inc()
}

// UpdateConsumerLag updates consumer lag metrics
func (c *Collector) UpdateConsumerLag(consumerGroup, topic string, partition int32, lag int64) {
    key := fmt.Sprintf("%s-%s-%d", consumerGroup, topic, partition)

    c.consumerLagMutex.Lock()
    c.consumerLag[key] = lag
    c.consumerLagMutex.Unlock()

    c.prometheusMetrics.ConsumerLagGauge.WithLabelValues(
        consumerGroup, topic, fmt.Sprintf("%d", partition),
    ).Set(float64(lag))
}

// UpdateThroughput updates throughput gauges
func (c *Collector) UpdateThroughput(producerRate, consumerRate float64) {
    c.prometheusMetrics.ProducerThroughput.Set(producerRate)
    c.prometheusMetrics.ConsumerThroughput.Set(consumerRate)
}

// UpdateActiveClients updates active client counts
func (c *Collector) UpdateActiveClients(producers, consumers int) {
    c.prometheusMetrics.ActiveProducers.Set(float64(producers))
    c.prometheusMetrics.ActiveConsumers.Set(float64(consumers))
}

// GetStats returns current statistics
func (c *Collector) GetStats() Stats {
    produced := atomic.LoadInt64(&c.messagesProduced)
    consumed := atomic.LoadInt64(&c.messagesConsumed)
    bytesProduced := atomic.LoadInt64(&c.bytesProduced)
    bytesConsumed := atomic.LoadInt64(&c.bytesConsumed)
    producerErrors := atomic.LoadInt64(&c.producerErrors)
    consumerErrors := atomic.LoadInt64(&c.consumerErrors)

    duration := time.Since(c.startTime)

    // Calculate throughput
    producerThroughput := float64(produced) / duration.Seconds()
    consumerThroughput := float64(consumed) / duration.Seconds()

    // Calculate latency percentiles
    var latencyPercentiles map[float64]time.Duration
    c.latencyMutex.RLock()
    if len(c.latencies) > 0 {
        latencyPercentiles = c.calculatePercentiles(c.latencies)
    }
    c.latencyMutex.RUnlock()

    // Get consumer lag summary
    c.consumerLagMutex.RLock()
    totalLag := int64(0)
    maxLag := int64(0)
    for _, lag := range c.consumerLag {
        totalLag += lag
        if lag > maxLag {
            maxLag = lag
        }
    }
    avgLag := float64(0)
    if len(c.consumerLag) > 0 {
        avgLag = float64(totalLag) / float64(len(c.consumerLag))
    }
    c.consumerLagMutex.RUnlock()

    return Stats{
        Duration:           duration,
        MessagesProduced:   produced,
        MessagesConsumed:   consumed,
        BytesProduced:      bytesProduced,
        BytesConsumed:      bytesConsumed,
        ProducerErrors:     producerErrors,
        ConsumerErrors:     consumerErrors,
        ProducerThroughput: producerThroughput,
        ConsumerThroughput: consumerThroughput,
        LatencyPercentiles: latencyPercentiles,
        TotalConsumerLag:   totalLag,
        MaxConsumerLag:     maxLag,
        AvgConsumerLag:     avgLag,
    }
}

// PrintSummary prints a summary of the test statistics
func (c *Collector) PrintSummary() {
    stats := c.GetStats()

    fmt.Printf("\n=== Load Test Summary ===\n")
    fmt.Printf("Test Duration: %v\n", stats.Duration)
    fmt.Printf("\nMessages:\n")
    fmt.Printf(" Produced: %d (%.2f MB)\n", stats.MessagesProduced, float64(stats.BytesProduced)/1024/1024)
    fmt.Printf(" Consumed: %d (%.2f MB)\n", stats.MessagesConsumed, float64(stats.BytesConsumed)/1024/1024)
    fmt.Printf(" Producer Errors: %d\n", stats.ProducerErrors)
    fmt.Printf(" Consumer Errors: %d\n", stats.ConsumerErrors)
    fmt.Printf("\nThroughput:\n")
    fmt.Printf(" Producer: %.2f msgs/sec\n", stats.ProducerThroughput)
    fmt.Printf(" Consumer: %.2f msgs/sec\n", stats.ConsumerThroughput)

    if stats.LatencyPercentiles != nil {
        fmt.Printf("\nLatency Percentiles:\n")
        percentiles := []float64{50, 90, 95, 99, 99.9}
        for _, p := range percentiles {
            if latency, exists := stats.LatencyPercentiles[p]; exists {
                fmt.Printf(" p%.1f: %v\n", p, latency)
            }
        }
    }

    fmt.Printf("\nConsumer Lag:\n")
    fmt.Printf(" Total: %d messages\n", stats.TotalConsumerLag)
    fmt.Printf(" Max: %d messages\n", stats.MaxConsumerLag)
    fmt.Printf(" Average: %.2f messages\n", stats.AvgConsumerLag)
    fmt.Printf("=========================\n")
}

// WriteStats writes statistics to a writer (for HTTP endpoint)
func (c *Collector) WriteStats(w io.Writer) {
    stats := c.GetStats()

    fmt.Fprintf(w, "# Load Test Statistics\n")
    fmt.Fprintf(w, "duration_seconds %v\n", stats.Duration.Seconds())
    fmt.Fprintf(w, "messages_produced %d\n", stats.MessagesProduced)
    fmt.Fprintf(w, "messages_consumed %d\n", stats.MessagesConsumed)
    fmt.Fprintf(w, "bytes_produced %d\n", stats.BytesProduced)
    fmt.Fprintf(w, "bytes_consumed %d\n", stats.BytesConsumed)
    fmt.Fprintf(w, "producer_errors %d\n", stats.ProducerErrors)
    fmt.Fprintf(w, "consumer_errors %d\n", stats.ConsumerErrors)
    fmt.Fprintf(w, "producer_throughput_msgs_per_sec %f\n", stats.ProducerThroughput)
    fmt.Fprintf(w, "consumer_throughput_msgs_per_sec %f\n", stats.ConsumerThroughput)
    fmt.Fprintf(w, "total_consumer_lag %d\n", stats.TotalConsumerLag)
    fmt.Fprintf(w, "max_consumer_lag %d\n", stats.MaxConsumerLag)
    fmt.Fprintf(w, "avg_consumer_lag %f\n", stats.AvgConsumerLag)

    if stats.LatencyPercentiles != nil {
        for percentile, latency := range stats.LatencyPercentiles {
            fmt.Fprintf(w, "latency_p%g_seconds %f\n", percentile, latency.Seconds())
        }
    }
}

// calculatePercentiles calculates latency percentiles
func (c *Collector) calculatePercentiles(latencies []time.Duration) map[float64]time.Duration {
    if len(latencies) == 0 {
        return nil
    }

    // Make a copy and sort
    sorted := make([]time.Duration, len(latencies))
    copy(sorted, latencies)
    sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })

    percentiles := map[float64]time.Duration{
        50:   calculatePercentile(sorted, 50),
        90:   calculatePercentile(sorted, 90),
        95:   calculatePercentile(sorted, 95),
        99:   calculatePercentile(sorted, 99),
        99.9: calculatePercentile(sorted, 99.9),
    }

    return percentiles
}

// calculatePercentile calculates a specific percentile from sorted data
func calculatePercentile(sorted []time.Duration, percentile float64) time.Duration {
    if len(sorted) == 0 {
        return 0
    }

    index := percentile / 100.0 * float64(len(sorted)-1)
    if index == float64(int(index)) {
        return sorted[int(index)]
    }

    // Linear interpolation between the two neighboring samples
    lower := sorted[int(index)]
    upper := sorted[int(index)+1]
    weight := index - float64(int(index))

    return time.Duration(float64(lower) + weight*float64(upper-lower))
}

// Stats represents the current test statistics
type Stats struct {
    Duration           time.Duration
    MessagesProduced   int64
    MessagesConsumed   int64
    BytesProduced      int64
    BytesConsumed      int64
    ProducerErrors     int64
    ConsumerErrors     int64
    ProducerThroughput float64
    ConsumerThroughput float64
    LatencyPercentiles map[float64]time.Duration
    TotalConsumerLag   int64
    MaxConsumerLag     int64
    AvgConsumerLag     float64
}
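
// Usage sketch (illustrative only, not part of this package's API): a
// load-test driver would typically create one Collector, share it across
// producer and consumer goroutines, and print the summary when the run
// finishes. The sendMessage helper, payload, and msg variables below are
// hypothetical placeholders, not symbols defined elsewhere in this repo.
//
//	collector := metrics.NewCollector()
//
//	// Producer goroutine: record each successful send, or the error.
//	start := time.Now()
//	if err := sendMessage(payload); err != nil {
//	    collector.RecordProducerError()
//	} else {
//	    collector.RecordProducedMessage(len(payload), time.Since(start))
//	}
//
//	// Consumer goroutine: record each message as it is read.
//	collector.RecordConsumedMessage(len(msg.Value))
//
//	// At the end of the run, print the aggregate statistics.
//	collector.PrintSummary()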
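
// Exposition sketch (illustrative only): because the metrics are registered
// through promauto against the default Prometheus registry, promhttp.Handler
// can expose them for scraping, while WriteStats can back a plain-text
// endpoint. The ":8080" address and the "/stats" path are assumptions made
// for this example, not values defined by this package.
//
//	import (
//	    "net/http"
//
//	    "github.com/prometheus/client_golang/prometheus/promhttp"
//	)
//
//	http.Handle("/metrics", promhttp.Handler())
//	http.HandleFunc("/stats", func(w http.ResponseWriter, r *http.Request) {
//	    collector.WriteStats(w)
//	})
//	go func() {
//	    _ = http.ListenAndServe(":8080", nil) // error handling omitted in this sketch
//	}()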