
init version

Branch: adding-message-queue-integration-tests
Author: chrislu, 6 months ago
Parent commit: 0347212b64
  1. test/mq/Dockerfile.test (+37)
  2. test/mq/Makefile (+135)
  3. test/mq/README.md (+370)
  4. test/mq/docker-compose.test.yml (+333)
  5. test/mq/integration/basic_pubsub_test.go (+334)
  6. test/mq/integration/framework.go (+355)
  7. test/mq/integration_test_design.md (+286)
  8. test/mq/prometheus.yml (+54)

37
test/mq/Dockerfile.test

@@ -0,0 +1,37 @@
FROM golang:1.21-alpine
# Install necessary tools
RUN apk add --no-cache \
    curl \
    netcat-openbsd \
    bash \
    git \
    build-base
# Set working directory
WORKDIR /app
# Copy go mod files first for better caching
COPY go.mod go.sum ./
RUN go mod download
# Copy the entire source code
COPY . .
# Install test tooling; testify is a library (not a binary), so it is pulled
# in via go.mod/go mod download above and must not be `go install`ed
RUN go install github.com/onsi/ginkgo/v2/ginkgo@latest
# Build the weed binary for testing
RUN go build -o weed weed/weed.go
# Create test results directory
RUN mkdir -p /test-results
# Set up environment
ENV CGO_ENABLED=1
ENV GOOS=linux
ENV GO111MODULE=on
# Entry point for running tests
ENTRYPOINT ["/bin/bash"]

135
test/mq/Makefile

@@ -0,0 +1,135 @@
.PHONY: help up down clean logs test test-basic test-performance test-failover test-agent test-dev smoke-test benchmark health report load-test monitoring

# Default target
help:
	@echo "SeaweedMQ Integration Test Suite"
	@echo ""
	@echo "Available targets:"
	@echo "  test              - Run all integration tests"
	@echo "  test-basic        - Run basic pub/sub tests"
	@echo "  test-performance  - Run performance tests"
	@echo "  test-failover     - Run failover tests"
	@echo "  test-agent        - Run agent tests"
	@echo "  up                - Start test environment"
	@echo "  down              - Stop test environment"
	@echo "  clean             - Clean up test environment and results"
	@echo "  logs              - Show container logs"

# Start the test environment
up:
	@echo "Starting SeaweedMQ test environment..."
	docker-compose -f docker-compose.test.yml up -d master0 master1 master2
	@echo "Waiting for masters to be ready..."
	sleep 10
	docker-compose -f docker-compose.test.yml up -d volume1 volume2 volume3
	@echo "Waiting for volumes to be ready..."
	sleep 10
	docker-compose -f docker-compose.test.yml up -d filer1 filer2
	@echo "Waiting for filers to be ready..."
	sleep 15
	docker-compose -f docker-compose.test.yml up -d broker1 broker2 broker3
	@echo "Waiting for brokers to be ready..."
	sleep 20
	@echo "Test environment is ready!"

# Stop the test environment
down:
	@echo "Stopping SeaweedMQ test environment..."
	docker-compose -f docker-compose.test.yml down

# Clean up everything
clean:
	@echo "Cleaning up test environment..."
	docker-compose -f docker-compose.test.yml down -v
	docker system prune -f
	sudo rm -rf /tmp/test-results/*

# Show container logs
logs:
	docker-compose -f docker-compose.test.yml logs -f

# Run all integration tests
test: up
	@echo "Running all integration tests..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=30m ./test/mq/integration/... -args -test.parallel=4"

# Run basic pub/sub tests
test-basic: up
	@echo "Running basic pub/sub tests..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=10m ./test/mq/integration/ -run TestBasic"

# Run performance tests
test-performance: up
	@echo "Running performance tests..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=20m ./test/mq/integration/ -run TestPerformance"

# Run failover tests
test-failover: up
	@echo "Running failover tests..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=15m ./test/mq/integration/ -run TestFailover"

# Run agent tests
test-agent: up
	@echo "Running agent tests..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=10m ./test/mq/integration/ -run TestAgent"

# Development targets
test-dev:
	@echo "Running tests in development mode (using local binaries)..."
	SEAWEED_MASTERS="localhost:19333,localhost:19334,localhost:19335" \
	SEAWEED_BROKERS="localhost:17777,localhost:17778,localhost:17779" \
	SEAWEED_FILERS="localhost:18888,localhost:18889" \
	go test -v -timeout=10m ./test/mq/integration/...

# Quick smoke test
smoke-test: up
	@echo "Running smoke test..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=5m ./test/mq/integration/ -run TestBasicPublishSubscribe"

# Performance benchmarks
benchmark: up
	@echo "Running performance benchmarks..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=30m -bench=. ./test/mq/integration/..."

# Check test environment health
health:
	@echo "Checking test environment health..."
	@echo "Masters:"
	@curl -s http://localhost:19333/cluster/status || echo "Master 0 not accessible"
	@curl -s http://localhost:19334/cluster/status || echo "Master 1 not accessible"
	@curl -s http://localhost:19335/cluster/status || echo "Master 2 not accessible"
	@echo ""
	@echo "Filers:"
	@curl -s http://localhost:18888/ || echo "Filer 1 not accessible"
	@curl -s http://localhost:18889/ || echo "Filer 2 not accessible"
	@echo ""
	@echo "Brokers:"
	@nc -z localhost 17777 && echo "Broker 1 accessible" || echo "Broker 1 not accessible"
	@nc -z localhost 17778 && echo "Broker 2 accessible" || echo "Broker 2 not accessible"
	@nc -z localhost 17779 && echo "Broker 3 accessible" || echo "Broker 3 not accessible"

# Generate test reports
report:
	@echo "Generating test reports..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=30m ./test/mq/integration/... -json > /test-results/test-report.json"

# Load testing
load-test: up
	@echo "Running load tests..."
	docker-compose -f docker-compose.test.yml run --rm test-runner \
		sh -c "go test -v -timeout=45m ./test/mq/integration/ -run TestLoad"

# View monitoring dashboards
monitoring:
	@echo "Starting monitoring stack..."
	docker-compose -f docker-compose.test.yml up -d prometheus grafana
	@echo "Prometheus: http://localhost:19090"
	@echo "Grafana:    http://localhost:13000 (admin/admin)"

370
test/mq/README.md

@@ -0,0 +1,370 @@
# SeaweedMQ Integration Test Suite
This directory contains a comprehensive integration test suite for SeaweedMQ, designed to validate all critical functionalities from basic pub/sub operations to advanced features like auto-scaling, failover, and performance testing.
## Overview
The integration test suite provides:
- **Automated Environment Setup**: Docker Compose based test clusters
- **Comprehensive Test Coverage**: Basic pub/sub, scaling, failover, performance
- **Monitoring & Metrics**: Prometheus and Grafana integration
- **CI/CD Ready**: Configurable for continuous integration pipelines
- **Load Testing**: Performance benchmarks and stress tests
## Architecture
The test environment consists of:
```
┌─────────────────┐   ┌─────────────────┐   ┌─────────────────┐
│  Master Cluster │   │ Volume Servers  │   │  Filer Cluster  │
│    (3 nodes)    │   │    (3 nodes)    │   │    (2 nodes)    │
└────────┬────────┘   └────────┬────────┘   └────────┬────────┘
         │                     │                     │
         └─────────────────────┼─────────────────────┘
                               │
┌─────────────────┐   ┌─────────────────┐   ┌─────────────────┐
│  Broker Cluster │   │ Test Framework  │   │   Monitoring    │
│    (3 nodes)    │   │   (Go Tests)    │   │  (Prometheus +  │
└─────────────────┘   └─────────────────┘   │    Grafana)     │
                                            └─────────────────┘
```
## Quick Start
### Prerequisites
- Docker and Docker Compose
- Go 1.21+
- Make
- 8GB+ RAM recommended
- 20GB+ disk space for test data
### Basic Usage
1. **Start Test Environment**:
```bash
cd test/mq
make up
```
2. **Run All Tests**:
```bash
make test
```
3. **Run Specific Test Categories**:
```bash
make test-basic # Basic pub/sub tests
make test-performance # Performance tests
make test-failover # Failover tests
make test-agent # Agent tests
```
4. **Quick Smoke Test**:
```bash
make smoke-test
```
5. **Clean Up**:
```bash
make down
```
## Test Categories
### 1. Basic Functionality Tests
**File**: `integration/basic_pubsub_test.go`
- **TestBasicPublishSubscribe**: Basic message publishing and consumption
- **TestMultipleConsumers**: Load balancing across multiple consumers
- **TestMessageOrdering**: FIFO ordering within partitions
- **TestSchemaValidation**: Schema validation and complex nested structures
### 2. Partitioning and Scaling Tests
**File**: `integration/scaling_test.go` (to be implemented)
- **TestPartitionDistribution**: Message distribution across partitions
- **TestAutoSplitMerge**: Automatic partition split/merge based on load
- **TestBrokerScaling**: Adding/removing brokers during operation
- **TestLoadBalancing**: Even load distribution verification
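Until `scaling_test.go` is implemented, the sketch below shows one way `TestPartitionDistribution` could drive the framework API from `integration/framework.go`. The key-spread strategy and message count are illustrative, not the final test:

```go
func TestPartitionDistribution(t *testing.T) {
	suite := NewIntegrationTestSuite(t)
	require.NoError(t, suite.Setup())

	publisher, err := suite.CreatePublisher(&PublisherTestConfig{
		Namespace:      "test",
		TopicName:      "partition-distribution",
		PartitionCount: 3,
		PublisherName:  "distribution-publisher",
		RecordType:     CreateTestSchema(),
	})
	require.NoError(t, err)

	// Many distinct keys, so the key-based partitioner spreads the
	// messages across all three partitions.
	const messageCount = 300
	for i := 0; i < messageCount; i++ {
		record := schema.RecordBegin().
			SetString("id", fmt.Sprintf("dist-%d", i)).
			SetInt64("timestamp", time.Now().UnixNano()).
			SetString("content", "distribution test").
			SetInt32("sequence", int32(i)).
			RecordEnd()
		require.NoError(t, publisher.PublishRecord([]byte(fmt.Sprintf("key-%d", i)), record))
	}

	// The full test would attach one subscriber per partition and assert
	// that each partition received a non-trivial share of messageCount.
}
```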
### 3. Failover and Reliability Tests
**File**: `integration/failover_test.go` (to be implemented)
- **TestBrokerFailover**: Leader failover scenarios
- **TestBrokerRecovery**: Recovery from broker failures
- **TestMessagePersistence**: Data durability across restarts
- **TestFollowerReplication**: Leader-follower consistency
### 4. Performance Tests
**File**: `integration/performance_test.go` (to be implemented)
- **TestHighThroughputPublish**: High-volume message publishing
- **TestHighThroughputSubscribe**: High-volume message consumption
- **TestLatencyMeasurement**: End-to-end latency analysis
- **TestResourceUtilization**: CPU, memory, and disk usage
### 5. Agent Tests
**File**: `integration/agent_test.go` (to be implemented)
- **TestAgentPublishSessions**: Session management for publishers
- **TestAgentSubscribeSessions**: Session management for subscribers
- **TestAgentFailover**: Agent reconnection and failover
- **TestAgentConcurrency**: Concurrent session handling
## Configuration
### Environment Variables
The test framework supports configuration via environment variables:
```bash
# Cluster endpoints
SEAWEED_MASTERS="master0:9333,master1:9334,master2:9335"
SEAWEED_BROKERS="broker1:17777,broker2:17778,broker3:17779"
SEAWEED_FILERS="filer1:8888,filer2:8889"
# Test configuration
GO_TEST_TIMEOUT="30m"
TEST_RESULTS_DIR="/test-results"
```
### Docker Compose Override
Create `docker-compose.override.yml` to customize the test environment:
```yaml
version: '3.9'
services:
  broker1:
    environment:
      - CUSTOM_ENV_VAR=value
  test-runner:
    volumes:
      - ./custom-config:/config
```
## Monitoring and Metrics
### Prometheus Metrics
Access Prometheus at: http://localhost:19090
Key metrics to monitor:
- Message throughput: `seaweedmq_messages_published_total`
- Consumer lag: `seaweedmq_consumer_lag_seconds`
- Broker health: `seaweedmq_broker_health`
- Resource usage: `seaweedfs_disk_usage_bytes`
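These metrics can also be pulled programmatically through Prometheus' standard HTTP query API; a minimal sketch (which of the metric names above are actually exported depends on the running build):

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

func main() {
	// Query the Prometheus started by `make monitoring` (host port 19090).
	q := url.QueryEscape("seaweedmq_messages_published_total")
	resp, err := http.Get("http://localhost:19090/api/v1/query?query=" + q)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var result struct {
		Status string `json:"status"`
		Data   struct {
			Result []struct {
				Metric map[string]string `json:"metric"`
				Value  []interface{}     `json:"value"` // [timestamp, "value"]
			} `json:"result"`
		} `json:"data"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		panic(err)
	}
	for _, r := range result.Data.Result {
		fmt.Printf("%v => %v\n", r.Metric, r.Value)
	}
}
```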
### Grafana Dashboards
Access Grafana at: http://localhost:13000 (admin/admin)
Pre-configured dashboards:
- **SeaweedMQ Overview**: System health and throughput
- **Performance Metrics**: Latency and resource usage
- **Error Analysis**: Error rates and failure patterns
## Development
### Writing New Tests
1. **Create Test File**:
```bash
touch integration/my_new_test.go
```
2. **Use Test Framework**:
```go
func TestMyFeature(t *testing.T) {
	suite := NewIntegrationTestSuite(t)
	require.NoError(t, suite.Setup())

	// Your test logic here
}
```
3. **Run Specific Test**:
```bash
go test -v ./integration/ -run TestMyFeature
```
### Test Framework Components
**IntegrationTestSuite**: Base test framework with cluster management
**MessageCollector**: Utility for collecting and verifying received messages
**TestMessage**: Standard message structure for testing
**Schema Builders**: Helpers for creating test schemas
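A typical wiring of these components inside a test (fragment; this mirrors `integration/basic_pubsub_test.go`):

```go
// Wire a MessageCollector into a subscriber created by the suite.
collector := NewMessageCollector(10) // expect 10 messages
subscriber.SetOnDataMessageFn(func(m *mq_pb.SubscribeMessageResponse_Data) {
	collector.AddMessage(TestMessage{
		Content:   m.Data.Value,
		Key:       m.Data.Key,
		Timestamp: time.Unix(0, m.Data.TsNs),
	})
})
go func() { _ = subscriber.Subscribe() }()

// Block until all 10 messages arrive, or return whatever came in after 30s.
messages := collector.WaitForMessages(30 * time.Second)
```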
### Local Development
Run tests against a local SeaweedMQ cluster:
```bash
make test-dev
```
This uses local binaries instead of Docker containers.
## Continuous Integration
### GitHub Actions Example
```yaml
name: Integration Tests

on: [push, pull_request]

jobs:
  integration-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-go@v3
        with:
          go-version: '1.21'
      - name: Run Integration Tests
        run: |
          cd test/mq
          make test
```
### Jenkins Pipeline
```groovy
pipeline {
    agent any
    stages {
        stage('Setup') {
            steps {
                sh 'cd test/mq && make up'
            }
        }
        stage('Test') {
            steps {
                sh 'cd test/mq && make test'
            }
            post {
                always {
                    sh 'cd test/mq && make down'
                }
            }
        }
    }
}
```
## Troubleshooting
### Common Issues
1. **Port Conflicts**:
```bash
# Check port usage
netstat -tulpn | grep :19333
# Kill conflicting processes
sudo kill -9 $(lsof -t -i:19333)
```
2. **Docker Resource Issues**:
```bash
# Increase Docker memory (8GB+)
# Clean up Docker resources
docker system prune -a
```
3. **Test Timeouts**:
```bash
# Increase timeout
GO_TEST_TIMEOUT=60m make test
```
### Debug Mode
Run tests with verbose logging:
```bash
docker-compose -f docker-compose.test.yml run --rm test-runner \
sh -c "go test -v -race ./test/mq/integration/... -args -test.v"
```
### Container Logs
View real-time logs:
```bash
make logs
# Or specific service
docker-compose -f docker-compose.test.yml logs -f broker1
```
## Performance Benchmarks
### Throughput Benchmarks
```bash
make benchmark
```
Expected performance (on 8-core, 16GB RAM):
- **Publish Throughput**: 50K+ messages/second/broker
- **Subscribe Throughput**: 100K+ messages/second/broker
- **End-to-End Latency**: P95 < 100ms
- **Storage Efficiency**: < 20% overhead
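Custom benchmarks follow the standard `testing.B` pattern. Below is a sketch of a publish-throughput benchmark consistent with the framework's publisher configuration in `integration/framework.go`; the broker address, topic, and partition count are illustrative:

```go
func BenchmarkPublishThroughput(b *testing.B) {
	publisher, err := pub_client.NewTopicPublisher(&pub_client.PublisherConfiguration{
		Topic:          topic.NewTopic("test", "bench-publish"),
		PartitionCount: 3,
		Brokers:        []string{"localhost:17777"},
		PublisherName:  "bench-publisher",
		RecordType:     CreateTestSchema(),
	})
	if err != nil {
		b.Fatal(err)
	}
	defer publisher.Shutdown()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		record := schema.RecordBegin().
			SetString("id", fmt.Sprintf("bench-%d", i)).
			SetInt64("timestamp", time.Now().UnixNano()).
			SetString("content", "benchmark payload").
			SetInt32("sequence", int32(i)).
			RecordEnd()
		if err := publisher.PublishRecord([]byte(fmt.Sprintf("key-%d", i)), record); err != nil {
			b.Fatal(err)
		}
	}
	// msgs/sec is reported alongside the usual ns/op in the benchmark output.
	b.ReportMetric(float64(b.N)/b.Elapsed().Seconds(), "msgs/sec")
}
```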
### Load Testing
```bash
make load-test
```
Stress tests with:
- 1M+ messages
- 100+ concurrent producers
- 50+ concurrent consumers
- Multiple topic scenarios
## Contributing
### Test Guidelines
1. **Test Isolation**: Each test should be independent
2. **Resource Cleanup**: Always clean up resources in test teardown
3. **Timeouts**: Set appropriate timeouts for operations
4. **Error Handling**: Test both success and failure scenarios
5. **Documentation**: Document test purpose and expected behavior
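Guidelines 1–3 in practice, as a minimal sketch (topic and publisher names are illustrative):

```go
func TestIsolatedFeature(t *testing.T) {
	suite := NewIntegrationTestSuite(t)
	require.NoError(t, suite.Setup()) // Setup registers suite.Cleanup via t.Cleanup

	publisher, err := suite.CreatePublisher(&PublisherTestConfig{
		Namespace:      "test",
		TopicName:      "isolated-feature", // per-test topic keeps tests independent
		PartitionCount: 1,
		PublisherName:  "isolated-publisher",
		RecordType:     CreateTestSchema(),
	})
	require.NoError(t, err)
	t.Cleanup(func() { publisher.Shutdown() }) // runs even if the test fails

	// test body ...
}
```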
### Code Style
- Follow Go testing conventions
- Use testify for assertions
- Include setup/teardown in test functions
- Use descriptive test names
## Future Enhancements
- [ ] Chaos engineering tests (network partitions, node failures)
- [ ] Multi-datacenter deployment testing
- [ ] Schema evolution compatibility tests
- [ ] Security and authentication tests
- [ ] Performance regression detection
- [ ] Automated load pattern generation
## Support
For issues and questions:
- Check existing GitHub issues
- Review SeaweedMQ documentation
- Join SeaweedFS community discussions
---
*This integration test suite ensures SeaweedMQ's reliability, performance, and functionality across all critical use cases and failure scenarios.*

333
test/mq/docker-compose.test.yml

@@ -0,0 +1,333 @@
version: '3.9'

services:
  # Master cluster for coordination and metadata
  master0:
    image: chrislusf/seaweedfs:local
    container_name: test-master0
    ports:
      - "19333:9333"
      - "29333:19333"
    command: >
      master
      -v=1
      -volumeSizeLimitMB=100
      -resumeState=false
      -ip=master0
      -port=9333
      -peers=master0:9333,master1:9334,master2:9335
      -mdir=/tmp/master0
    environment:
      WEED_MASTER_VOLUME_GROWTH_COPY_1: 1
      WEED_MASTER_VOLUME_GROWTH_COPY_2: 2
      WEED_MASTER_VOLUME_GROWTH_COPY_OTHER: 1
    networks:
      - seaweedmq-test
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9333/cluster/status"]
      interval: 10s
      timeout: 5s
      retries: 3

  master1:
    image: chrislusf/seaweedfs:local
    container_name: test-master1
    ports:
      - "19334:9334"
      - "29334:19334"
    command: >
      master
      -v=1
      -volumeSizeLimitMB=100
      -resumeState=false
      -ip=master1
      -port=9334
      -peers=master0:9333,master1:9334,master2:9335
      -mdir=/tmp/master1
    environment:
      WEED_MASTER_VOLUME_GROWTH_COPY_1: 1
      WEED_MASTER_VOLUME_GROWTH_COPY_2: 2
      WEED_MASTER_VOLUME_GROWTH_COPY_OTHER: 1
    networks:
      - seaweedmq-test
    depends_on:
      - master0

  master2:
    image: chrislusf/seaweedfs:local
    container_name: test-master2
    ports:
      - "19335:9335"
      - "29335:19335"
    command: >
      master
      -v=1
      -volumeSizeLimitMB=100
      -resumeState=false
      -ip=master2
      -port=9335
      -peers=master0:9333,master1:9334,master2:9335
      -mdir=/tmp/master2
    environment:
      WEED_MASTER_VOLUME_GROWTH_COPY_1: 1
      WEED_MASTER_VOLUME_GROWTH_COPY_2: 2
      WEED_MASTER_VOLUME_GROWTH_COPY_OTHER: 1
    networks:
      - seaweedmq-test
    depends_on:
      - master0

  # Volume servers for data storage
  volume1:
    image: chrislusf/seaweedfs:local
    container_name: test-volume1
    ports:
      - "18080:8080"
      - "28080:18080"
    command: >
      volume
      -v=1
      -dataCenter=dc1
      -rack=rack1
      -mserver=master0:9333,master1:9334,master2:9335
      -port=8080
      -ip=volume1
      -publicUrl=localhost:18080
      -preStopSeconds=1
      -dir=/tmp/volume1
    networks:
      - seaweedmq-test
    depends_on:
      master0:
        condition: service_healthy

  volume2:
    image: chrislusf/seaweedfs:local
    container_name: test-volume2
    ports:
      - "18081:8081"
      - "28081:18081"
    command: >
      volume
      -v=1
      -dataCenter=dc1
      -rack=rack2
      -mserver=master0:9333,master1:9334,master2:9335
      -port=8081
      -ip=volume2
      -publicUrl=localhost:18081
      -preStopSeconds=1
      -dir=/tmp/volume2
    networks:
      - seaweedmq-test
    depends_on:
      master0:
        condition: service_healthy

  volume3:
    image: chrislusf/seaweedfs:local
    container_name: test-volume3
    ports:
      - "18082:8082"
      - "28082:18082"
    command: >
      volume
      -v=1
      -dataCenter=dc2
      -rack=rack1
      -mserver=master0:9333,master1:9334,master2:9335
      -port=8082
      -ip=volume3
      -publicUrl=localhost:18082
      -preStopSeconds=1
      -dir=/tmp/volume3
    networks:
      - seaweedmq-test
    depends_on:
      master0:
        condition: service_healthy

  # Filer servers for metadata
  filer1:
    image: chrislusf/seaweedfs:local
    container_name: test-filer1
    ports:
      - "18888:8888"
      - "28888:18888"
    command: >
      filer
      -v=1
      -defaultReplicaPlacement=100
      -iam
      -master=master0:9333,master1:9334,master2:9335
      -port=8888
      -ip=filer1
      -dataCenter=dc1
    networks:
      - seaweedmq-test
    depends_on:
      master0:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8888/"]
      interval: 10s
      timeout: 5s
      retries: 3

  filer2:
    image: chrislusf/seaweedfs:local
    container_name: test-filer2
    ports:
      - "18889:8889"
      - "28889:18889"
    command: >
      filer
      -v=1
      -defaultReplicaPlacement=100
      -iam
      -master=master0:9333,master1:9334,master2:9335
      -port=8889
      -ip=filer2
      -dataCenter=dc2
    networks:
      - seaweedmq-test
    depends_on:
      filer1:
        condition: service_healthy

  # Message Queue Brokers
  broker1:
    image: chrislusf/seaweedfs:local
    container_name: test-broker1
    ports:
      - "17777:17777"
    command: >
      mq.broker
      -v=1
      -master=master0:9333,master1:9334,master2:9335
      -port=17777
      -ip=broker1
      -dataCenter=dc1
      -rack=rack1
    networks:
      - seaweedmq-test
    depends_on:
      filer1:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "nc", "-z", "localhost", "17777"]
      interval: 10s
      timeout: 5s
      retries: 3

  broker2:
    image: chrislusf/seaweedfs:local
    container_name: test-broker2
    ports:
      - "17778:17778"
    command: >
      mq.broker
      -v=1
      -master=master0:9333,master1:9334,master2:9335
      -port=17778
      -ip=broker2
      -dataCenter=dc1
      -rack=rack2
    networks:
      - seaweedmq-test
    depends_on:
      broker1:
        condition: service_healthy

  broker3:
    image: chrislusf/seaweedfs:local
    container_name: test-broker3
    ports:
      - "17779:17779"
    command: >
      mq.broker
      -v=1
      -master=master0:9333,master1:9334,master2:9335
      -port=17779
      -ip=broker3
      -dataCenter=dc2
      -rack=rack1
    networks:
      - seaweedmq-test
    depends_on:
      broker1:
        condition: service_healthy

  # Test runner container
  test-runner:
    build:
      context: ../../
      dockerfile: test/mq/Dockerfile.test
    container_name: test-runner
    volumes:
      - ../../:/app
      - /tmp/test-results:/test-results
    working_dir: /app
    environment:
      - SEAWEED_MASTERS=master0:9333,master1:9334,master2:9335
      - SEAWEED_BROKERS=broker1:17777,broker2:17778,broker3:17779
      - SEAWEED_FILERS=filer1:8888,filer2:8889
      - TEST_RESULTS_DIR=/test-results
      - GO_TEST_TIMEOUT=30m
    networks:
      - seaweedmq-test
    depends_on:
      broker1:
        condition: service_healthy
      broker2:
        condition: service_started
      broker3:
        condition: service_started
    command: >
      sh -c "
      echo 'Waiting for cluster to be ready...' &&
      sleep 30 &&
      echo 'Running integration tests...' &&
      go test -v -timeout=30m ./test/mq/integration/... -args -test.parallel=4
      "

  # Monitoring and metrics
  prometheus:
    image: prom/prometheus:latest
    container_name: test-prometheus
    ports:
      - "19090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    networks:
      - seaweedmq-test
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    container_name: test-grafana
    ports:
      - "13000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    networks:
      - seaweedmq-test

networks:
  seaweedmq-test:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16

volumes:
  grafana-storage:

334
test/mq/integration/basic_pubsub_test.go

@@ -0,0 +1,334 @@
package integration
import (
"fmt"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/mq/schema"
"github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBasicPublishSubscribe(t *testing.T) {
suite := NewIntegrationTestSuite(t)
require.NoError(t, suite.Setup())
// Test configuration
namespace := "test"
topicName := "basic-pubsub"
testSchema := CreateTestSchema()
messageCount := 10
// Create publisher
pubConfig := &PublisherTestConfig{
Namespace: namespace,
TopicName: topicName,
PartitionCount: 1,
PublisherName: "test-publisher",
RecordType: testSchema,
}
publisher, err := suite.CreatePublisher(pubConfig)
require.NoError(t, err, "Failed to create publisher")
// Create subscriber
subConfig := &SubscriberTestConfig{
Namespace: namespace,
TopicName: topicName,
ConsumerGroup: "test-group",
ConsumerInstanceId: "consumer-1",
MaxPartitionCount: 1,
SlidingWindowSize: 10,
OffsetType: schema_pb.OffsetType_RESET_TO_EARLIEST,
}
subscriber, err := suite.CreateSubscriber(subConfig)
require.NoError(t, err, "Failed to create subscriber")
// Set up message collector
collector := NewMessageCollector(messageCount)
subscriber.SetOnDataMessageFn(func(m *mq_pb.SubscribeMessageResponse_Data) {
collector.AddMessage(TestMessage{
ID: fmt.Sprintf("msg-%d", len(collector.GetMessages())),
Content: m.Data.Value,
Timestamp: time.Unix(0, m.Data.TsNs),
Key: m.Data.Key,
})
})
// Start subscriber
go func() {
err := subscriber.Subscribe()
if err != nil {
t.Logf("Subscriber error: %v", err)
}
}()
// Wait for subscriber to be ready
time.Sleep(2 * time.Second)
// Publish test messages
for i := 0; i < messageCount; i++ {
record := schema.RecordBegin().
SetString("id", fmt.Sprintf("msg-%d", i)).
SetInt64("timestamp", time.Now().UnixNano()).
SetString("content", fmt.Sprintf("Test message %d", i)).
SetInt32("sequence", int32(i)).
RecordEnd()
key := []byte(fmt.Sprintf("key-%d", i))
err := publisher.PublishRecord(key, record)
require.NoError(t, err, "Failed to publish message %d", i)
}
// Wait for messages to be received
messages := collector.WaitForMessages(30 * time.Second)
// Verify all messages were received
assert.Len(t, messages, messageCount, "Expected %d messages, got %d", messageCount, len(messages))
// Verify message content
for i, msg := range messages {
assert.NotEmpty(t, msg.Content, "Message %d should have content", i)
assert.NotEmpty(t, msg.Key, "Message %d should have key", i)
}
}
func TestMultipleConsumers(t *testing.T) {
suite := NewIntegrationTestSuite(t)
require.NoError(t, suite.Setup())
namespace := "test"
topicName := "multi-consumer"
testSchema := CreateTestSchema()
messageCount := 20
consumerCount := 3
// Create publisher
pubConfig := &PublisherTestConfig{
Namespace: namespace,
TopicName: topicName,
PartitionCount: 3, // Multiple partitions for load distribution
PublisherName: "multi-publisher",
RecordType: testSchema,
}
publisher, err := suite.CreatePublisher(pubConfig)
require.NoError(t, err)
// Create multiple consumers
collectors := make([]*MessageCollector, consumerCount)
for i := 0; i < consumerCount; i++ {
collectors[i] = NewMessageCollector(messageCount / consumerCount) // Expect roughly equal distribution
subConfig := &SubscriberTestConfig{
Namespace: namespace,
TopicName: topicName,
ConsumerGroup: "multi-consumer-group", // Same group for load balancing
ConsumerInstanceId: fmt.Sprintf("consumer-%d", i),
MaxPartitionCount: 1,
SlidingWindowSize: 10,
OffsetType: schema_pb.OffsetType_RESET_TO_EARLIEST,
}
subscriber, err := suite.CreateSubscriber(subConfig)
require.NoError(t, err)
// Set up message collection for this consumer
collectorIndex := i
subscriber.SetOnDataMessageFn(func(m *mq_pb.SubscribeMessageResponse_Data) {
collectors[collectorIndex].AddMessage(TestMessage{
ID: fmt.Sprintf("consumer-%d-msg-%d", collectorIndex, len(collectors[collectorIndex].GetMessages())),
Content: m.Data.Value,
Timestamp: time.Unix(0, m.Data.TsNs),
Key: m.Data.Key,
})
})
// Start subscriber
go func() {
subscriber.Subscribe()
}()
}
// Wait for subscribers to be ready
time.Sleep(3 * time.Second)
// Publish messages with different keys to distribute across partitions
for i := 0; i < messageCount; i++ {
record := schema.RecordBegin().
SetString("id", fmt.Sprintf("multi-msg-%d", i)).
SetInt64("timestamp", time.Now().UnixNano()).
SetString("content", fmt.Sprintf("Multi consumer test message %d", i)).
SetInt32("sequence", int32(i)).
RecordEnd()
key := []byte(fmt.Sprintf("partition-key-%d", i%3)) // Distribute across 3 partitions
err := publisher.PublishRecord(key, record)
require.NoError(t, err)
}
// Wait for all messages to be consumed
time.Sleep(10 * time.Second)
// Verify message distribution
totalReceived := 0
for i, collector := range collectors {
messages := collector.GetMessages()
t.Logf("Consumer %d received %d messages", i, len(messages))
totalReceived += len(messages)
}
// All messages should be consumed across all consumers
assert.Equal(t, messageCount, totalReceived, "Total messages received should equal messages sent")
}
func TestMessageOrdering(t *testing.T) {
suite := NewIntegrationTestSuite(t)
require.NoError(t, suite.Setup())
namespace := "test"
topicName := "ordering-test"
testSchema := CreateTestSchema()
messageCount := 15
// Create publisher
pubConfig := &PublisherTestConfig{
Namespace: namespace,
TopicName: topicName,
PartitionCount: 1, // Single partition to guarantee ordering
PublisherName: "ordering-publisher",
RecordType: testSchema,
}
publisher, err := suite.CreatePublisher(pubConfig)
require.NoError(t, err)
// Create subscriber
subConfig := &SubscriberTestConfig{
Namespace: namespace,
TopicName: topicName,
ConsumerGroup: "ordering-group",
ConsumerInstanceId: "ordering-consumer",
MaxPartitionCount: 1,
SlidingWindowSize: 5,
OffsetType: schema_pb.OffsetType_RESET_TO_EARLIEST,
}
subscriber, err := suite.CreateSubscriber(subConfig)
require.NoError(t, err)
// Set up message collector
collector := NewMessageCollector(messageCount)
subscriber.SetOnDataMessageFn(func(m *mq_pb.SubscribeMessageResponse_Data) {
collector.AddMessage(TestMessage{
ID: "ordered-msg",
Content: m.Data.Value,
Timestamp: time.Unix(0, m.Data.TsNs),
Key: m.Data.Key,
})
})
// Start subscriber
go func() {
subscriber.Subscribe()
}()
// Wait for consumer to be ready
time.Sleep(2 * time.Second)
// Publish messages with same key to ensure they go to same partition
publishTimes := make([]time.Time, messageCount)
for i := 0; i < messageCount; i++ {
publishTimes[i] = time.Now()
record := schema.RecordBegin().
SetString("id", fmt.Sprintf("ordered-%d", i)).
SetInt64("timestamp", publishTimes[i].UnixNano()).
SetString("content", fmt.Sprintf("Ordered message %d", i)).
SetInt32("sequence", int32(i)).
RecordEnd()
key := []byte("same-partition-key") // Same key ensures same partition
err := publisher.PublishRecord(key, record)
require.NoError(t, err)
// Small delay to ensure different timestamps
time.Sleep(10 * time.Millisecond)
}
// Wait for all messages
messages := collector.WaitForMessages(30 * time.Second)
require.Len(t, messages, messageCount)
// Verify ordering within the partition
suite.AssertMessageOrdering(t, messages)
}
func TestSchemaValidation(t *testing.T) {
suite := NewIntegrationTestSuite(t)
require.NoError(t, suite.Setup())
namespace := "test"
topicName := "schema-validation"
// Test with simple schema
simpleSchema := CreateTestSchema()
pubConfig := &PublisherTestConfig{
Namespace: namespace,
TopicName: topicName,
PartitionCount: 1,
PublisherName: "schema-publisher",
RecordType: simpleSchema,
}
publisher, err := suite.CreatePublisher(pubConfig)
require.NoError(t, err)
// Test valid record
validRecord := schema.RecordBegin().
SetString("id", "valid-msg").
SetInt64("timestamp", time.Now().UnixNano()).
SetString("content", "Valid message").
SetInt32("sequence", 1).
RecordEnd()
err = publisher.PublishRecord([]byte("test-key"), validRecord)
assert.NoError(t, err, "Valid record should be published successfully")
// Test with complex nested schema
complexSchema := CreateComplexTestSchema()
complexPubConfig := &PublisherTestConfig{
Namespace: namespace,
TopicName: topicName + "-complex",
PartitionCount: 1,
PublisherName: "complex-publisher",
RecordType: complexSchema,
}
complexPublisher, err := suite.CreatePublisher(complexPubConfig)
require.NoError(t, err)
// Test complex nested record
complexRecord := schema.RecordBegin().
SetString("user_id", "user123").
SetString("name", "John Doe").
SetInt32("age", 30).
SetStringList("emails", "john@example.com", "john.doe@company.com").
SetRecord("address",
schema.RecordBegin().
SetString("street", "123 Main St").
SetString("city", "New York").
SetString("zipcode", "10001").
RecordEnd()).
SetInt64("created_at", time.Now().UnixNano()).
RecordEnd()
err = complexPublisher.PublishRecord([]byte("complex-key"), complexRecord)
assert.NoError(t, err, "Complex nested record should be published successfully")
}

355
test/mq/integration/framework.go

@@ -0,0 +1,355 @@
package integration
import (
"context"
"fmt"
"os"
"strings"
"sync"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/mq/agent"
"github.com/seaweedfs/seaweedfs/weed/mq/client/pub_client"
"github.com/seaweedfs/seaweedfs/weed/mq/client/sub_client"
"github.com/seaweedfs/seaweedfs/weed/mq/schema"
"github.com/seaweedfs/seaweedfs/weed/mq/topic"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
"github.com/stretchr/testify/require"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
)
// TestEnvironment holds the configuration for the test environment
type TestEnvironment struct {
Masters []string
Brokers []string
Filers []string
TestTimeout time.Duration
CleanupFuncs []func()
mutex sync.Mutex
}
// IntegrationTestSuite provides the base test framework
type IntegrationTestSuite struct {
env *TestEnvironment
agents map[string]*agent.MessageQueueAgent
publishers map[string]*pub_client.TopicPublisher
subscribers map[string]*sub_client.TopicSubscriber
cleanupOnce sync.Once
t *testing.T
}
// NewIntegrationTestSuite creates a new test suite instance
func NewIntegrationTestSuite(t *testing.T) *IntegrationTestSuite {
env := &TestEnvironment{
Masters: getEnvList("SEAWEED_MASTERS", []string{"localhost:19333"}),
Brokers: getEnvList("SEAWEED_BROKERS", []string{"localhost:17777"}),
Filers: getEnvList("SEAWEED_FILERS", []string{"localhost:18888"}),
TestTimeout: getEnvDuration("GO_TEST_TIMEOUT", 30*time.Minute),
}
return &IntegrationTestSuite{
env: env,
agents: make(map[string]*agent.MessageQueueAgent),
publishers: make(map[string]*pub_client.TopicPublisher),
subscribers: make(map[string]*sub_client.TopicSubscriber),
t: t,
}
}
// Setup initializes the test environment
func (its *IntegrationTestSuite) Setup() error {
// Wait for cluster to be ready
if err := its.waitForClusterReady(); err != nil {
return fmt.Errorf("cluster not ready: %v", err)
}
// Register cleanup
its.t.Cleanup(its.Cleanup)
return nil
}
// Cleanup performs cleanup operations
func (its *IntegrationTestSuite) Cleanup() {
its.cleanupOnce.Do(func() {
// Close all subscribers (they use context cancellation)
for name := range its.subscribers {
its.t.Logf("Cleaned up subscriber: %s", name)
}
// Close all publishers
for name, publisher := range its.publishers {
if publisher != nil {
publisher.Shutdown()
its.t.Logf("Cleaned up publisher: %s", name)
}
}
// Execute additional cleanup functions
its.env.mutex.Lock()
for _, cleanup := range its.env.CleanupFuncs {
cleanup()
}
its.env.mutex.Unlock()
})
}
// CreatePublisher creates a new topic publisher
func (its *IntegrationTestSuite) CreatePublisher(config *PublisherTestConfig) (*pub_client.TopicPublisher, error) {
publisherConfig := &pub_client.PublisherConfiguration{
Topic: topic.NewTopic(config.Namespace, config.TopicName),
PartitionCount: config.PartitionCount,
Brokers: its.env.Brokers,
PublisherName: config.PublisherName,
RecordType: config.RecordType,
}
publisher, err := pub_client.NewTopicPublisher(publisherConfig)
if err != nil {
return nil, fmt.Errorf("failed to create publisher: %v", err)
}
its.publishers[config.PublisherName] = publisher
return publisher, nil
}
// CreateSubscriber creates a new topic subscriber
func (its *IntegrationTestSuite) CreateSubscriber(config *SubscriberTestConfig) (*sub_client.TopicSubscriber, error) {
subscriberConfig := &sub_client.SubscriberConfiguration{
ConsumerGroup: config.ConsumerGroup,
ConsumerGroupInstanceId: config.ConsumerInstanceId,
GrpcDialOption: grpc.WithTransportCredentials(insecure.NewCredentials()),
MaxPartitionCount: config.MaxPartitionCount,
SlidingWindowSize: config.SlidingWindowSize,
}
contentConfig := &sub_client.ContentConfiguration{
Topic: topic.NewTopic(config.Namespace, config.TopicName),
Filter: config.Filter,
PartitionOffsets: config.PartitionOffsets,
OffsetType: config.OffsetType,
OffsetTsNs: config.OffsetTsNs,
}
offsetChan := make(chan sub_client.KeyedOffset, 1024)
subscriber := sub_client.NewTopicSubscriber(
context.Background(),
its.env.Brokers,
subscriberConfig,
contentConfig,
offsetChan,
)
its.subscribers[config.ConsumerInstanceId] = subscriber
return subscriber, nil
}
// CreateAgent creates a new message queue agent
func (its *IntegrationTestSuite) CreateAgent(name string) (*agent.MessageQueueAgent, error) {
var brokerAddresses []pb.ServerAddress
for _, broker := range its.env.Brokers {
brokerAddresses = append(brokerAddresses, pb.ServerAddress(broker))
}
agentOptions := &agent.MessageQueueAgentOptions{
SeedBrokers: brokerAddresses,
}
mqAgent := agent.NewMessageQueueAgent(
agentOptions,
grpc.WithTransportCredentials(insecure.NewCredentials()),
)
its.agents[name] = mqAgent
return mqAgent, nil
}
// PublisherTestConfig holds configuration for creating test publishers
type PublisherTestConfig struct {
Namespace string
TopicName string
PartitionCount int32
PublisherName string
RecordType *schema_pb.RecordType
}
// SubscriberTestConfig holds configuration for creating test subscribers
type SubscriberTestConfig struct {
Namespace string
TopicName string
ConsumerGroup string
ConsumerInstanceId string
MaxPartitionCount int32
SlidingWindowSize int32
Filter string
PartitionOffsets []*schema_pb.PartitionOffset
OffsetType schema_pb.OffsetType
OffsetTsNs int64
}
// TestMessage represents a test message with metadata
type TestMessage struct {
ID string
Content []byte
Timestamp time.Time
Key []byte
}
// MessageCollector collects received messages for verification
type MessageCollector struct {
messages []TestMessage
mutex sync.RWMutex
waitCh chan struct{}
expected int
}
// NewMessageCollector creates a new message collector
func NewMessageCollector(expectedCount int) *MessageCollector {
return &MessageCollector{
messages: make([]TestMessage, 0),
waitCh: make(chan struct{}),
expected: expectedCount,
}
}
// AddMessage adds a received message to the collector
func (mc *MessageCollector) AddMessage(msg TestMessage) {
mc.mutex.Lock()
defer mc.mutex.Unlock()
mc.messages = append(mc.messages, msg)
// Close the wait channel exactly once, when the expected count is first
// reached; closing again on extra messages would panic.
if len(mc.messages) == mc.expected {
close(mc.waitCh)
}
}
// WaitForMessages waits for the expected number of messages or timeout
func (mc *MessageCollector) WaitForMessages(timeout time.Duration) []TestMessage {
select {
case <-mc.waitCh:
case <-time.After(timeout):
}
mc.mutex.RLock()
defer mc.mutex.RUnlock()
result := make([]TestMessage, len(mc.messages))
copy(result, mc.messages)
return result
}
// GetMessages returns all collected messages
func (mc *MessageCollector) GetMessages() []TestMessage {
mc.mutex.RLock()
defer mc.mutex.RUnlock()
result := make([]TestMessage, len(mc.messages))
copy(result, mc.messages)
return result
}
// CreateTestSchema creates a simple test schema
func CreateTestSchema() *schema_pb.RecordType {
return schema.RecordTypeBegin().
WithField("id", schema.TypeString).
WithField("timestamp", schema.TypeInt64).
WithField("content", schema.TypeString).
WithField("sequence", schema.TypeInt32).
RecordTypeEnd()
}
// CreateComplexTestSchema creates a complex test schema with nested structures
func CreateComplexTestSchema() *schema_pb.RecordType {
addressType := schema.RecordTypeBegin().
WithField("street", schema.TypeString).
WithField("city", schema.TypeString).
WithField("zipcode", schema.TypeString).
RecordTypeEnd()
return schema.RecordTypeBegin().
WithField("user_id", schema.TypeString).
WithField("name", schema.TypeString).
WithField("age", schema.TypeInt32).
WithField("emails", schema.ListOf(schema.TypeString)).
WithRecordField("address", addressType).
WithField("created_at", schema.TypeInt64).
RecordTypeEnd()
}
// Helper functions
func getEnvList(key string, defaultValue []string) []string {
value := os.Getenv(key)
if value == "" {
return defaultValue
}
return strings.Split(value, ",")
}
func getEnvDuration(key string, defaultValue time.Duration) time.Duration {
value := os.Getenv(key)
if value == "" {
return defaultValue
}
duration, err := time.ParseDuration(value)
if err != nil {
return defaultValue
}
return duration
}
func (its *IntegrationTestSuite) waitForClusterReady() error {
maxRetries := 30
retryInterval := 2 * time.Second
for i := 0; i < maxRetries; i++ {
if its.isClusterReady() {
return nil
}
its.t.Logf("Waiting for cluster to be ready... attempt %d/%d", i+1, maxRetries)
time.Sleep(retryInterval)
}
return fmt.Errorf("cluster not ready after %d attempts", maxRetries)
}
func (its *IntegrationTestSuite) isClusterReady() bool {
// Check if at least one broker is accessible
for _, broker := range its.env.Brokers {
if its.isBrokerReady(broker) {
return true
}
}
return false
}
func (its *IntegrationTestSuite) isBrokerReady(broker string) bool {
// Simple connection test
conn, err := grpc.NewClient(broker, grpc.WithTransportCredentials(insecure.NewCredentials()))
if err != nil {
return false
}
defer conn.Close()
// TODO: Add actual health check call here
return true
}
// AssertMessagesReceived verifies that expected messages were received
func (its *IntegrationTestSuite) AssertMessagesReceived(t *testing.T, collector *MessageCollector, expectedCount int, timeout time.Duration) {
messages := collector.WaitForMessages(timeout)
require.Len(t, messages, expectedCount, "Expected %d messages, got %d", expectedCount, len(messages))
}
// AssertMessageOrdering verifies that messages are received in the expected order
func (its *IntegrationTestSuite) AssertMessageOrdering(t *testing.T, messages []TestMessage) {
for i := 1; i < len(messages); i++ {
require.True(t, messages[i].Timestamp.After(messages[i-1].Timestamp) || messages[i].Timestamp.Equal(messages[i-1].Timestamp),
"Messages not in chronological order: message %d timestamp %v should be >= message %d timestamp %v",
i, messages[i].Timestamp, i-1, messages[i-1].Timestamp)
}
}

286
test/mq/integration_test_design.md

@@ -0,0 +1,286 @@
# SeaweedMQ Integration Test Design
## Overview
This document outlines the comprehensive integration test strategy for SeaweedMQ, covering all critical functionalities from basic pub/sub operations to advanced features like auto-scaling, failover, and performance testing.
## Architecture Under Test
SeaweedMQ consists of:
- **Masters**: Cluster coordination and metadata management
- **Volume Servers**: Storage layer for persistent messages
- **Filers**: File system interface for metadata storage
- **Brokers**: Message processing and routing (stateless)
- **Agents**: Client interface for pub/sub operations
- **Schema System**: Protobuf-based message schema management
## Test Categories
### 1. Basic Functionality Tests
#### 1.1 Basic Pub/Sub Operations
- **Test**: `TestBasicPublishSubscribe`
- Publish messages to a topic
- Subscribe and receive messages
- Verify message content and ordering
- Test with different data types (string, int, bytes, records)
- **Test**: `TestMultipleConsumers`
- Multiple subscribers on same topic
- Verify message distribution
- Test consumer group functionality
- **Test**: `TestMessageOrdering`
- Publish messages in sequence
- Verify FIFO ordering within partitions
- Test with different partition keys
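The ordering assertion itself can be as simple as a non-decreasing timestamp check, as in `AssertMessageOrdering` in `test/mq/integration/framework.go`:

```go
// FIFO check within one partition: consume with a single consumer and
// assert non-decreasing timestamps.
for i := 1; i < len(messages); i++ {
	require.False(t, messages[i].Timestamp.Before(messages[i-1].Timestamp),
		"message %d arrived out of order", i)
}
```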
#### 1.2 Schema Management
- **Test**: `TestSchemaValidation`
- Publish with valid schemas
- Reject invalid schema messages
- Test schema evolution scenarios
- **Test**: `TestRecordTypes`
- Nested record structures
- List types and complex schemas
- Schema-to-Parquet conversion
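A nested record type for these tests can be composed with the schema builders (API as used in `test/mq/integration/framework.go`):

```go
addressType := schema.RecordTypeBegin().
	WithField("street", schema.TypeString).
	WithField("city", schema.TypeString).
	RecordTypeEnd()

userType := schema.RecordTypeBegin().
	WithField("user_id", schema.TypeString).
	WithField("emails", schema.ListOf(schema.TypeString)).
	WithRecordField("address", addressType).
	RecordTypeEnd()
```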
### 2. Partitioning and Scaling Tests
#### 2.1 Partition Management
- **Test**: `TestPartitionDistribution`
- Messages distributed across partitions based on keys
- Verify partition assignment logic
- Test partition rebalancing
- **Test**: `TestAutoSplitMerge`
- Simulate high load to trigger auto-split
- Simulate low load to trigger auto-merge
- Verify data consistency during splits/merges
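For intuition, key-based partitioning generally reduces to a stable hash of the message key. The sketch below is illustrative only and is not SeaweedMQ's actual broker-side assignment logic, which also handles rebalancing and splits/merges:

```go
// Illustrative generic key-to-partition mapping.
func partitionFor(key []byte, partitionCount int32) int32 {
	h := fnv.New32a() // hash/fnv
	h.Write(key)
	return int32(h.Sum32() % uint32(partitionCount))
}
```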
#### 2.2 Broker Scaling
- **Test**: `TestBrokerAddRemove`
- Add brokers during operation
- Remove brokers gracefully
- Verify partition reassignment
- **Test**: `TestLoadBalancing`
- Verify even load distribution across brokers
- Test with varying message sizes and rates
- Monitor broker resource utilization
### 3. Failover and Reliability Tests
#### 3.1 Broker Failover
- **Test**: `TestBrokerFailover`
- Kill leader broker during publishing
- Verify seamless failover to follower
- Test data consistency after failover
- **Test**: `TestBrokerRecovery`
- Broker restart scenarios
- State recovery from storage
- Partition reassignment after recovery
#### 3.2 Data Durability
- **Test**: `TestMessagePersistence`
- Publish messages and restart cluster
- Verify all messages are recovered
- Test with different replication settings
- **Test**: `TestFollowerReplication`
- Leader-follower message replication
- Verify consistency between replicas
- Test follower promotion scenarios
### 4. Agent Functionality Tests
#### 4.1 Session Management
- **Test**: `TestPublishSessions`
- Create/close publish sessions
- Concurrent session management
- Session cleanup after failures
- **Test**: `TestSubscribeSessions`
- Subscribe session lifecycle
- Consumer group management
- Offset tracking and acknowledgments
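Session tests start from an agent handle; creating one against the test brokers looks like the framework's `CreateAgent` helper (the addresses are the test cluster's, and the pub/sub session calls themselves are what these tests exercise):

```go
mqAgent := agent.NewMessageQueueAgent(
	&agent.MessageQueueAgentOptions{
		SeedBrokers: []pb.ServerAddress{"broker1:17777", "broker2:17778", "broker3:17779"},
	},
	grpc.WithTransportCredentials(insecure.NewCredentials()),
)
```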
#### 4.2 Error Handling
- **Test**: `TestConnectionFailures`
- Network partitions between agent and broker
- Automatic reconnection logic
- Message buffering during outages
### 5. Performance and Load Tests
#### 5.1 Throughput Tests
- **Test**: `TestHighThroughputPublish`
- Publish 100K+ messages/second
- Monitor system resources
- Verify no message loss
- **Test**: `TestHighThroughputSubscribe`
- Multiple consumers processing high volume
- Monitor processing latency
- Test backpressure handling
#### 5.2 Spike Traffic Tests
- **Test**: `TestTrafficSpikes`
- Sudden increase in message volume
- Auto-scaling behavior verification
- Resource utilization patterns
- **Test**: `TestLargeMessages`
- Messages with large payloads (MB size)
- Memory usage monitoring
- Storage efficiency testing
### 6. End-to-End Scenarios
#### 6.1 Complete Workflow Tests
- **Test**: `TestProducerConsumerWorkflow`
- Multi-stage data processing pipeline
- Producer → Topic → Multiple Consumers
- Data transformation and aggregation
- **Test**: `TestMultiTopicOperations`
- Multiple topics with different schemas
- Cross-topic message routing
- Topic management operations
## Test Infrastructure
### Environment Setup
#### Docker Compose Configuration
```yaml
# test-environment.yml
version: '3.9'
services:
  master-cluster:
    # 3 master nodes for HA
  volume-cluster:
    # 3 volume servers for data storage
  filer-cluster:
    # 2 filers for metadata
  broker-cluster:
    # 3 brokers for message processing
  test-runner:
    # Container to run integration tests
```
#### Test Data Management
- Pre-defined test schemas
- Sample message datasets
- Performance benchmarking data
### Test Framework Structure
```go
// Base test framework
type IntegrationTestSuite struct {
	masters    []string
	brokers    []string
	filers     []string
	testClient *TestClient
	cleanup    []func()
}

// Test utilities
type TestClient struct {
	publishers  map[string]*pub_client.TopicPublisher
	subscribers map[string]*sub_client.TopicSubscriber
	agents      []*agent.MessageQueueAgent
}
```
### Monitoring and Metrics
#### Health Checks
- Broker connectivity status
- Master cluster health
- Storage system availability
- Network connectivity between components
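The broker probe in `framework.go` currently only dials (see its TODO). One way to fill it in is the standard gRPC health protocol, assuming the broker registers the health service; if it does not, a lightweight application RPC would substitute:

```go
import (
	"context"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/health/grpc_health_v1"
)

func isBrokerHealthy(addr string) bool {
	conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		return false
	}
	defer conn.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	resp, err := grpc_health_v1.NewHealthClient(conn).Check(ctx, &grpc_health_v1.HealthCheckRequest{})
	return err == nil && resp.Status == grpc_health_v1.HealthCheckResponse_SERVING
}
```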
#### Performance Metrics
- Message throughput (msgs/sec)
- End-to-end latency
- Resource utilization (CPU, Memory, Disk)
- Network bandwidth usage
## Test Execution Strategy
### Parallel Test Execution
- Categorize tests by resource requirements
- Run independent tests in parallel
- Serialize tests that modify cluster state
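In Go this split falls out of `t.Parallel()` (sketch):

```go
// Independent tests opt in to parallel execution; tests that mutate
// cluster state (failover, scaling) simply omit t.Parallel() and are
// therefore run serially by `go test`.
func TestReadOnlyScenario(t *testing.T) {
	t.Parallel()
	// ...
}
```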
### Continuous Integration
- Automated test runs on PR submissions
- Performance regression detection
- Multi-platform testing (Linux, macOS, Windows)
### Test Environment Management
- Docker-based isolated environments
- Automatic cleanup after test completion
- Resource monitoring and alerts
## Success Criteria
### Functional Requirements
- ✅ All messages published are received by subscribers
- ✅ Message ordering preserved within partitions
- ✅ Schema validation works correctly
- ✅ Auto-scaling triggers at expected thresholds
- ✅ Failover completes within 30 seconds
- ✅ No data loss during normal operations
### Performance Requirements
- ✅ Throughput: 50K+ messages/second/broker
- ✅ Latency: P95 < 100ms end-to-end
- ✅ Memory usage: < 1GB per broker under normal load
- ✅ Storage efficiency: < 20% overhead vs raw message size
### Reliability Requirements
- ✅ 99.9% uptime during normal operations
- ✅ Automatic recovery from single component failures
- ✅ Data consistency maintained across all scenarios
- ✅ Graceful degradation under resource constraints
## Implementation Timeline
### Phase 1: Core Functionality (Week 1-2)
- Basic pub/sub tests
- Schema validation tests
- Simple failover scenarios
### Phase 2: Advanced Features (Week 3-4)
- Auto-scaling tests
- Complex failover scenarios
- Agent functionality tests
### Phase 3: Performance & Load (Week 5-6)
- Throughput and latency tests
- Spike traffic handling
- Resource utilization monitoring
### Phase 4: End-to-End (Week 7-8)
- Complete workflow tests
- Multi-component integration
- Performance regression testing
## Maintenance and Updates
### Regular Updates
- Add tests for new features
- Update performance baselines
- Enhance error scenarios coverage
### Test Data Refresh
- Generate new test datasets quarterly
- Update schema examples
- Refresh performance benchmarks
This comprehensive test design ensures SeaweedMQ's reliability, performance, and functionality across all critical use cases and failure scenarios.

54
test/mq/prometheus.yml

@@ -0,0 +1,54 @@
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

scrape_configs:
  # SeaweedFS Masters
  - job_name: 'seaweedfs-master'
    static_configs:
      - targets:
          - 'master0:9333'
          - 'master1:9334'
          - 'master2:9335'
    metrics_path: '/metrics'
    scrape_interval: 10s

  # SeaweedFS Volume Servers
  - job_name: 'seaweedfs-volume'
    static_configs:
      - targets:
          - 'volume1:8080'
          - 'volume2:8081'
          - 'volume3:8082'
    metrics_path: '/metrics'
    scrape_interval: 10s

  # SeaweedFS Filers
  - job_name: 'seaweedfs-filer'
    static_configs:
      - targets:
          - 'filer1:8888'
          - 'filer2:8889'
    metrics_path: '/metrics'
    scrape_interval: 10s

  # SeaweedMQ Brokers
  - job_name: 'seaweedmq-broker'
    static_configs:
      - targets:
          - 'broker1:17777'
          - 'broker2:17778'
          - 'broker3:17779'
    metrics_path: '/metrics'
    scrape_interval: 5s

  # Docker containers
  - job_name: 'docker'
    static_configs:
      - targets: ['localhost:9323']
    metrics_path: '/metrics'
    scrape_interval: 30s