package loadtest

import (
	"bytes"
	"crypto/rand"
	"fmt"
	"io"
	"net/http"
	"os"
	"sort"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/seaweedfs/seaweedfs/test/volume_server/framework"
	"github.com/seaweedfs/seaweedfs/test/volume_server/matrix"
)

// Run with:
//   go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer ./test/volume_server/loadtest/...
//   VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer ./test/volume_server/loadtest/...
//
// Compare results (-v is required: the RESULT lines are written with t.Logf, which is
// only printed for passing tests in verbose mode):
//   go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer -bench . ./test/volume_server/loadtest/... | tee go.txt
//   VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer -bench . ./test/volume_server/loadtest/... | tee rust.txt

// payloadSteps is the ascending ladder of payload sizes the size-stepped tests
// iterate over: 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB.
// name is the label used in subtest names and RESULT lines; size is in bytes.
var payloadSteps = []struct {
	name string
	size int
}{
	{name: "1KB", size: 1 << 10},
	{name: "4KB", size: 4 << 10},
	{name: "16KB", size: 16 << 10},
	{name: "64KB", size: 64 << 10},
	{name: "256KB", size: 256 << 10},
	{name: "1MB", size: 1 << 20},
	{name: "4MB", size: 4 << 20},
	{name: "8MB", size: 8 << 20},
}

// implName reports which volume server implementation is under test.
// It returns "rust" only when the VOLUME_SERVER_IMPL environment variable is
// exactly "rust"; any other value (including unset) yields "go".
func implName() string {
	switch os.Getenv("VOLUME_SERVER_IMPL") {
	case "rust":
		return "rust"
	default:
		return "go"
	}
}

// setupCluster starts a volume cluster via framework.StartVolumeCluster, using
// the configuration returned by matrix.P1(). It returns the cluster's volume
// admin URL, its volume gRPC address, and the cluster's Stop method as the
// cleanup function the caller should invoke when done.
func setupCluster(tb testing.TB) (adminURL string, grpcAddr string, cleanup func()) {
	tb.Helper()
	vc := framework.StartVolumeCluster(tb, matrix.P1())
	adminURL = vc.VolumeAdminURL()
	grpcAddr = vc.VolumeGRPCAddress()
	cleanup = vc.Stop
	return adminURL, grpcAddr, cleanup
}

// allocateVolume dials the volume server at grpcAddr and allocates the volume
// with the given volumeID through framework.AllocateVolume, passing "loadtest"
// as its final argument (presumably the collection name — confirm against the
// framework package). It returns nothing; the gRPC connection is closed before
// returning.
func allocateVolume(tb testing.TB, grpcAddr string, volumeID uint32) {
	tb.Helper()
	grpcConn, volumeClient := framework.DialVolumeServer(tb, grpcAddr)
	defer grpcConn.Close()
	framework.AllocateVolume(tb, volumeClient, volumeID, "loadtest")
}

// makePayload returns a freshly allocated slice of size bytes filled from
// crypto/rand.
//
// It panics if the random source reports an error, rather than silently
// handing back a zero-filled buffer that would skew the measurements.
func makePayload(size int) []byte {
	data := make([]byte, size)
	if _, err := rand.Read(data); err != nil {
		panic(fmt.Sprintf("makePayload: reading %d random bytes: %v", size, err))
	}
	return data
}

// uploadFile POSTs data as application/octet-stream to adminURL under the file
// ID built from volumeID, key and cookie.
//
// It returns nil on success. It returns an error when the request cannot be
// built or sent, or when the server answers with a status code of 400 or above.
// The response body is always drained and closed.
func uploadFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32, data []byte) error {
	fid := framework.NewFileID(volumeID, key, cookie)
	url := fmt.Sprintf("%s/%s", adminURL, fid)
	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(data))
	if err != nil {
		return fmt.Errorf("upload %s: %w", fid, err)
	}
	req.Header.Set("Content-Type", "application/octet-stream")
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("upload %s: %w", fid, err)
	}
	defer resp.Body.Close()
	// Drain the body so the transport can reuse the connection. The outcome of
	// the upload is decided by the status code, so a drain error is
	// deliberately ignored.
	_, _ = io.Copy(io.Discard, resp.Body)
	if resp.StatusCode >= 400 {
		return fmt.Errorf("upload %s: status %d", fid, resp.StatusCode)
	}
	return nil
}

// downloadFile GETs the file identified by volumeID, key and cookie from
// adminURL, reads the whole response body and discards it.
//
// It returns an error when the request fails, when the server answers with a
// status code of 400 or above, or when the body cannot be read to the end; a
// download whose body transfer broke off is therefore not reported as a
// success.
func downloadFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32) error {
	fid := framework.NewFileID(volumeID, key, cookie)
	url := fmt.Sprintf("%s/%s", adminURL, fid)
	resp, err := client.Get(url)
	if err != nil {
		return fmt.Errorf("download %s: %w", fid, err)
	}
	defer resp.Body.Close()
	_, readErr := io.Copy(io.Discard, resp.Body)
	// Report the status first: it is the more informative failure when both occur.
	if resp.StatusCode >= 400 {
		return fmt.Errorf("download %s: status %d", fid, resp.StatusCode)
	}
	if readErr != nil {
		return fmt.Errorf("download %s: reading body: %w", fid, readErr)
	}
	return nil
}

// deleteFile sends an HTTP DELETE for the file identified by volumeID, key and
// cookie to adminURL.
//
// It returns an error when the request cannot be built or sent, or — matching
// uploadFile and downloadFile — when the server answers with a status code of
// 400 or above. The response body is always drained and closed.
func deleteFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32) error {
	fid := framework.NewFileID(volumeID, key, cookie)
	url := fmt.Sprintf("%s/%s", adminURL, fid)
	req, err := http.NewRequest(http.MethodDelete, url, nil)
	if err != nil {
		return fmt.Errorf("delete %s: %w", fid, err)
	}
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("delete %s: %w", fid, err)
	}
	defer resp.Body.Close()
	// Drain so the connection can be reused; the status code decides the outcome.
	_, _ = io.Copy(io.Discard, resp.Body)
	if resp.StatusCode >= 400 {
		return fmt.Errorf("delete %s: status %d", fid, resp.StatusCode)
	}
	return nil
}

// --- Throughput load tests (not Go benchmarks, manual timing for comparison) ---

// TestBenchmarkVolumeServer runs a suite of load tests printing ops/sec and latency.
//
// Against a single volume it runs, in order:
//   - uploads at every size in payloadSteps, sequentially and with 16 workers;
//   - downloads at every size, sequentially and with 16 workers;
//   - mixed upload/download with 16 workers at every size;
//   - 16-worker deletes of 1000 pre-uploaded 1KB files.
//
// Every subtest logs one "RESULT ..." line. The test is skipped in -short mode.
func TestBenchmarkVolumeServer(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping load test in short mode")
	}

	impl := implName()
	adminURL, grpcAddr, cleanup := setupCluster(t)
	defer cleanup()

	const volumeID = uint32(10)
	allocateVolume(t, grpcAddr, volumeID)

	httpClient := &http.Client{
		Timeout: 30 * time.Second,
		Transport: &http.Transport{
			MaxIdleConnsPerHost: 128,
			MaxConnsPerHost:     128,
		},
	}

	// opsForSize returns fewer ops for larger payloads to keep test time reasonable.
	opsForSize := func(size, concurrency int) int {
		pick := func(sequential, concurrent int) int {
			if concurrency > 1 {
				return concurrent
			}
			return sequential
		}
		switch {
		case size >= 4<<20:
			return pick(30, 64)
		case size >= 1<<20:
			return pick(100, 200)
		case size >= 64<<10:
			return pick(300, 500)
		default:
			return pick(500, 1000)
		}
	}

	modes := []struct {
		label       string
		concurrency int
	}{
		{"seq", 1},
		{"c16", 16},
	}
	phases := []struct {
		prefix     string
		isDownload bool
	}{
		{"Upload", false},
		{"Download", true},
	}

	// Step-by-step upload, then step-by-step download:
	// 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB
	for _, phase := range phases {
		for _, ps := range payloadSteps {
			for _, mode := range modes {
				name := fmt.Sprintf("%s/%s/%s", phase.prefix, ps.name, mode.label)
				numOps := opsForSize(ps.size, mode.concurrency)
				t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) {
					payload := makePayload(ps.size)
					runThroughputTest(t, impl, name, httpClient, adminURL, volumeID,
						payload, numOps, mode.concurrency, phase.isDownload, false)
				})
			}
		}
	}

	// Mixed read/write at each size
	for _, ps := range payloadSteps {
		name := fmt.Sprintf("Mixed/%s/c16", ps.name)
		numOps := opsForSize(ps.size, 16)
		t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) {
			payload := makePayload(ps.size)
			runThroughputTest(t, impl, name, httpClient, adminURL, volumeID,
				payload, numOps, 16, false, true)
		})
	}

	// Delete test
	t.Run(fmt.Sprintf("%s/Delete/1KB/c16", impl), func(t *testing.T) {
		const (
			numOps      = 1000
			concurrency = 16
			// baseKey is a fixed range well above what keyCounter hands out
			// in the subtests above.
			baseKey = uint64(900000)
		)
		payload := makePayload(1 << 10)

		for i := 0; i < numOps; i++ {
			if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 1, payload); err != nil {
				t.Fatalf("pre-upload for delete %d: %v", i, err)
			}
		}

		var ops atomic.Int64
		var errCount atomic.Int64
		var totalLatency atomic.Int64

		start := time.Now()
		var wg sync.WaitGroup

		// Hand each worker a contiguous key range. The first numOps%concurrency
		// workers take one extra key so that every pre-uploaded file is deleted.
		next := baseKey
		for w := 0; w < concurrency; w++ {
			count := numOps / concurrency
			if w < numOps%concurrency {
				count++
			}
			wg.Add(1)
			go func(first uint64, count int) {
				defer wg.Done()
				for i := 0; i < count; i++ {
					opStart := time.Now()
					err := deleteFile(httpClient, adminURL, volumeID, first+uint64(i), 1)
					totalLatency.Add(time.Since(opStart).Nanoseconds())
					ops.Add(1)
					if err != nil {
						errCount.Add(1)
					}
				}
			}(next, count)
			next += uint64(count)
		}
		wg.Wait()
		elapsed := time.Since(start)

		totalOps := ops.Load()
		avgLatencyUs := float64(totalLatency.Load()) / float64(totalOps) / 1000.0
		opsPerSec := float64(totalOps) / elapsed.Seconds()

		t.Logf("RESULT impl=%-4s test=%-22s ops=%-6d errors=%-4d elapsed=%-10s ops/s=%-10.1f avg_lat=%-10.0fus",
			impl, "Delete/1KB/c16", totalOps, errCount.Load(), elapsed.Round(time.Millisecond), opsPerSec, avgLatencyUs)
	})
}

// keyCounter provides globally unique key ranges so that tests writing to the
// same volume don't collide. runThroughputTest and the other load tests in this
// file reserve n keys with keyCounter.Add(n) and subtract n from the result to
// get the first key of their range. Starts at 1 because key=0 is invalid.
var keyCounter atomic.Uint64

// init seeds keyCounter with 1, the first key handed out.
func init() {
	keyCounter.Store(1)
}

// runThroughputTest is the shared core for throughput tests. It performs
// numOps HTTP operations against volumeID, spread over concurrency worker
// goroutines, and logs one "RESULT ..." line labelled with impl and name that
// reports the op count, error count, elapsed time, ops/s, average latency and
// MB/s (computed from len(payload)).
//
// The operation is selected by the two flags:
//   - isMixed: each worker alternates upload (even iterations) and download
//     (odd iterations) over pre-uploaded keys; isMixed wins if both are set;
//   - isDownload: every operation downloads a pre-uploaded key;
//   - neither: every operation uploads payload to a fresh key.
//
// Each call reserves 2*numOps keys from keyCounter. Failed operations are
// counted and reported but do not fail the test; a failed pre-upload, or a
// numOps or concurrency below 1, aborts it with t.Fatalf.
func runThroughputTest(
	t *testing.T, impl, name string,
	httpClient *http.Client, adminURL string, volumeID uint32,
	payload []byte, numOps, concurrency int,
	isDownload, isMixed bool,
) {
	t.Helper()

	// Guard against a divide-by-zero below (concurrency) and NaN statistics (numOps).
	if numOps < 1 || concurrency < 1 {
		t.Fatalf("%s: numOps (%d) and concurrency (%d) must both be at least 1", name, numOps, concurrency)
	}

	// Each call gets a unique key range: [baseKey, baseKey+2*numOps).
	reserved := uint64(numOps) * 2
	baseKey := keyCounter.Add(reserved) - reserved

	// Downloads and mixed runs operate on the first half of the range, which
	// is pre-populated here. Pure uploads write to the untouched second half.
	firstKey := baseKey + uint64(numOps)
	if isDownload || isMixed {
		firstKey = baseKey
		for i := 0; i < numOps; i++ {
			if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 1, payload); err != nil {
				t.Fatalf("pre-upload %d: %v", i, err)
			}
		}
	}

	var ops atomic.Int64
	var errCount atomic.Int64
	var totalLatency atomic.Int64

	start := time.Now()

	var wg sync.WaitGroup
	opsPerWorker := numOps / concurrency
	remainder := numOps % concurrency

	// Workers get consecutive, non-overlapping key ranges; the first
	// `remainder` workers take one extra operation each.
	next := firstKey
	for w := 0; w < concurrency; w++ {
		count := opsPerWorker
		if w < remainder {
			count++
		}

		wg.Add(1)
		go func(first uint64, count int) {
			defer wg.Done()
			for i := 0; i < count; i++ {
				key := first + uint64(i)
				opStart := time.Now()
				var err error

				switch {
				case isMixed && i%2 == 0:
					err = uploadFile(httpClient, adminURL, volumeID, key, 1, payload)
				case isMixed, isDownload:
					err = downloadFile(httpClient, adminURL, volumeID, key, 1)
				default:
					err = uploadFile(httpClient, adminURL, volumeID, key, 1, payload)
				}

				totalLatency.Add(time.Since(opStart).Nanoseconds())
				ops.Add(1)
				if err != nil {
					errCount.Add(1)
				}
			}
		}(next, count)
		next += uint64(count)
	}

	wg.Wait()
	elapsed := time.Since(start)

	totalOps := ops.Load()
	totalErrs := errCount.Load()
	avgLatencyUs := float64(totalLatency.Load()) / float64(totalOps) / 1000.0
	opsPerSec := float64(totalOps) / elapsed.Seconds()
	throughputMBs := opsPerSec * float64(len(payload)) / (1024 * 1024)

	t.Logf("RESULT impl=%-4s test=%-22s ops=%-6d errors=%-4d elapsed=%-10s ops/s=%-10.1f avg_lat=%-10.0fus throughput=%.2f MB/s",
		impl, name, totalOps, totalErrs, elapsed.Round(time.Millisecond), opsPerSec, avgLatencyUs, throughputMBs)
}

// TestLatencyPercentiles measures p50/p95/p99 latencies for upload and download at each size.
//
// For every entry in payloadSteps it runs one sequential upload subtest and
// one sequential download subtest (the latter over files uploaded just before
// measuring) and logs min/p50/p95/p99/max in a "RESULT ..." line. If any
// measured operation fails, the subtest is marked failed after the RESULT line
// is logged, because the reported latencies then include failed requests.
// Skipped in -short mode.
func TestLatencyPercentiles(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping load test in short mode")
	}

	impl := implName()
	adminURL, grpcAddr, cleanup := setupCluster(t)
	defer cleanup()

	const volumeID = uint32(20)
	allocateVolume(t, grpcAddr, volumeID)

	httpClient := &http.Client{
		Timeout: 30 * time.Second,
		Transport: &http.Transport{
			MaxIdleConnsPerHost: 64,
			MaxConnsPerHost:     64,
		},
	}

	// latOpsForSize returns fewer samples for larger payloads.
	latOpsForSize := func(size int) int {
		switch {
		case size >= 4<<20:
			return 30
		case size >= 1<<20:
			return 100
		default:
			return 300
		}
	}

	for _, ps := range payloadSteps {
		for _, dl := range []struct {
			prefix     string
			isDownload bool
		}{
			{"Upload", false},
			{"Download", true},
		} {
			name := fmt.Sprintf("%s/%s", dl.prefix, ps.name)
			numOps := latOpsForSize(ps.size)

			t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) {
				payload := makePayload(ps.size)

				// Reserve [baseKey, baseKey+2*numOps). Add returns the value
				// after the increment, so subtract to get the range start.
				reserved := uint64(numOps) * 2
				baseKey := keyCounter.Add(reserved) - reserved

				// Downloads read the pre-populated first half of the range;
				// uploads write to the untouched second half.
				firstKey := baseKey + uint64(numOps)
				if dl.isDownload {
					firstKey = baseKey
					for i := 0; i < numOps; i++ {
						if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 2, payload); err != nil {
							t.Fatalf("pre-upload: %v", err)
						}
					}
				}

				latencies := make([]time.Duration, numOps)
				failed := 0
				var firstErr error
				for i := range latencies {
					key := firstKey + uint64(i)
					var err error
					start := time.Now()
					if dl.isDownload {
						err = downloadFile(httpClient, adminURL, volumeID, key, 2)
					} else {
						err = uploadFile(httpClient, adminURL, volumeID, key, 2, payload)
					}
					latencies[i] = time.Since(start)
					if err != nil {
						failed++
						if firstErr == nil {
							firstErr = err
						}
					}
				}

				sortDurations(latencies)

				n := len(latencies)
				p50 := latencies[n*50/100]
				p95 := latencies[n*95/100]
				p99 := latencies[n*99/100]
				fastest := latencies[0]
				slowest := latencies[n-1]

				t.Logf("RESULT impl=%-4s test=%-20s n=%-4d min=%-10s p50=%-10s p95=%-10s p99=%-10s max=%-10s",
					impl, name, numOps, fastest.Round(time.Microsecond), p50.Round(time.Microsecond), p95.Round(time.Microsecond), p99.Round(time.Microsecond), slowest.Round(time.Microsecond))

				if failed > 0 {
					t.Errorf("%s: %d of %d operations failed (first error: %v); the latencies above include failed requests",
						name, failed, numOps, firstErr)
				}
			})
		}
	}
}

// sortDurations sorts d in place in ascending order.
func sortDurations(d []time.Duration) {
	ascending := func(a, b int) bool {
		return d[a] < d[b]
	}
	sort.Slice(d, ascending)
}

// TestSustainedP99 runs high-concurrency load for a sustained period (default 60s,
// override with LOADTEST_DURATION=120s) and reports p50/p90/p95/p99/p999 latencies.
// This reveals tail latency differences that short tests miss (GC pauses, lock contention, etc).
//
// Each scenario gets its own volume and a pool of 500 pre-uploaded files that the
// workers cycle through, so uploads are overwrites and downloads always hit
// existing files. The pool is populated before the clock starts, so the
// reported duration and ops/s cover only the measured load. An unparsable or
// non-positive LOADTEST_DURATION is logged and the default is used.
//
// Run:
//   go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/...
//   VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/...
//   LOADTEST_DURATION=120s VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/...
func TestSustainedP99(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping sustained load test in short mode")
	}

	duration := 60 * time.Second
	if d := os.Getenv("LOADTEST_DURATION"); d != "" {
		parsed, err := time.ParseDuration(d)
		switch {
		case err != nil:
			t.Logf("ignoring LOADTEST_DURATION=%q: %v; using %s", d, err, duration)
		case parsed <= 0:
			t.Logf("ignoring non-positive LOADTEST_DURATION=%q; using %s", d, duration)
		default:
			duration = parsed
		}
	}

	impl := implName()
	adminURL, grpcAddr, cleanup := setupCluster(t)
	defer cleanup()

	httpClient := &http.Client{
		Timeout: 30 * time.Second,
		Transport: &http.Transport{
			MaxIdleConnsPerHost: 128,
			MaxConnsPerHost:     128,
		},
	}

	type scenario struct {
		name        string
		size        int
		concurrency int
		isDownload  bool
	}

	scenarios := []scenario{
		{"Upload/1KB/c16", 1 << 10, 16, false},
		{"Upload/64KB/c16", 64 << 10, 16, false},
		{"Download/1KB/c16", 1 << 10, 16, true},
		{"Download/64KB/c16", 64 << 10, 16, true},
	}

	var nextVolID atomic.Uint32
	nextVolID.Store(30)

	for _, sc := range scenarios {
		t.Run(fmt.Sprintf("%s/%s", impl, sc.name), func(t *testing.T) {
			// Each scenario gets its own volume to avoid filling up
			volumeID := nextVolID.Add(1) - 1
			allocateVolume(t, grpcAddr, volumeID)

			payload := makePayload(sc.size)

			const (
				poolSize       = 500
				flushThreshold = 8192
			)
			baseKey := keyCounter.Add(uint64(poolSize*2)) - uint64(poolSize*2)

			// Populate the pool before the clock starts. Downloads need the
			// files to exist; uploads then become overwrites of existing keys
			// (no volume fill).
			if sc.isDownload {
				t.Logf("Pre-uploading %d files for download test...", poolSize)
			} else {
				t.Logf("Pre-seeding %d files for upload overwrite test...", poolSize)
			}
			for i := 0; i < poolSize; i++ {
				if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 3, payload); err != nil {
					if sc.isDownload {
						t.Fatalf("pre-upload %d: %v", i, err)
					}
					t.Fatalf("pre-seed %d: %v", i, err)
				}
			}

			// Latencies from all workers end up in lats, guarded by mu.
			var (
				mu   sync.Mutex
				lats = make([]time.Duration, 0, 100000)
			)
			flush := func(local []time.Duration) {
				mu.Lock()
				lats = append(lats, local...)
				mu.Unlock()
			}

			var totalOps atomic.Int64
			var totalErrors atomic.Int64

			start := time.Now()
			deadline := start.Add(duration)

			var wg sync.WaitGroup
			for w := 0; w < sc.concurrency; w++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					// Buffer locally and flush in batches to keep the shared
					// mutex off the hot path.
					localLats := make([]time.Duration, 0, flushThreshold)

					for i := uint64(0); time.Now().Before(deadline); i++ {
						// Cycle through the pool to avoid filling up the volume
						key := baseKey + i%uint64(poolSize)

						opStart := time.Now()
						var err error
						if sc.isDownload {
							err = downloadFile(httpClient, adminURL, volumeID, key, 3)
						} else {
							err = uploadFile(httpClient, adminURL, volumeID, key, 3, payload)
						}
						localLats = append(localLats, time.Since(opStart))

						totalOps.Add(1)
						if err != nil {
							totalErrors.Add(1)
						}

						if len(localLats) >= flushThreshold {
							flush(localLats)
							localLats = localLats[:0]
						}
					}
					// Final flush
					if len(localLats) > 0 {
						flush(localLats)
					}
				}()
			}

			wg.Wait()
			elapsed := time.Since(start)

			n := len(lats)
			if n == 0 {
				// Without samples the percentile lookups below would index out of range.
				t.Fatalf("%s: no operations completed within %s", sc.name, duration)
			}
			ops := totalOps.Load()
			errs := totalErrors.Load()
			opsPerSec := float64(ops) / elapsed.Seconds()

			sortDurations(lats)

			// pct returns the latency at percentile p (0-100), clamped to the last sample.
			pct := func(p float64) time.Duration {
				idx := int(float64(n) * p / 100.0)
				if idx >= n {
					idx = n - 1
				}
				return lats[idx]
			}

			t.Logf("RESULT impl=%-4s test=%-22s duration=%-6s ops=%-8d errors=%-4d ops/s=%-10.1f",
				impl, sc.name, elapsed.Round(time.Second), ops, errs, opsPerSec)
			t.Logf("       p50=%-10s p90=%-10s p95=%-10s p99=%-10s p999=%-10s max=%-10s",
				pct(50).Round(time.Microsecond),
				pct(90).Round(time.Microsecond),
				pct(95).Round(time.Microsecond),
				pct(99).Round(time.Microsecond),
				pct(99.9).Round(time.Microsecond),
				lats[n-1].Round(time.Microsecond))
		})
	}
}