Browse Source
production sprint 4: EC batch delete, JPEG orientation, write queue, S3 tier move
production sprint 4: EC batch delete, JPEG orientation, write queue, S3 tier move
- BatchDelete now supports EC volumes: looks up needle in .ecx index, journals deletion to .ecj file (local-only, Go handles distributed part) - JPEG EXIF orientation auto-fix on upload using kamadak-exif + image crate, matching Go's FixJpgOrientation behavior (8 orientation transforms) - Async batched write processing via mpsc queue (up to 128 entries per batch), groups writes by volume ID and syncs once per volume per batch - VolumeTierMoveDatToRemote: multipart upload .dat file to S3 with progress streaming, updates .vif with remote file reference - VolumeTierMoveDatFromRemote: downloads .dat from S3 with progress streaming, removes remote file reference from .vif - S3TierRegistry for managing named remote storage backends - VolumeInfo (.vif) JSON persistence matching Go's protojson format - 124 lib + 7 integration = 131 Rust tests pass - All 109 Go integration tests pass (53 HTTP + 56 gRPC)

Branch: rust-volume-server
16 changed files with 1545 additions and 89 deletions
-
16seaweed-volume/Cargo.lock
-
1seaweed-volume/Cargo.toml
-
31seaweed-volume/DEV_PLAN.md
-
3seaweed-volume/src/config.rs
-
275seaweed-volume/src/images.rs
-
1seaweed-volume/src/lib.rs
-
12seaweed-volume/src/main.rs
-
1seaweed-volume/src/remote_storage/mod.rs
-
327seaweed-volume/src/remote_storage/s3_tier.rs
-
392seaweed-volume/src/server/grpc_server.rs
-
46seaweed-volume/src/server/handlers.rs
-
1seaweed-volume/src/server/mod.rs
-
5seaweed-volume/src/server/volume_server.rs
-
315seaweed-volume/src/server/write_queue.rs
-
204seaweed-volume/src/storage/volume.rs
-
4seaweed-volume/tests/http_integration.rs
@ -0,0 +1,275 @@ |
|||
//! JPEG EXIF orientation auto-fix, matching Go's `FixJpgOrientation`.
|
|||
//!
|
|||
//! Reads the EXIF orientation tag from JPEG data and rotates/flips the image
|
|||
//! to normalize it to orientation 1 (top-left). If EXIF parsing fails or
|
|||
//! orientation is already normal, returns the original data unchanged.
|
|||
|
|||
use std::io::Cursor;
|
|||
|
|||
use image::{DynamicImage, GenericImageView, ImageFormat, RgbaImage};
|
|||
|
|||
/// EXIF orientation tag values.
/// See: <http://sylvana.net/jpegcrop/exif_orientation.html>
///
/// The per-constant notes describe the correction applied by
/// `fix_jpg_orientation` (its angle/flip table), not the camera pose.
const TOP_LEFT_SIDE: u32 = 1; // already normal — no correction
const TOP_RIGHT_SIDE: u32 = 2; // corrected by horizontal flip
const BOTTOM_RIGHT_SIDE: u32 = 3; // corrected by 180° rotation
const BOTTOM_LEFT_SIDE: u32 = 4; // corrected by 180° rotation + horizontal flip
const LEFT_SIDE_TOP: u32 = 5; // corrected by -90° rotation + horizontal flip
const RIGHT_SIDE_TOP: u32 = 6; // corrected by -90° rotation
const RIGHT_SIDE_BOTTOM: u32 = 7; // corrected by 90° rotation + horizontal flip
const LEFT_SIDE_BOTTOM: u32 = 8; // corrected by 90° rotation
|
|||
|
|||
/// Fix JPEG orientation based on EXIF data.
|
|||
///
|
|||
/// Reads the EXIF orientation tag and applies the appropriate rotation/flip
|
|||
/// to normalize the image to orientation 1 (top-left). Re-encodes as JPEG.
|
|||
///
|
|||
/// Returns the original data unchanged if:
|
|||
/// - EXIF data cannot be parsed
|
|||
/// - No orientation tag is present
|
|||
/// - Orientation is already 1 (normal)
|
|||
/// - Image decoding or re-encoding fails
|
|||
pub fn fix_jpg_orientation(data: &[u8]) -> Vec<u8> {
|
|||
// Parse EXIF data
|
|||
let orientation = match read_exif_orientation(data) {
|
|||
Some(o) => o,
|
|||
None => return data.to_vec(),
|
|||
};
|
|||
|
|||
// Orientation 1 means normal — no transformation needed
|
|||
if orientation == TOP_LEFT_SIDE {
|
|||
return data.to_vec();
|
|||
}
|
|||
|
|||
// Determine rotation angle and flip mode
|
|||
let (angle, flip_horizontal) = match orientation {
|
|||
TOP_RIGHT_SIDE => (0, true),
|
|||
BOTTOM_RIGHT_SIDE => (180, false),
|
|||
BOTTOM_LEFT_SIDE => (180, true),
|
|||
LEFT_SIDE_TOP => (-90, true),
|
|||
RIGHT_SIDE_TOP => (-90, false),
|
|||
RIGHT_SIDE_BOTTOM => (90, true),
|
|||
LEFT_SIDE_BOTTOM => (90, false),
|
|||
_ => return data.to_vec(),
|
|||
};
|
|||
|
|||
// Decode the image
|
|||
let src_image = match image::load_from_memory_with_format(data, ImageFormat::Jpeg) {
|
|||
Ok(img) => img,
|
|||
Err(_) => return data.to_vec(),
|
|||
};
|
|||
|
|||
// Apply rotation then flip (matching Go's flip(rotate(img, angle), flipMode))
|
|||
let transformed = flip_horizontal_if(rotate(src_image, angle), flip_horizontal);
|
|||
|
|||
// Re-encode as JPEG
|
|||
let mut buf = Cursor::new(Vec::new());
|
|||
match transformed.write_to(&mut buf, ImageFormat::Jpeg) {
|
|||
Ok(_) => buf.into_inner(),
|
|||
Err(_) => data.to_vec(),
|
|||
}
|
|||
}
|
|||
|
|||
/// Read the EXIF orientation tag from JPEG data.
|
|||
/// Returns None if EXIF cannot be parsed or orientation tag is not present.
|
|||
fn read_exif_orientation(data: &[u8]) -> Option<u32> {
|
|||
let exif_reader = exif::Reader::new();
|
|||
let mut cursor = Cursor::new(data);
|
|||
let exif_data = exif_reader.read_from_container(&mut cursor).ok()?;
|
|||
|
|||
let orientation_field = exif_data.get_field(exif::Tag::Orientation, exif::In::PRIMARY)?;
|
|||
match orientation_field.value {
|
|||
exif::Value::Short(ref v) if !v.is_empty() => Some(v[0] as u32),
|
|||
_ => orientation_field.value.get_uint(0),
|
|||
}
|
|||
}
|
|||
|
|||
/// Rotate an image by the given angle (counter-clockwise, in degrees).
|
|||
/// Matches Go's rotate function.
|
|||
fn rotate(img: DynamicImage, angle: i32) -> DynamicImage {
|
|||
let (width, height) = img.dimensions();
|
|||
|
|||
match angle {
|
|||
90 => {
|
|||
// 90 degrees counter-clockwise
|
|||
let new_w = height;
|
|||
let new_h = width;
|
|||
let mut out = RgbaImage::new(new_w, new_h);
|
|||
for y in 0..new_h {
|
|||
for x in 0..new_w {
|
|||
out.put_pixel(x, y, img.get_pixel(new_h - 1 - y, x));
|
|||
}
|
|||
}
|
|||
DynamicImage::ImageRgba8(out)
|
|||
}
|
|||
-90 => {
|
|||
// 90 degrees clockwise (or 270 counter-clockwise)
|
|||
let new_w = height;
|
|||
let new_h = width;
|
|||
let mut out = RgbaImage::new(new_w, new_h);
|
|||
for y in 0..new_h {
|
|||
for x in 0..new_w {
|
|||
out.put_pixel(x, y, img.get_pixel(y, new_w - 1 - x));
|
|||
}
|
|||
}
|
|||
DynamicImage::ImageRgba8(out)
|
|||
}
|
|||
180 | -180 => {
|
|||
let mut out = RgbaImage::new(width, height);
|
|||
for y in 0..height {
|
|||
for x in 0..width {
|
|||
out.put_pixel(x, y, img.get_pixel(width - 1 - x, height - 1 - y));
|
|||
}
|
|||
}
|
|||
DynamicImage::ImageRgba8(out)
|
|||
}
|
|||
_ => img,
|
|||
}
|
|||
}
|
|||
|
|||
/// Flip the image horizontally if requested.
|
|||
/// In Go, flipMode 2 == FlipHorizontal. We simplify since only horizontal flip is used.
|
|||
fn flip_horizontal_if(img: DynamicImage, do_flip: bool) -> DynamicImage {
|
|||
if !do_flip {
|
|||
return img;
|
|||
}
|
|||
let (width, height) = img.dimensions();
|
|||
let mut out = RgbaImage::new(width, height);
|
|||
for y in 0..height {
|
|||
for x in 0..width {
|
|||
out.put_pixel(x, y, img.get_pixel(width - 1 - x, y));
|
|||
}
|
|||
}
|
|||
DynamicImage::ImageRgba8(out)
|
|||
}
|
|||
|
|||
/// Returns true if the given MIME type or file path extension indicates a JPEG file.
///
/// The MIME type must be exactly "image/jpeg"; otherwise the path is
/// lower-cased and checked for a ".jpg" or ".jpeg" suffix.
pub fn is_jpeg(mime_type: &str, path: &str) -> bool {
    if mime_type == "image/jpeg" {
        return true;
    }
    let lowered = path.to_lowercase();
    [".jpg", ".jpeg"].iter().any(|ext| lowered.ends_with(ext))
}
|
|||
|
|||
#[cfg(test)]
mod tests {
    use super::*;

    // Garbage input: EXIF parsing fails, so the bytes pass through untouched.
    #[test]
    fn test_non_jpeg_data_returned_unchanged() {
        let data = b"not a jpeg file at all";
        let result = fix_jpg_orientation(data);
        assert_eq!(result, data);
    }

    #[test]
    fn test_jpeg_without_exif_returned_unchanged() {
        // Create a minimal JPEG without EXIF data
        let img = DynamicImage::ImageRgba8(RgbaImage::new(2, 2));
        let mut buf = Cursor::new(Vec::new());
        img.write_to(&mut buf, ImageFormat::Jpeg).unwrap();
        let jpeg_data = buf.into_inner();

        let result = fix_jpg_orientation(&jpeg_data);
        // Should return data unchanged (no EXIF orientation tag)
        // Just verify it's still valid JPEG
        assert!(!result.is_empty());
        assert_eq!(&result[0..2], &[0xFF, 0xD8]); // JPEG magic bytes
    }

    #[test]
    fn test_is_jpeg() {
        // MIME match alone suffices; otherwise extension check is case-insensitive.
        assert!(is_jpeg("image/jpeg", ""));
        assert!(is_jpeg("", "/3,abc.jpg"));
        assert!(is_jpeg("", "/3,abc.JPEG"));
        assert!(is_jpeg("application/octet-stream", "/3,abc.JPG"));
        assert!(!is_jpeg("image/png", "/3,abc.png"));
        assert!(!is_jpeg("", "/3,abc.png"));
    }

    #[test]
    fn test_rotate_180() {
        // Create a 2x2 image with distinct pixel colors
        let mut img = RgbaImage::new(2, 2);
        img.put_pixel(0, 0, image::Rgba([255, 0, 0, 255])); // red top-left
        img.put_pixel(1, 0, image::Rgba([0, 255, 0, 255])); // green top-right
        img.put_pixel(0, 1, image::Rgba([0, 0, 255, 255])); // blue bottom-left
        img.put_pixel(1, 1, image::Rgba([255, 255, 0, 255])); // yellow bottom-right
        let dynamic = DynamicImage::ImageRgba8(img);

        let rotated = rotate(dynamic, 180);
        let (w, h) = rotated.dimensions();
        assert_eq!((w, h), (2, 2));
        // After 180 rotation: top-left should be yellow, top-right should be blue
        assert_eq!(rotated.get_pixel(0, 0), image::Rgba([255, 255, 0, 255]));
        assert_eq!(rotated.get_pixel(1, 0), image::Rgba([0, 0, 255, 255]));
        assert_eq!(rotated.get_pixel(0, 1), image::Rgba([0, 255, 0, 255]));
        assert_eq!(rotated.get_pixel(1, 1), image::Rgba([255, 0, 0, 255]));
    }

    #[test]
    fn test_rotate_90_ccw() {
        // Create 3x2 image (width=3, height=2); red channel encodes position 1..=6.
        let mut img = RgbaImage::new(3, 2);
        img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255]));
        img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255]));
        img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255]));
        img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255]));
        img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        let rotated = rotate(dynamic, 90);
        let (w, h) = rotated.dimensions();
        // 90 CCW: width=3,height=2 -> new_w=2, new_h=3
        assert_eq!((w, h), (2, 3));
        // Top-right (2,0) should move to top-left (0,0) in CCW 90
        assert_eq!(rotated.get_pixel(0, 0)[0], 3);
        assert_eq!(rotated.get_pixel(1, 0)[0], 6);
    }

    #[test]
    fn test_rotate_neg90_cw() {
        // Create 3x2 image; red channel encodes position 1..=6.
        let mut img = RgbaImage::new(3, 2);
        img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255]));
        img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255]));
        img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255]));
        img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255]));
        img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        let rotated = rotate(dynamic, -90);
        let (w, h) = rotated.dimensions();
        assert_eq!((w, h), (2, 3));
        // -90 (CW 90): top-left (0,0) should go to top-right
        assert_eq!(rotated.get_pixel(0, 0)[0], 4);
        assert_eq!(rotated.get_pixel(1, 0)[0], 1);
    }

    #[test]
    fn test_flip_horizontal() {
        let mut img = RgbaImage::new(2, 1);
        img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        // Columns swap under a horizontal mirror.
        let flipped = flip_horizontal_if(dynamic, true);
        assert_eq!(flipped.get_pixel(0, 0)[0], 20);
        assert_eq!(flipped.get_pixel(1, 0)[0], 10);
    }

    #[test]
    fn test_flip_horizontal_noop() {
        let mut img = RgbaImage::new(2, 1);
        img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        // do_flip == false must leave the pixels where they were.
        let not_flipped = flip_horizontal_if(dynamic, false);
        assert_eq!(not_flipped.get_pixel(0, 0)[0], 10);
        assert_eq!(not_flipped.get_pixel(1, 0)[0], 20);
    }
}
|
|||
@ -0,0 +1,327 @@ |
|||
//! S3-compatible tiered storage backend for volume .dat file upload/download.
|
|||
//!
|
|||
//! Provides multipart upload and concurrent download with progress callbacks,
|
|||
//! matching the Go SeaweedFS S3 backend behavior.
|
|||
|
|||
use std::collections::HashMap;
|
|||
use std::sync::Arc;
|
|||
|
|||
use aws_sdk_s3::Client;
|
|||
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
|
|||
use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart};
|
|||
use tokio::io::AsyncReadExt;
|
|||
|
|||
/// Configuration for an S3 tier backend.
#[derive(Debug, Clone)]
pub struct S3TierConfig {
    /// Static access key used to build the credentials provider.
    pub access_key: String,
    /// Static secret key used to build the credentials provider.
    pub secret_key: String,
    /// AWS region; an empty string falls back to "us-east-1".
    pub region: String,
    /// Bucket that receives uploaded .dat files.
    pub bucket: String,
    /// Custom endpoint URL for S3-compatible services; empty keeps the
    /// SDK's default AWS endpoint.
    pub endpoint: String,
    /// Storage class applied to uploads; an empty string falls back to
    /// "STANDARD_IA".
    pub storage_class: String,
    /// Use path-style addressing (bucket in the URL path) instead of
    /// virtual-hosted-style.
    pub force_path_style: bool,
}
|
|||
|
|||
/// S3 tier backend for uploading/downloading volume .dat files.
pub struct S3TierBackend {
    /// Configured S3 client used for all requests.
    client: Client,
    /// Target bucket for every operation on this backend.
    pub bucket: String,
    /// Storage class applied when creating multipart uploads.
    pub storage_class: String,
}
|
|||
|
|||
impl S3TierBackend {
|
|||
/// Create a new S3 tier backend from configuration.
|
|||
pub fn new(config: &S3TierConfig) -> Self {
|
|||
let region = if config.region.is_empty() {
|
|||
"us-east-1"
|
|||
} else {
|
|||
&config.region
|
|||
};
|
|||
|
|||
let credentials = Credentials::new(
|
|||
&config.access_key,
|
|||
&config.secret_key,
|
|||
None,
|
|||
None,
|
|||
"seaweedfs-volume-tier",
|
|||
);
|
|||
|
|||
let mut s3_config = aws_sdk_s3::Config::builder()
|
|||
.behavior_version(BehaviorVersion::latest())
|
|||
.region(Region::new(region.to_string()))
|
|||
.credentials_provider(credentials)
|
|||
.force_path_style(config.force_path_style);
|
|||
|
|||
if !config.endpoint.is_empty() {
|
|||
s3_config = s3_config.endpoint_url(&config.endpoint);
|
|||
}
|
|||
|
|||
let client = Client::from_conf(s3_config.build());
|
|||
|
|||
S3TierBackend {
|
|||
client,
|
|||
bucket: config.bucket.clone(),
|
|||
storage_class: if config.storage_class.is_empty() {
|
|||
"STANDARD_IA".to_string()
|
|||
} else {
|
|||
config.storage_class.clone()
|
|||
},
|
|||
}
|
|||
}
|
|||
|
|||
/// Upload a local file to S3 using multipart upload with progress reporting.
|
|||
///
|
|||
/// Returns (s3_key, file_size) on success.
|
|||
/// The progress callback receives (bytes_uploaded, percentage).
|
|||
pub async fn upload_file<F>(
|
|||
&self,
|
|||
file_path: &str,
|
|||
mut progress_fn: F,
|
|||
) -> Result<(String, u64), String>
|
|||
where
|
|||
F: FnMut(i64, f32) + Send + Sync + 'static,
|
|||
{
|
|||
let key = uuid::Uuid::new_v4().to_string();
|
|||
|
|||
let metadata = tokio::fs::metadata(file_path)
|
|||
.await
|
|||
.map_err(|e| format!("failed to stat file {}: {}", file_path, e))?;
|
|||
let file_size = metadata.len();
|
|||
|
|||
// Calculate part size: start at 64MB, scale up for very large files
|
|||
let mut part_size: u64 = 64 * 1024 * 1024;
|
|||
while part_size * 1000 < file_size {
|
|||
part_size *= 4;
|
|||
}
|
|||
|
|||
// Initiate multipart upload
|
|||
let create_resp = self
|
|||
.client
|
|||
.create_multipart_upload()
|
|||
.bucket(&self.bucket)
|
|||
.key(&key)
|
|||
.storage_class(
|
|||
self.storage_class
|
|||
.parse()
|
|||
.unwrap_or(aws_sdk_s3::types::StorageClass::StandardIa),
|
|||
)
|
|||
.send()
|
|||
.await
|
|||
.map_err(|e| format!("failed to create multipart upload: {}", e))?;
|
|||
|
|||
let upload_id = create_resp
|
|||
.upload_id()
|
|||
.ok_or_else(|| "no upload_id in multipart upload response".to_string())?
|
|||
.to_string();
|
|||
|
|||
let mut file = tokio::fs::File::open(file_path)
|
|||
.await
|
|||
.map_err(|e| format!("failed to open file {}: {}", file_path, e))?;
|
|||
|
|||
let mut completed_parts = Vec::new();
|
|||
let mut offset: u64 = 0;
|
|||
let mut part_number: i32 = 1;
|
|||
|
|||
loop {
|
|||
let remaining = file_size - offset;
|
|||
if remaining == 0 {
|
|||
break;
|
|||
}
|
|||
let this_part_size = std::cmp::min(part_size, remaining) as usize;
|
|||
let mut buf = vec![0u8; this_part_size];
|
|||
file.read_exact(&mut buf)
|
|||
.await
|
|||
.map_err(|e| format!("failed to read file at offset {}: {}", offset, e))?;
|
|||
|
|||
let upload_part_resp = self
|
|||
.client
|
|||
.upload_part()
|
|||
.bucket(&self.bucket)
|
|||
.key(&key)
|
|||
.upload_id(&upload_id)
|
|||
.part_number(part_number)
|
|||
.body(buf.into())
|
|||
.send()
|
|||
.await
|
|||
.map_err(|e| {
|
|||
format!(
|
|||
"failed to upload part {} at offset {}: {}",
|
|||
part_number, offset, e
|
|||
)
|
|||
})?;
|
|||
|
|||
let e_tag = upload_part_resp.e_tag().unwrap_or_default().to_string();
|
|||
completed_parts.push(
|
|||
CompletedPart::builder()
|
|||
.e_tag(e_tag)
|
|||
.part_number(part_number)
|
|||
.build(),
|
|||
);
|
|||
|
|||
offset += this_part_size as u64;
|
|||
|
|||
// Report progress
|
|||
let pct = if file_size > 0 {
|
|||
(offset as f32 * 100.0) / file_size as f32
|
|||
} else {
|
|||
100.0
|
|||
};
|
|||
progress_fn(offset as i64, pct);
|
|||
|
|||
part_number += 1;
|
|||
}
|
|||
|
|||
// Complete multipart upload
|
|||
let completed_upload = CompletedMultipartUpload::builder()
|
|||
.set_parts(Some(completed_parts))
|
|||
.build();
|
|||
|
|||
self.client
|
|||
.complete_multipart_upload()
|
|||
.bucket(&self.bucket)
|
|||
.key(&key)
|
|||
.upload_id(&upload_id)
|
|||
.multipart_upload(completed_upload)
|
|||
.send()
|
|||
.await
|
|||
.map_err(|e| format!("failed to complete multipart upload: {}", e))?;
|
|||
|
|||
Ok((key, file_size))
|
|||
}
|
|||
|
|||
/// Download a file from S3 to a local path with progress reporting.
|
|||
///
|
|||
/// Returns the file size on success.
|
|||
pub async fn download_file<F>(
|
|||
&self,
|
|||
dest_path: &str,
|
|||
key: &str,
|
|||
mut progress_fn: F,
|
|||
) -> Result<u64, String>
|
|||
where
|
|||
F: FnMut(i64, f32) + Send + Sync + 'static,
|
|||
{
|
|||
// Get file size first
|
|||
let head_resp = self
|
|||
.client
|
|||
.head_object()
|
|||
.bucket(&self.bucket)
|
|||
.key(key)
|
|||
.send()
|
|||
.await
|
|||
.map_err(|e| format!("failed to head object {}: {}", key, e))?;
|
|||
|
|||
let file_size = head_resp.content_length().unwrap_or(0) as u64;
|
|||
|
|||
// Download in chunks
|
|||
let part_size: u64 = 64 * 1024 * 1024;
|
|||
let mut file = tokio::fs::OpenOptions::new()
|
|||
.write(true)
|
|||
.create(true)
|
|||
.truncate(true)
|
|||
.open(dest_path)
|
|||
.await
|
|||
.map_err(|e| format!("failed to open dest file {}: {}", dest_path, e))?;
|
|||
|
|||
let mut offset: u64 = 0;
|
|||
|
|||
loop {
|
|||
if offset >= file_size {
|
|||
break;
|
|||
}
|
|||
let end = std::cmp::min(offset + part_size - 1, file_size - 1);
|
|||
let range = format!("bytes={}-{}", offset, end);
|
|||
|
|||
let get_resp = self
|
|||
.client
|
|||
.get_object()
|
|||
.bucket(&self.bucket)
|
|||
.key(key)
|
|||
.range(&range)
|
|||
.send()
|
|||
.await
|
|||
.map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?;
|
|||
|
|||
let body = get_resp
|
|||
.body
|
|||
.collect()
|
|||
.await
|
|||
.map_err(|e| format!("failed to read body: {}", e))?;
|
|||
let bytes = body.into_bytes();
|
|||
|
|||
use tokio::io::AsyncWriteExt;
|
|||
file.write_all(&bytes)
|
|||
.await
|
|||
.map_err(|e| format!("failed to write to {}: {}", dest_path, e))?;
|
|||
|
|||
offset += bytes.len() as u64;
|
|||
|
|||
let pct = if file_size > 0 {
|
|||
(offset as f32 * 100.0) / file_size as f32
|
|||
} else {
|
|||
100.0
|
|||
};
|
|||
progress_fn(offset as i64, pct);
|
|||
}
|
|||
|
|||
use tokio::io::AsyncWriteExt;
|
|||
file.flush()
|
|||
.await
|
|||
.map_err(|e| format!("failed to flush {}: {}", dest_path, e))?;
|
|||
|
|||
Ok(file_size)
|
|||
}
|
|||
|
|||
/// Delete a file from S3.
|
|||
pub async fn delete_file(&self, key: &str) -> Result<(), String> {
|
|||
self.client
|
|||
.delete_object()
|
|||
.bucket(&self.bucket)
|
|||
.key(key)
|
|||
.send()
|
|||
.await
|
|||
.map_err(|e| format!("failed to delete object {}: {}", key, e))?;
|
|||
Ok(())
|
|||
}
|
|||
}
|
|||
|
|||
/// Parse a backend name like "s3" or "s3.default" into (backend_type, backend_id).
/// Matches Go's `BackendNameToTypeId`.
///
/// A name without a '.' gets the implicit id "default"; a name with more
/// than one '.' is invalid and yields a pair of empty strings.
pub fn backend_name_to_type_id(backend_name: &str) -> (String, String) {
    let mut pieces = backend_name.split('.');
    match (pieces.next(), pieces.next(), pieces.next()) {
        (Some(backend_type), None, _) => (backend_type.to_string(), "default".to_string()),
        (Some(backend_type), Some(backend_id), None) => {
            (backend_type.to_string(), backend_id.to_string())
        }
        _ => (String::new(), String::new()),
    }
}
|
|||
|
|||
/// A registry of configured S3 tier backends, keyed by backend name (e.g., "s3.default").
#[derive(Default)]
pub struct S3TierRegistry {
    // Stored behind Arc so `get` can hand out shared handles cheaply,
    // independent of the registry's own lifetime/locking.
    backends: HashMap<String, Arc<S3TierBackend>>,
}
|
|||
|
|||
impl S3TierRegistry {
|
|||
pub fn new() -> Self {
|
|||
Self {
|
|||
backends: HashMap::new(),
|
|||
}
|
|||
}
|
|||
|
|||
/// Register a backend with the given name.
|
|||
pub fn register(&mut self, name: String, backend: S3TierBackend) {
|
|||
self.backends.insert(name, Arc::new(backend));
|
|||
}
|
|||
|
|||
/// Look up a backend by name.
|
|||
pub fn get(&self, name: &str) -> Option<Arc<S3TierBackend>> {
|
|||
self.backends.get(name).cloned()
|
|||
}
|
|||
|
|||
/// List all registered backend names.
|
|||
pub fn names(&self) -> Vec<String> {
|
|||
self.backends.keys().cloned().collect()
|
|||
}
|
|||
}
|
|||
@ -0,0 +1,315 @@ |
|||
//! Async batched write processing for the volume server.
|
|||
//!
|
|||
//! Instead of each upload handler directly calling `write_needle` and syncing,
|
|||
//! writes are submitted to a queue. A background worker drains the queue in
|
|||
//! batches (up to 128 entries), groups them by volume ID, processes them
|
|||
//! together, and syncs once per volume for the entire batch.
|
|||
|
|||
use std::sync::Arc;
|
|||
|
|||
use tokio::sync::{mpsc, oneshot};
|
|||
use tracing::debug;
|
|||
|
|||
use crate::storage::needle::needle::Needle;
|
|||
use crate::storage::types::{Size, VolumeId};
|
|||
use crate::storage::volume::VolumeError;
|
|||
|
|||
use super::volume_server::VolumeServerState;
|
|||
|
|||
/// Result of a single write operation: (offset, size, is_unchanged).
pub type WriteResult = Result<(u64, Size, bool), VolumeError>;

/// A request to write a needle, submitted to the write queue.
pub struct WriteRequest {
    /// Volume the needle is written to.
    pub volume_id: VolumeId,
    /// The needle to persist.
    pub needle: Needle,
    /// One-shot channel the worker uses to report the write outcome back to
    /// the submitting task.
    pub response_tx: oneshot::Sender<WriteResult>,
}

/// Maximum number of write requests to batch together.
const MAX_BATCH_SIZE: usize = 128;
|
|||
|
|||
/// Handle for submitting write requests to the background worker.
///
/// Cloning is cheap: every clone shares the same underlying channel, so any
/// clone can submit work to the single worker task.
#[derive(Clone)]
pub struct WriteQueue {
    // Sender half of the bounded request channel; the worker owns the receiver.
    tx: mpsc::Sender<WriteRequest>,
}
|
|||
|
|||
impl WriteQueue {
|
|||
/// Create a new write queue and spawn the background worker.
|
|||
///
|
|||
/// `capacity` controls the channel buffer size (backpressure kicks in when full).
|
|||
/// The worker holds a reference to `state` for accessing the store.
|
|||
pub fn new(state: Arc<VolumeServerState>, capacity: usize) -> Self {
|
|||
let (tx, rx) = mpsc::channel(capacity);
|
|||
let worker = WriteQueueWorker {
|
|||
rx,
|
|||
state,
|
|||
};
|
|||
tokio::spawn(worker.run());
|
|||
WriteQueue { tx }
|
|||
}
|
|||
|
|||
/// Submit a write request and wait for the result.
|
|||
///
|
|||
/// Returns `Err` if the worker has shut down or the response channel was dropped.
|
|||
pub async fn submit(
|
|||
&self,
|
|||
volume_id: VolumeId,
|
|||
needle: Needle,
|
|||
) -> WriteResult {
|
|||
let (response_tx, response_rx) = oneshot::channel();
|
|||
let request = WriteRequest {
|
|||
volume_id,
|
|||
needle,
|
|||
response_tx,
|
|||
};
|
|||
|
|||
// Send to queue; this awaits if the channel is full (backpressure).
|
|||
if self.tx.send(request).await.is_err() {
|
|||
return Err(VolumeError::Io(std::io::Error::new(
|
|||
std::io::ErrorKind::BrokenPipe,
|
|||
"write queue worker has shut down",
|
|||
)));
|
|||
}
|
|||
|
|||
// Wait for the worker to process our request.
|
|||
match response_rx.await {
|
|||
Ok(result) => result,
|
|||
Err(_) => Err(VolumeError::Io(std::io::Error::new(
|
|||
std::io::ErrorKind::BrokenPipe,
|
|||
"write queue worker dropped response channel",
|
|||
))),
|
|||
}
|
|||
}
|
|||
}
|
|||
|
|||
/// Background worker that drains write requests and processes them in batches.
struct WriteQueueWorker {
    /// Receiving end of the request channel shared with all `WriteQueue` clones.
    rx: mpsc::Receiver<WriteRequest>,
    /// Shared server state giving access to the store the writes target.
    state: Arc<VolumeServerState>,
}
|
|||
|
|||
impl WriteQueueWorker {
|
|||
async fn run(mut self) {
|
|||
debug!("write queue worker started");
|
|||
|
|||
loop {
|
|||
// Wait for the first request (blocks until one arrives or channel closes).
|
|||
let first = match self.rx.recv().await {
|
|||
Some(req) => req,
|
|||
None => {
|
|||
debug!("write queue channel closed, worker exiting");
|
|||
return;
|
|||
}
|
|||
};
|
|||
|
|||
// Drain as many additional requests as available, up to MAX_BATCH_SIZE.
|
|||
let mut batch = Vec::with_capacity(MAX_BATCH_SIZE);
|
|||
batch.push(first);
|
|||
|
|||
while batch.len() < MAX_BATCH_SIZE {
|
|||
match self.rx.try_recv() {
|
|||
Ok(req) => batch.push(req),
|
|||
Err(_) => break,
|
|||
}
|
|||
}
|
|||
|
|||
let batch_size = batch.len();
|
|||
debug!("processing write batch of {} requests", batch_size);
|
|||
|
|||
// Process the batch in spawn_blocking since write_needle does file I/O.
|
|||
let state = self.state.clone();
|
|||
let _ = tokio::task::spawn_blocking(move || {
|
|||
process_batch(state, batch);
|
|||
})
|
|||
.await;
|
|||
}
|
|||
}
|
|||
}
|
|||
|
|||
/// Process a batch of write requests, grouped by volume ID.
|
|||
///
|
|||
/// Groups writes by volume to minimize the number of store lock acquisitions,
|
|||
/// then sends results back via each request's oneshot channel.
|
|||
fn process_batch(state: Arc<VolumeServerState>, batch: Vec<WriteRequest>) {
|
|||
// Group requests by volume ID for efficient processing.
|
|||
// We use a Vec of (VolumeId, Vec<(Needle, Sender)>) to preserve order
|
|||
// and avoid requiring Hash on VolumeId.
|
|||
let mut groups: Vec<(VolumeId, Vec<(Needle, oneshot::Sender<WriteResult>)>)> = Vec::new();
|
|||
|
|||
for req in batch {
|
|||
let vid = req.volume_id;
|
|||
if let Some(group) = groups.iter_mut().find(|(v, _)| *v == vid) {
|
|||
group.1.push((req.needle, req.response_tx));
|
|||
} else {
|
|||
groups.push((vid, vec![(req.needle, req.response_tx)]));
|
|||
}
|
|||
}
|
|||
|
|||
// Process each volume group under a single store lock.
|
|||
let mut store = state.store.write().unwrap();
|
|||
|
|||
for (vid, entries) in groups {
|
|||
for (mut needle, response_tx) in entries {
|
|||
let result = store.write_volume_needle(vid, &mut needle);
|
|||
// Send result back; ignore error if receiver dropped.
|
|||
let _ = response_tx.send(result);
|
|||
}
|
|||
}
|
|||
}
|
|||
|
|||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::types::VolumeId;

    /// Helper to create a minimal VolumeServerState for testing.
    /// The store starts with zero volumes, so every write yields NotFound.
    fn make_test_state() -> Arc<VolumeServerState> {
        use std::sync::RwLock;
        use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU32};
        use crate::security::{Guard, SigningKey};
        use crate::storage::store::Store;
        use crate::storage::needle_map::NeedleMapKind;

        let store = Store::new(NeedleMapKind::InMemory);
        // Guard with empty whitelist and empty signing keys: auth disabled.
        let guard = Guard::new(
            &[],
            SigningKey(vec![]),
            0,
            SigningKey(vec![]),
            0,
        );

        Arc::new(VolumeServerState {
            store: RwLock::new(store),
            guard,
            is_stopping: RwLock::new(false),
            maintenance: AtomicBool::new(false),
            state_version: AtomicU32::new(0),
            concurrent_upload_limit: 0,
            concurrent_download_limit: 0,
            inflight_upload_data_timeout: std::time::Duration::ZERO,
            inflight_download_data_timeout: std::time::Duration::ZERO,
            inflight_upload_bytes: AtomicI64::new(0),
            inflight_download_bytes: AtomicI64::new(0),
            upload_notify: tokio::sync::Notify::new(),
            download_notify: tokio::sync::Notify::new(),
            data_center: String::new(),
            rack: String::new(),
            file_size_limit_bytes: 0,
            is_heartbeating: AtomicBool::new(false),
            has_master: false,
            pre_stop_seconds: 0,
            volume_state_notify: tokio::sync::Notify::new(),
            write_queue: std::sync::OnceLock::new(),
            s3_tier_registry: std::sync::RwLock::new(crate::remote_storage::s3_tier::S3TierRegistry::new()),
        })
    }

    #[tokio::test]
    async fn test_write_queue_submit_no_volume() {
        // Submit a write to a non-existent volume -- should return VolumeError::NotFound.
        let state = make_test_state();
        let queue = WriteQueue::new(state, MAX_BATCH_SIZE);

        let needle = Needle {
            id: 1.into(),
            cookie: 0x12345678.into(),
            data: vec![1, 2, 3],
            data_size: 3,
            ..Needle::default()
        };

        let result = queue.submit(VolumeId(999), needle).await;
        assert!(result.is_err());
        match result {
            Err(VolumeError::NotFound) => {} // expected
            other => panic!("expected NotFound, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn test_write_queue_concurrent_submissions() {
        // Submit multiple concurrent writes -- all should complete (with errors since no volume).
        let state = make_test_state();
        let queue = WriteQueue::new(state, MAX_BATCH_SIZE);

        let mut handles = Vec::new();
        for i in 0..10u64 {
            let q = queue.clone();
            handles.push(tokio::spawn(async move {
                let needle = Needle {
                    id: i.into(),
                    cookie: 0xABCD.into(),
                    data: vec![i as u8; 10],
                    data_size: 10,
                    ..Needle::default()
                };
                q.submit(VolumeId(1), needle).await
            }));
        }

        for handle in handles {
            let result = handle.await.unwrap();
            // All should fail with NotFound since there's no volume 1
            assert!(matches!(result, Err(VolumeError::NotFound)));
        }
    }

    #[tokio::test]
    async fn test_write_queue_batching() {
        // Verify that many concurrent writes get processed (testing the batching path).
        let state = make_test_state();
        let queue = WriteQueue::new(state, MAX_BATCH_SIZE);

        // Submit MAX_BATCH_SIZE requests concurrently
        let mut handles = Vec::new();
        for i in 0..MAX_BATCH_SIZE as u64 {
            let q = queue.clone();
            handles.push(tokio::spawn(async move {
                let needle = Needle {
                    id: i.into(),
                    cookie: 0x1111.into(),
                    data: vec![0u8; 4],
                    data_size: 4,
                    ..Needle::default()
                };
                q.submit(VolumeId(42), needle).await
            }));
        }

        let mut results = Vec::new();
        for handle in handles {
            results.push(handle.await.unwrap());
        }

        // All should complete (with NotFound errors since no volume exists)
        assert_eq!(results.len(), MAX_BATCH_SIZE);
        for r in results {
            assert!(matches!(r, Err(VolumeError::NotFound)));
        }
    }

    #[tokio::test]
    async fn test_write_queue_dropped_sender() {
        // When the queue is dropped, subsequent submits should fail gracefully.
        let state = make_test_state();
        let queue = WriteQueue::new(state, 1);

        // Clone then drop the original -- the worker keeps running via its rx handle.
        let queue2 = queue.clone();
        drop(queue);

        // This should still work since the worker is alive.
        let needle = Needle {
            id: 1.into(),
            cookie: 0.into(),
            data: vec![],
            data_size: 0,
            ..Needle::default()
        };
        let result = queue2.submit(VolumeId(1), needle).await;
        assert!(result.is_err()); // NotFound is fine -- the point is it doesn't panic
    }
}
|
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue