Browse Source
production sprint 4: EC batch delete, JPEG orientation, write queue, S3 tier move
production sprint 4: EC batch delete, JPEG orientation, write queue, S3 tier move
- BatchDelete now supports EC volumes: looks up needle in .ecx index, journals deletion to .ecj file (local-only, Go handles distributed part) - JPEG EXIF orientation auto-fix on upload using kamadak-exif + image crate, matching Go's FixJpgOrientation behavior (8 orientation transforms) - Async batched write processing via mpsc queue (up to 128 entries per batch), groups writes by volume ID and syncs once per volume per batch - VolumeTierMoveDatToRemote: multipart upload .dat file to S3 with progress streaming, updates .vif with remote file reference - VolumeTierMoveDatFromRemote: downloads .dat from S3 with progress streaming, removes remote file reference from .vif - S3TierRegistry for managing named remote storage backends - VolumeInfo (.vif) JSON persistence matching Go's protojson format - 124 lib + 7 integration = 131 Rust tests pass - All 109 Go integration tests pass (53 HTTP + 56 gRPC) [branch: rust-volume-server]
16 changed files with 1545 additions and 89 deletions
-
16seaweed-volume/Cargo.lock
-
1seaweed-volume/Cargo.toml
-
31seaweed-volume/DEV_PLAN.md
-
3seaweed-volume/src/config.rs
-
275seaweed-volume/src/images.rs
-
1seaweed-volume/src/lib.rs
-
12seaweed-volume/src/main.rs
-
1seaweed-volume/src/remote_storage/mod.rs
-
327seaweed-volume/src/remote_storage/s3_tier.rs
-
392seaweed-volume/src/server/grpc_server.rs
-
46seaweed-volume/src/server/handlers.rs
-
1seaweed-volume/src/server/mod.rs
-
5seaweed-volume/src/server/volume_server.rs
-
315seaweed-volume/src/server/write_queue.rs
-
204seaweed-volume/src/storage/volume.rs
-
4seaweed-volume/tests/http_integration.rs
@ -0,0 +1,275 @@ |
|||||
|
//! JPEG EXIF orientation auto-fix, matching Go's `FixJpgOrientation`.
|
||||
|
//!
|
||||
|
//! Reads the EXIF orientation tag from JPEG data and rotates/flips the image
|
||||
|
//! to normalize it to orientation 1 (top-left). If EXIF parsing fails or
|
||||
|
//! orientation is already normal, returns the original data unchanged.
|
||||
|
|
||||
|
use std::io::Cursor;
|
||||
|
|
||||
|
use image::{DynamicImage, GenericImageView, ImageFormat, RgbaImage};
|
||||
|
|
||||
|
/// EXIF orientation tag values.
/// See: <http://sylvana.net/jpegcrop/exif_orientation.html>
///
/// Each value names where the stored image's 0th row/column sit relative to
/// the captured scene. The comments below describe the transform
/// `fix_jpg_orientation` applies to normalize each value back to 1.
const TOP_LEFT_SIDE: u32 = 1; // normal — no transform needed
const TOP_RIGHT_SIDE: u32 = 2; // normalized by a horizontal flip only
const BOTTOM_RIGHT_SIDE: u32 = 3; // normalized by rotating 180
const BOTTOM_LEFT_SIDE: u32 = 4; // normalized by rotating 180 + horizontal flip
const LEFT_SIDE_TOP: u32 = 5; // normalized by rotating -90 + horizontal flip
const RIGHT_SIDE_TOP: u32 = 6; // normalized by rotating -90
const RIGHT_SIDE_BOTTOM: u32 = 7; // normalized by rotating 90 + horizontal flip
const LEFT_SIDE_BOTTOM: u32 = 8; // normalized by rotating 90
|
||||
|
|
||||
|
/// Fix JPEG orientation based on EXIF data.
|
||||
|
///
|
||||
|
/// Reads the EXIF orientation tag and applies the appropriate rotation/flip
|
||||
|
/// to normalize the image to orientation 1 (top-left). Re-encodes as JPEG.
|
||||
|
///
|
||||
|
/// Returns the original data unchanged if:
|
||||
|
/// - EXIF data cannot be parsed
|
||||
|
/// - No orientation tag is present
|
||||
|
/// - Orientation is already 1 (normal)
|
||||
|
/// - Image decoding or re-encoding fails
|
||||
|
pub fn fix_jpg_orientation(data: &[u8]) -> Vec<u8> {
|
||||
|
// Parse EXIF data
|
||||
|
let orientation = match read_exif_orientation(data) {
|
||||
|
Some(o) => o,
|
||||
|
None => return data.to_vec(),
|
||||
|
};
|
||||
|
|
||||
|
// Orientation 1 means normal — no transformation needed
|
||||
|
if orientation == TOP_LEFT_SIDE {
|
||||
|
return data.to_vec();
|
||||
|
}
|
||||
|
|
||||
|
// Determine rotation angle and flip mode
|
||||
|
let (angle, flip_horizontal) = match orientation {
|
||||
|
TOP_RIGHT_SIDE => (0, true),
|
||||
|
BOTTOM_RIGHT_SIDE => (180, false),
|
||||
|
BOTTOM_LEFT_SIDE => (180, true),
|
||||
|
LEFT_SIDE_TOP => (-90, true),
|
||||
|
RIGHT_SIDE_TOP => (-90, false),
|
||||
|
RIGHT_SIDE_BOTTOM => (90, true),
|
||||
|
LEFT_SIDE_BOTTOM => (90, false),
|
||||
|
_ => return data.to_vec(),
|
||||
|
};
|
||||
|
|
||||
|
// Decode the image
|
||||
|
let src_image = match image::load_from_memory_with_format(data, ImageFormat::Jpeg) {
|
||||
|
Ok(img) => img,
|
||||
|
Err(_) => return data.to_vec(),
|
||||
|
};
|
||||
|
|
||||
|
// Apply rotation then flip (matching Go's flip(rotate(img, angle), flipMode))
|
||||
|
let transformed = flip_horizontal_if(rotate(src_image, angle), flip_horizontal);
|
||||
|
|
||||
|
// Re-encode as JPEG
|
||||
|
let mut buf = Cursor::new(Vec::new());
|
||||
|
match transformed.write_to(&mut buf, ImageFormat::Jpeg) {
|
||||
|
Ok(_) => buf.into_inner(),
|
||||
|
Err(_) => data.to_vec(),
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
/// Read the EXIF orientation tag from JPEG data.
|
||||
|
/// Returns None if EXIF cannot be parsed or orientation tag is not present.
|
||||
|
fn read_exif_orientation(data: &[u8]) -> Option<u32> {
|
||||
|
let exif_reader = exif::Reader::new();
|
||||
|
let mut cursor = Cursor::new(data);
|
||||
|
let exif_data = exif_reader.read_from_container(&mut cursor).ok()?;
|
||||
|
|
||||
|
let orientation_field = exif_data.get_field(exif::Tag::Orientation, exif::In::PRIMARY)?;
|
||||
|
match orientation_field.value {
|
||||
|
exif::Value::Short(ref v) if !v.is_empty() => Some(v[0] as u32),
|
||||
|
_ => orientation_field.value.get_uint(0),
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
/// Rotate an image by the given angle (counter-clockwise, in degrees).
/// Matches Go's rotate function.
///
/// Supported angles: `90` (counter-clockwise), `-90` (clockwise), and
/// `180`/`-180` (half-turn); any other value returns the image unchanged.
/// Every rotated output is rebuilt as RGBA8, regardless of the source
/// pixel format.
fn rotate(img: DynamicImage, angle: i32) -> DynamicImage {
    let (width, height) = img.dimensions();

    match angle {
        90 => {
            // 90 degrees counter-clockwise
            // Dimensions swap: output width = source height, and vice versa.
            let new_w = height;
            let new_h = width;
            let mut out = RgbaImage::new(new_w, new_h);
            for y in 0..new_h {
                for x in 0..new_w {
                    // dst(x, y) <- src(new_h - 1 - y, x): the source's
                    // top-right corner lands at the output's top-left.
                    out.put_pixel(x, y, img.get_pixel(new_h - 1 - y, x));
                }
            }
            DynamicImage::ImageRgba8(out)
        }
        -90 => {
            // 90 degrees clockwise (or 270 counter-clockwise)
            // Dimensions swap here as well.
            let new_w = height;
            let new_h = width;
            let mut out = RgbaImage::new(new_w, new_h);
            for y in 0..new_h {
                for x in 0..new_w {
                    // dst(x, y) <- src(y, new_w - 1 - x): the source's
                    // bottom-left corner lands at the output's top-left.
                    out.put_pixel(x, y, img.get_pixel(y, new_w - 1 - x));
                }
            }
            DynamicImage::ImageRgba8(out)
        }
        180 | -180 => {
            // Half-turn: both coordinates are mirrored; dimensions unchanged.
            let mut out = RgbaImage::new(width, height);
            for y in 0..height {
                for x in 0..width {
                    out.put_pixel(x, y, img.get_pixel(width - 1 - x, height - 1 - y));
                }
            }
            DynamicImage::ImageRgba8(out)
        }
        // Unsupported angle: pass the image through untouched.
        _ => img,
    }
}
|
||||
|
|
||||
|
/// Flip the image horizontally if requested.
|
||||
|
/// In Go, flipMode 2 == FlipHorizontal. We simplify since only horizontal flip is used.
|
||||
|
fn flip_horizontal_if(img: DynamicImage, do_flip: bool) -> DynamicImage {
|
||||
|
if !do_flip {
|
||||
|
return img;
|
||||
|
}
|
||||
|
let (width, height) = img.dimensions();
|
||||
|
let mut out = RgbaImage::new(width, height);
|
||||
|
for y in 0..height {
|
||||
|
for x in 0..width {
|
||||
|
out.put_pixel(x, y, img.get_pixel(width - 1 - x, y));
|
||||
|
}
|
||||
|
}
|
||||
|
DynamicImage::ImageRgba8(out)
|
||||
|
}
|
||||
|
|
||||
|
/// Returns true if the given MIME type or file path extension indicates a JPEG file.
///
/// The MIME type must be exactly `image/jpeg`; otherwise the path is checked
/// for a `.jpg` / `.jpeg` suffix, case-insensitively.
pub fn is_jpeg(mime_type: &str, path: &str) -> bool {
    if mime_type == "image/jpeg" {
        return true;
    }
    let normalized = path.to_lowercase();
    [".jpg", ".jpeg"]
        .iter()
        .any(|ext| normalized.ends_with(ext))
}
|
||||
|
|
||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_non_jpeg_data_returned_unchanged() {
        // Arbitrary non-JPEG bytes: EXIF parsing fails, so the input is
        // returned verbatim.
        let data = b"not a jpeg file at all";
        let result = fix_jpg_orientation(data);
        assert_eq!(result, data);
    }

    #[test]
    fn test_jpeg_without_exif_returned_unchanged() {
        // Create a minimal JPEG without EXIF data
        let img = DynamicImage::ImageRgba8(RgbaImage::new(2, 2));
        let mut buf = Cursor::new(Vec::new());
        img.write_to(&mut buf, ImageFormat::Jpeg).unwrap();
        let jpeg_data = buf.into_inner();

        let result = fix_jpg_orientation(&jpeg_data);
        // Should return data unchanged (no EXIF orientation tag)
        // Just verify it's still valid JPEG
        assert!(!result.is_empty());
        assert_eq!(&result[0..2], &[0xFF, 0xD8]); // JPEG magic bytes
    }

    #[test]
    fn test_is_jpeg() {
        // MIME type alone is sufficient; extension matching is case-insensitive.
        assert!(is_jpeg("image/jpeg", ""));
        assert!(is_jpeg("", "/3,abc.jpg"));
        assert!(is_jpeg("", "/3,abc.JPEG"));
        assert!(is_jpeg("application/octet-stream", "/3,abc.JPG"));
        assert!(!is_jpeg("image/png", "/3,abc.png"));
        assert!(!is_jpeg("", "/3,abc.png"));
    }

    #[test]
    fn test_rotate_180() {
        // Create a 2x2 image with distinct pixel colors
        let mut img = RgbaImage::new(2, 2);
        img.put_pixel(0, 0, image::Rgba([255, 0, 0, 255])); // red top-left
        img.put_pixel(1, 0, image::Rgba([0, 255, 0, 255])); // green top-right
        img.put_pixel(0, 1, image::Rgba([0, 0, 255, 255])); // blue bottom-left
        img.put_pixel(1, 1, image::Rgba([255, 255, 0, 255])); // yellow bottom-right
        let dynamic = DynamicImage::ImageRgba8(img);

        let rotated = rotate(dynamic, 180);
        let (w, h) = rotated.dimensions();
        assert_eq!((w, h), (2, 2));
        // After 180 rotation: top-left should be yellow, top-right should be blue
        assert_eq!(rotated.get_pixel(0, 0), image::Rgba([255, 255, 0, 255]));
        assert_eq!(rotated.get_pixel(1, 0), image::Rgba([0, 0, 255, 255]));
        assert_eq!(rotated.get_pixel(0, 1), image::Rgba([0, 255, 0, 255]));
        assert_eq!(rotated.get_pixel(1, 1), image::Rgba([255, 0, 0, 255]));
    }

    #[test]
    fn test_rotate_90_ccw() {
        // Create 3x2 image (width=3, height=2)
        // Red channel encodes the source position (1..=6, row-major).
        let mut img = RgbaImage::new(3, 2);
        img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255]));
        img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255]));
        img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255]));
        img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255]));
        img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        let rotated = rotate(dynamic, 90);
        let (w, h) = rotated.dimensions();
        // 90 CCW: width=3,height=2 -> new_w=2, new_h=3
        assert_eq!((w, h), (2, 3));
        // Top-right (2,0) should move to top-left (0,0) in CCW 90
        assert_eq!(rotated.get_pixel(0, 0)[0], 3);
        assert_eq!(rotated.get_pixel(1, 0)[0], 6);
    }

    #[test]
    fn test_rotate_neg90_cw() {
        // Create 3x2 image
        // Same position-encoding scheme as test_rotate_90_ccw.
        let mut img = RgbaImage::new(3, 2);
        img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255]));
        img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255]));
        img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255]));
        img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255]));
        img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        let rotated = rotate(dynamic, -90);
        let (w, h) = rotated.dimensions();
        assert_eq!((w, h), (2, 3));
        // -90 (CW 90): top-left (0,0) should go to top-right
        assert_eq!(rotated.get_pixel(0, 0)[0], 4);
        assert_eq!(rotated.get_pixel(1, 0)[0], 1);
    }

    #[test]
    fn test_flip_horizontal() {
        // 2x1 image: after a flip the two pixels must swap places.
        let mut img = RgbaImage::new(2, 1);
        img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        let flipped = flip_horizontal_if(dynamic, true);
        assert_eq!(flipped.get_pixel(0, 0)[0], 20);
        assert_eq!(flipped.get_pixel(1, 0)[0], 10);
    }

    #[test]
    fn test_flip_horizontal_noop() {
        // With do_flip == false the image must come back untouched.
        let mut img = RgbaImage::new(2, 1);
        img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255]));
        img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255]));
        let dynamic = DynamicImage::ImageRgba8(img);

        let not_flipped = flip_horizontal_if(dynamic, false);
        assert_eq!(not_flipped.get_pixel(0, 0)[0], 10);
        assert_eq!(not_flipped.get_pixel(1, 0)[0], 20);
    }
}
|
||||
@ -0,0 +1,327 @@ |
|||||
|
//! S3-compatible tiered storage backend for volume .dat file upload/download.
|
||||
|
//!
|
||||
|
//! Provides multipart upload and concurrent download with progress callbacks,
|
||||
|
//! matching the Go SeaweedFS S3 backend behavior.
|
||||
|
|
||||
|
use std::collections::HashMap;
|
||||
|
use std::sync::Arc;
|
||||
|
|
||||
|
use aws_sdk_s3::Client;
|
||||
|
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
|
||||
|
use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart};
|
||||
|
use tokio::io::AsyncReadExt;
|
||||
|
|
||||
|
/// Configuration for an S3 tier backend.
#[derive(Debug, Clone)]
pub struct S3TierConfig {
    /// S3 access key id for static credentials.
    pub access_key: String,
    /// S3 secret access key for static credentials.
    pub secret_key: String,
    /// AWS region; an empty string falls back to "us-east-1".
    pub region: String,
    /// Bucket that receives uploaded volume .dat files.
    pub bucket: String,
    /// Custom endpoint URL (for S3-compatible stores); ignored when empty.
    pub endpoint: String,
    /// Storage class for uploads; an empty string falls back to "STANDARD_IA".
    pub storage_class: String,
    /// Use path-style addressing (bucket in the URL path, not the subdomain).
    pub force_path_style: bool,
}
|
||||
|
|
||||
|
/// S3 tier backend for uploading/downloading volume .dat files.
pub struct S3TierBackend {
    /// Configured SDK client (credentials, region, and endpoint already applied).
    client: Client,
    /// Bucket that holds tiered volume files.
    pub bucket: String,
    /// Storage class applied when creating multipart uploads.
    pub storage_class: String,
}
|
||||
|
|
||||
|
impl S3TierBackend {
|
||||
|
/// Create a new S3 tier backend from configuration.
|
||||
|
pub fn new(config: &S3TierConfig) -> Self {
|
||||
|
let region = if config.region.is_empty() {
|
||||
|
"us-east-1"
|
||||
|
} else {
|
||||
|
&config.region
|
||||
|
};
|
||||
|
|
||||
|
let credentials = Credentials::new(
|
||||
|
&config.access_key,
|
||||
|
&config.secret_key,
|
||||
|
None,
|
||||
|
None,
|
||||
|
"seaweedfs-volume-tier",
|
||||
|
);
|
||||
|
|
||||
|
let mut s3_config = aws_sdk_s3::Config::builder()
|
||||
|
.behavior_version(BehaviorVersion::latest())
|
||||
|
.region(Region::new(region.to_string()))
|
||||
|
.credentials_provider(credentials)
|
||||
|
.force_path_style(config.force_path_style);
|
||||
|
|
||||
|
if !config.endpoint.is_empty() {
|
||||
|
s3_config = s3_config.endpoint_url(&config.endpoint);
|
||||
|
}
|
||||
|
|
||||
|
let client = Client::from_conf(s3_config.build());
|
||||
|
|
||||
|
S3TierBackend {
|
||||
|
client,
|
||||
|
bucket: config.bucket.clone(),
|
||||
|
storage_class: if config.storage_class.is_empty() {
|
||||
|
"STANDARD_IA".to_string()
|
||||
|
} else {
|
||||
|
config.storage_class.clone()
|
||||
|
},
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
/// Upload a local file to S3 using multipart upload with progress reporting.
|
||||
|
///
|
||||
|
/// Returns (s3_key, file_size) on success.
|
||||
|
/// The progress callback receives (bytes_uploaded, percentage).
|
||||
|
pub async fn upload_file<F>(
|
||||
|
&self,
|
||||
|
file_path: &str,
|
||||
|
mut progress_fn: F,
|
||||
|
) -> Result<(String, u64), String>
|
||||
|
where
|
||||
|
F: FnMut(i64, f32) + Send + Sync + 'static,
|
||||
|
{
|
||||
|
let key = uuid::Uuid::new_v4().to_string();
|
||||
|
|
||||
|
let metadata = tokio::fs::metadata(file_path)
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to stat file {}: {}", file_path, e))?;
|
||||
|
let file_size = metadata.len();
|
||||
|
|
||||
|
// Calculate part size: start at 64MB, scale up for very large files
|
||||
|
let mut part_size: u64 = 64 * 1024 * 1024;
|
||||
|
while part_size * 1000 < file_size {
|
||||
|
part_size *= 4;
|
||||
|
}
|
||||
|
|
||||
|
// Initiate multipart upload
|
||||
|
let create_resp = self
|
||||
|
.client
|
||||
|
.create_multipart_upload()
|
||||
|
.bucket(&self.bucket)
|
||||
|
.key(&key)
|
||||
|
.storage_class(
|
||||
|
self.storage_class
|
||||
|
.parse()
|
||||
|
.unwrap_or(aws_sdk_s3::types::StorageClass::StandardIa),
|
||||
|
)
|
||||
|
.send()
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to create multipart upload: {}", e))?;
|
||||
|
|
||||
|
let upload_id = create_resp
|
||||
|
.upload_id()
|
||||
|
.ok_or_else(|| "no upload_id in multipart upload response".to_string())?
|
||||
|
.to_string();
|
||||
|
|
||||
|
let mut file = tokio::fs::File::open(file_path)
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to open file {}: {}", file_path, e))?;
|
||||
|
|
||||
|
let mut completed_parts = Vec::new();
|
||||
|
let mut offset: u64 = 0;
|
||||
|
let mut part_number: i32 = 1;
|
||||
|
|
||||
|
loop {
|
||||
|
let remaining = file_size - offset;
|
||||
|
if remaining == 0 {
|
||||
|
break;
|
||||
|
}
|
||||
|
let this_part_size = std::cmp::min(part_size, remaining) as usize;
|
||||
|
let mut buf = vec![0u8; this_part_size];
|
||||
|
file.read_exact(&mut buf)
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to read file at offset {}: {}", offset, e))?;
|
||||
|
|
||||
|
let upload_part_resp = self
|
||||
|
.client
|
||||
|
.upload_part()
|
||||
|
.bucket(&self.bucket)
|
||||
|
.key(&key)
|
||||
|
.upload_id(&upload_id)
|
||||
|
.part_number(part_number)
|
||||
|
.body(buf.into())
|
||||
|
.send()
|
||||
|
.await
|
||||
|
.map_err(|e| {
|
||||
|
format!(
|
||||
|
"failed to upload part {} at offset {}: {}",
|
||||
|
part_number, offset, e
|
||||
|
)
|
||||
|
})?;
|
||||
|
|
||||
|
let e_tag = upload_part_resp.e_tag().unwrap_or_default().to_string();
|
||||
|
completed_parts.push(
|
||||
|
CompletedPart::builder()
|
||||
|
.e_tag(e_tag)
|
||||
|
.part_number(part_number)
|
||||
|
.build(),
|
||||
|
);
|
||||
|
|
||||
|
offset += this_part_size as u64;
|
||||
|
|
||||
|
// Report progress
|
||||
|
let pct = if file_size > 0 {
|
||||
|
(offset as f32 * 100.0) / file_size as f32
|
||||
|
} else {
|
||||
|
100.0
|
||||
|
};
|
||||
|
progress_fn(offset as i64, pct);
|
||||
|
|
||||
|
part_number += 1;
|
||||
|
}
|
||||
|
|
||||
|
// Complete multipart upload
|
||||
|
let completed_upload = CompletedMultipartUpload::builder()
|
||||
|
.set_parts(Some(completed_parts))
|
||||
|
.build();
|
||||
|
|
||||
|
self.client
|
||||
|
.complete_multipart_upload()
|
||||
|
.bucket(&self.bucket)
|
||||
|
.key(&key)
|
||||
|
.upload_id(&upload_id)
|
||||
|
.multipart_upload(completed_upload)
|
||||
|
.send()
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to complete multipart upload: {}", e))?;
|
||||
|
|
||||
|
Ok((key, file_size))
|
||||
|
}
|
||||
|
|
||||
|
/// Download a file from S3 to a local path with progress reporting.
|
||||
|
///
|
||||
|
/// Returns the file size on success.
|
||||
|
pub async fn download_file<F>(
|
||||
|
&self,
|
||||
|
dest_path: &str,
|
||||
|
key: &str,
|
||||
|
mut progress_fn: F,
|
||||
|
) -> Result<u64, String>
|
||||
|
where
|
||||
|
F: FnMut(i64, f32) + Send + Sync + 'static,
|
||||
|
{
|
||||
|
// Get file size first
|
||||
|
let head_resp = self
|
||||
|
.client
|
||||
|
.head_object()
|
||||
|
.bucket(&self.bucket)
|
||||
|
.key(key)
|
||||
|
.send()
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to head object {}: {}", key, e))?;
|
||||
|
|
||||
|
let file_size = head_resp.content_length().unwrap_or(0) as u64;
|
||||
|
|
||||
|
// Download in chunks
|
||||
|
let part_size: u64 = 64 * 1024 * 1024;
|
||||
|
let mut file = tokio::fs::OpenOptions::new()
|
||||
|
.write(true)
|
||||
|
.create(true)
|
||||
|
.truncate(true)
|
||||
|
.open(dest_path)
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to open dest file {}: {}", dest_path, e))?;
|
||||
|
|
||||
|
let mut offset: u64 = 0;
|
||||
|
|
||||
|
loop {
|
||||
|
if offset >= file_size {
|
||||
|
break;
|
||||
|
}
|
||||
|
let end = std::cmp::min(offset + part_size - 1, file_size - 1);
|
||||
|
let range = format!("bytes={}-{}", offset, end);
|
||||
|
|
||||
|
let get_resp = self
|
||||
|
.client
|
||||
|
.get_object()
|
||||
|
.bucket(&self.bucket)
|
||||
|
.key(key)
|
||||
|
.range(&range)
|
||||
|
.send()
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?;
|
||||
|
|
||||
|
let body = get_resp
|
||||
|
.body
|
||||
|
.collect()
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to read body: {}", e))?;
|
||||
|
let bytes = body.into_bytes();
|
||||
|
|
||||
|
use tokio::io::AsyncWriteExt;
|
||||
|
file.write_all(&bytes)
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to write to {}: {}", dest_path, e))?;
|
||||
|
|
||||
|
offset += bytes.len() as u64;
|
||||
|
|
||||
|
let pct = if file_size > 0 {
|
||||
|
(offset as f32 * 100.0) / file_size as f32
|
||||
|
} else {
|
||||
|
100.0
|
||||
|
};
|
||||
|
progress_fn(offset as i64, pct);
|
||||
|
}
|
||||
|
|
||||
|
use tokio::io::AsyncWriteExt;
|
||||
|
file.flush()
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to flush {}: {}", dest_path, e))?;
|
||||
|
|
||||
|
Ok(file_size)
|
||||
|
}
|
||||
|
|
||||
|
/// Delete a file from S3.
|
||||
|
pub async fn delete_file(&self, key: &str) -> Result<(), String> {
|
||||
|
self.client
|
||||
|
.delete_object()
|
||||
|
.bucket(&self.bucket)
|
||||
|
.key(key)
|
||||
|
.send()
|
||||
|
.await
|
||||
|
.map_err(|e| format!("failed to delete object {}: {}", key, e))?;
|
||||
|
Ok(())
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
/// Parse a backend name like "s3" or "s3.default" into (backend_type, backend_id).
/// Matches Go's `BackendNameToTypeId`.
///
/// A name with no dot gets the id "default"; a name with more than one dot
/// is malformed and yields a pair of empty strings.
pub fn backend_name_to_type_id(backend_name: &str) -> (String, String) {
    let segments: Vec<&str> = backend_name.split('.').collect();
    match segments.as_slice() {
        // Bare type name: the id defaults to "default".
        [_] => (backend_name.to_string(), "default".to_string()),
        [kind, id] => ((*kind).to_string(), (*id).to_string()),
        // More than one dot: malformed.
        _ => (String::new(), String::new()),
    }
}
|
||||
|
|
||||
|
/// A registry of configured S3 tier backends, keyed by backend name (e.g., "s3.default").
#[derive(Default)]
pub struct S3TierRegistry {
    /// Backends wrapped in `Arc` so lookups can hand out cheap shared handles.
    backends: HashMap<String, Arc<S3TierBackend>>,
}
|
||||
|
|
||||
|
impl S3TierRegistry {
|
||||
|
pub fn new() -> Self {
|
||||
|
Self {
|
||||
|
backends: HashMap::new(),
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
/// Register a backend with the given name.
|
||||
|
pub fn register(&mut self, name: String, backend: S3TierBackend) {
|
||||
|
self.backends.insert(name, Arc::new(backend));
|
||||
|
}
|
||||
|
|
||||
|
/// Look up a backend by name.
|
||||
|
pub fn get(&self, name: &str) -> Option<Arc<S3TierBackend>> {
|
||||
|
self.backends.get(name).cloned()
|
||||
|
}
|
||||
|
|
||||
|
/// List all registered backend names.
|
||||
|
pub fn names(&self) -> Vec<String> {
|
||||
|
self.backends.keys().cloned().collect()
|
||||
|
}
|
||||
|
}
|
||||
@ -0,0 +1,315 @@ |
|||||
|
//! Async batched write processing for the volume server.
|
||||
|
//!
|
||||
|
//! Instead of each upload handler directly calling `write_needle` and syncing,
|
||||
|
//! writes are submitted to a queue. A background worker drains the queue in
|
||||
|
//! batches (up to 128 entries), groups them by volume ID, processes them
|
||||
|
//! together, and syncs once per volume for the entire batch.
|
||||
|
|
||||
|
use std::sync::Arc;
|
||||
|
|
||||
|
use tokio::sync::{mpsc, oneshot};
|
||||
|
use tracing::debug;
|
||||
|
|
||||
|
use crate::storage::needle::needle::Needle;
|
||||
|
use crate::storage::types::{Size, VolumeId};
|
||||
|
use crate::storage::volume::VolumeError;
|
||||
|
|
||||
|
use super::volume_server::VolumeServerState;
|
||||
|
|
||||
|
/// Result of a single write operation: (offset, size, is_unchanged).
pub type WriteResult = Result<(u64, Size, bool), VolumeError>;

/// A request to write a needle, submitted to the write queue.
pub struct WriteRequest {
    /// Target volume for the write.
    pub volume_id: VolumeId,
    /// The needle to write.
    pub needle: Needle,
    /// One-shot channel on which the worker reports the write outcome.
    pub response_tx: oneshot::Sender<WriteResult>,
}
|
||||
|
|
||||
|
/// Maximum number of write requests to batch together.
const MAX_BATCH_SIZE: usize = 128;

/// Handle for submitting write requests to the background worker.
///
/// Cloning is cheap (it clones the mpsc sender); the worker keeps running
/// as long as any sender — or an in-flight request — is alive.
#[derive(Clone)]
pub struct WriteQueue {
    /// Bounded channel into the worker; sends await (backpressure) when full.
    tx: mpsc::Sender<WriteRequest>,
}
|
||||
|
|
||||
|
impl WriteQueue {
|
||||
|
/// Create a new write queue and spawn the background worker.
|
||||
|
///
|
||||
|
/// `capacity` controls the channel buffer size (backpressure kicks in when full).
|
||||
|
/// The worker holds a reference to `state` for accessing the store.
|
||||
|
pub fn new(state: Arc<VolumeServerState>, capacity: usize) -> Self {
|
||||
|
let (tx, rx) = mpsc::channel(capacity);
|
||||
|
let worker = WriteQueueWorker {
|
||||
|
rx,
|
||||
|
state,
|
||||
|
};
|
||||
|
tokio::spawn(worker.run());
|
||||
|
WriteQueue { tx }
|
||||
|
}
|
||||
|
|
||||
|
/// Submit a write request and wait for the result.
|
||||
|
///
|
||||
|
/// Returns `Err` if the worker has shut down or the response channel was dropped.
|
||||
|
pub async fn submit(
|
||||
|
&self,
|
||||
|
volume_id: VolumeId,
|
||||
|
needle: Needle,
|
||||
|
) -> WriteResult {
|
||||
|
let (response_tx, response_rx) = oneshot::channel();
|
||||
|
let request = WriteRequest {
|
||||
|
volume_id,
|
||||
|
needle,
|
||||
|
response_tx,
|
||||
|
};
|
||||
|
|
||||
|
// Send to queue; this awaits if the channel is full (backpressure).
|
||||
|
if self.tx.send(request).await.is_err() {
|
||||
|
return Err(VolumeError::Io(std::io::Error::new(
|
||||
|
std::io::ErrorKind::BrokenPipe,
|
||||
|
"write queue worker has shut down",
|
||||
|
)));
|
||||
|
}
|
||||
|
|
||||
|
// Wait for the worker to process our request.
|
||||
|
match response_rx.await {
|
||||
|
Ok(result) => result,
|
||||
|
Err(_) => Err(VolumeError::Io(std::io::Error::new(
|
||||
|
std::io::ErrorKind::BrokenPipe,
|
||||
|
"write queue worker dropped response channel",
|
||||
|
))),
|
||||
|
}
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
/// Background worker that drains write requests and processes them in batches.
struct WriteQueueWorker {
    /// Receiving side of the queue; `None` from `recv` means all senders dropped.
    rx: mpsc::Receiver<WriteRequest>,
    /// Shared server state giving access to the volume store.
    state: Arc<VolumeServerState>,
}
|
||||
|
|
||||
|
impl WriteQueueWorker {
|
||||
|
async fn run(mut self) {
|
||||
|
debug!("write queue worker started");
|
||||
|
|
||||
|
loop {
|
||||
|
// Wait for the first request (blocks until one arrives or channel closes).
|
||||
|
let first = match self.rx.recv().await {
|
||||
|
Some(req) => req,
|
||||
|
None => {
|
||||
|
debug!("write queue channel closed, worker exiting");
|
||||
|
return;
|
||||
|
}
|
||||
|
};
|
||||
|
|
||||
|
// Drain as many additional requests as available, up to MAX_BATCH_SIZE.
|
||||
|
let mut batch = Vec::with_capacity(MAX_BATCH_SIZE);
|
||||
|
batch.push(first);
|
||||
|
|
||||
|
while batch.len() < MAX_BATCH_SIZE {
|
||||
|
match self.rx.try_recv() {
|
||||
|
Ok(req) => batch.push(req),
|
||||
|
Err(_) => break,
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
let batch_size = batch.len();
|
||||
|
debug!("processing write batch of {} requests", batch_size);
|
||||
|
|
||||
|
// Process the batch in spawn_blocking since write_needle does file I/O.
|
||||
|
let state = self.state.clone();
|
||||
|
let _ = tokio::task::spawn_blocking(move || {
|
||||
|
process_batch(state, batch);
|
||||
|
})
|
||||
|
.await;
|
||||
|
}
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
/// Process a batch of write requests, grouped by volume ID.
|
||||
|
///
|
||||
|
/// Groups writes by volume to minimize the number of store lock acquisitions,
|
||||
|
/// then sends results back via each request's oneshot channel.
|
||||
|
fn process_batch(state: Arc<VolumeServerState>, batch: Vec<WriteRequest>) {
|
||||
|
// Group requests by volume ID for efficient processing.
|
||||
|
// We use a Vec of (VolumeId, Vec<(Needle, Sender)>) to preserve order
|
||||
|
// and avoid requiring Hash on VolumeId.
|
||||
|
let mut groups: Vec<(VolumeId, Vec<(Needle, oneshot::Sender<WriteResult>)>)> = Vec::new();
|
||||
|
|
||||
|
for req in batch {
|
||||
|
let vid = req.volume_id;
|
||||
|
if let Some(group) = groups.iter_mut().find(|(v, _)| *v == vid) {
|
||||
|
group.1.push((req.needle, req.response_tx));
|
||||
|
} else {
|
||||
|
groups.push((vid, vec![(req.needle, req.response_tx)]));
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
// Process each volume group under a single store lock.
|
||||
|
let mut store = state.store.write().unwrap();
|
||||
|
|
||||
|
for (vid, entries) in groups {
|
||||
|
for (mut needle, response_tx) in entries {
|
||||
|
let result = store.write_volume_needle(vid, &mut needle);
|
||||
|
// Send result back; ignore error if receiver dropped.
|
||||
|
let _ = response_tx.send(result);
|
||||
|
}
|
||||
|
}
|
||||
|
}
|
||||
|
|
||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::types::VolumeId;

    /// Helper to create a minimal VolumeServerState for testing.
    /// All limits/timeouts are zeroed and no volumes are registered, so every
    /// write is expected to fail with `VolumeError::NotFound`.
    fn make_test_state() -> Arc<VolumeServerState> {
        use std::sync::RwLock;
        use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU32};
        use crate::security::{Guard, SigningKey};
        use crate::storage::store::Store;
        use crate::storage::needle_map::NeedleMapKind;

        let store = Store::new(NeedleMapKind::InMemory);
        // Empty whitelist and empty signing keys: the guard enforces nothing.
        let guard = Guard::new(
            &[],
            SigningKey(vec![]),
            0,
            SigningKey(vec![]),
            0,
        );

        Arc::new(VolumeServerState {
            store: RwLock::new(store),
            guard,
            is_stopping: RwLock::new(false),
            maintenance: AtomicBool::new(false),
            state_version: AtomicU32::new(0),
            concurrent_upload_limit: 0,
            concurrent_download_limit: 0,
            inflight_upload_data_timeout: std::time::Duration::ZERO,
            inflight_download_data_timeout: std::time::Duration::ZERO,
            inflight_upload_bytes: AtomicI64::new(0),
            inflight_download_bytes: AtomicI64::new(0),
            upload_notify: tokio::sync::Notify::new(),
            download_notify: tokio::sync::Notify::new(),
            data_center: String::new(),
            rack: String::new(),
            file_size_limit_bytes: 0,
            is_heartbeating: AtomicBool::new(false),
            has_master: false,
            pre_stop_seconds: 0,
            volume_state_notify: tokio::sync::Notify::new(),
            write_queue: std::sync::OnceLock::new(),
            s3_tier_registry: std::sync::RwLock::new(crate::remote_storage::s3_tier::S3TierRegistry::new()),
        })
    }

    #[tokio::test]
    async fn test_write_queue_submit_no_volume() {
        // Submit a write to a non-existent volume -- should return VolumeError::NotFound.
        let state = make_test_state();
        let queue = WriteQueue::new(state, MAX_BATCH_SIZE);

        let needle = Needle {
            id: 1.into(),
            cookie: 0x12345678.into(),
            data: vec![1, 2, 3],
            data_size: 3,
            ..Needle::default()
        };

        let result = queue.submit(VolumeId(999), needle).await;
        assert!(result.is_err());
        match result {
            Err(VolumeError::NotFound) => {} // expected
            other => panic!("expected NotFound, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn test_write_queue_concurrent_submissions() {
        // Submit multiple concurrent writes -- all should complete (with errors since no volume).
        let state = make_test_state();
        let queue = WriteQueue::new(state, MAX_BATCH_SIZE);

        let mut handles = Vec::new();
        for i in 0..10u64 {
            let q = queue.clone();
            handles.push(tokio::spawn(async move {
                let needle = Needle {
                    id: i.into(),
                    cookie: 0xABCD.into(),
                    data: vec![i as u8; 10],
                    data_size: 10,
                    ..Needle::default()
                };
                q.submit(VolumeId(1), needle).await
            }));
        }

        for handle in handles {
            let result = handle.await.unwrap();
            // All should fail with NotFound since there's no volume 1
            assert!(matches!(result, Err(VolumeError::NotFound)));
        }
    }

    #[tokio::test]
    async fn test_write_queue_batching() {
        // Verify that many concurrent writes get processed (testing the batching path).
        let state = make_test_state();
        let queue = WriteQueue::new(state, MAX_BATCH_SIZE);

        // Submit MAX_BATCH_SIZE requests concurrently
        let mut handles = Vec::new();
        for i in 0..MAX_BATCH_SIZE as u64 {
            let q = queue.clone();
            handles.push(tokio::spawn(async move {
                let needle = Needle {
                    id: i.into(),
                    cookie: 0x1111.into(),
                    data: vec![0u8; 4],
                    data_size: 4,
                    ..Needle::default()
                };
                q.submit(VolumeId(42), needle).await
            }));
        }

        let mut results = Vec::new();
        for handle in handles {
            results.push(handle.await.unwrap());
        }

        // All should complete (with NotFound errors since no volume exists)
        assert_eq!(results.len(), MAX_BATCH_SIZE);
        for r in results {
            assert!(matches!(r, Err(VolumeError::NotFound)));
        }
    }

    #[tokio::test]
    async fn test_write_queue_dropped_sender() {
        // When the queue is dropped, subsequent submits should fail gracefully.
        let state = make_test_state();
        let queue = WriteQueue::new(state, 1);

        // Clone then drop the original -- the worker keeps running via its rx handle.
        let queue2 = queue.clone();
        drop(queue);

        // This should still work since the worker is alive.
        let needle = Needle {
            id: 1.into(),
            cookie: 0.into(),
            data: vec![],
            data_size: 0,
            ..Needle::default()
        };
        let result = queue2.submit(VolumeId(1), needle).await;
        assert!(result.is_err()); // NotFound is fine -- the point is it doesn't panic
    }
}
|
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue