From 71a13f366ea2b5c779bd2ea74385af1b03a2455b Mon Sep 17 00:00:00 2001
From: Trinity Pointard <trinity.pointard@gmail.com>
Date: Tue, 16 Mar 2021 21:08:39 +0100
Subject: [PATCH] add content-defined chunking

---
 Cargo.lock         | 97 ++++++++++++++++++++++++++++++++++++++++------
 src/api/Cargo.toml |  2 +
 src/api/s3_put.rs  | 20 ++++++----
 3 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5dc83dfa..e71152e4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -221,6 +221,12 @@ dependencies = [
  "synstructure",
 ]
 
+[[package]]
+name = "fmt-extra"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07f11f71b1f9be830047fbb1899d90601c3b21a471dc99fe1057303eee37f2b9"
+
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -365,7 +371,7 @@ dependencies = [
  "hex",
  "log",
  "pretty_env_logger",
- "rand",
+ "rand 0.8.3",
  "rmp-serde",
  "serde",
  "sled",
@@ -388,6 +394,7 @@ dependencies = [
  "garage_model",
  "garage_table",
  "garage_util",
+ "hash-roll",
  "hex",
  "hmac",
  "http",
@@ -397,6 +404,7 @@ dependencies = [
  "log",
  "md-5",
  "percent-encoding",
+ "rand 0.7.3",
  "roxmltree",
  "sha2",
  "tokio",
@@ -415,7 +423,7 @@ dependencies = [
  "garage_util",
  "hex",
  "log",
- "rand",
+ "rand 0.8.3",
  "rmp-serde",
  "serde",
  "serde_bytes",
@@ -459,7 +467,7 @@ dependencies = [
  "garage_util",
  "hexdump",
  "log",
- "rand",
+ "rand 0.8.3",
  "rmp-serde",
  "serde",
  "serde_bytes",
@@ -479,7 +487,7 @@ dependencies = [
  "http",
  "hyper",
  "log",
- "rand",
+ "rand 0.8.3",
  "rmp-serde",
  "rustls",
  "serde",
@@ -529,6 +537,17 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi 0.9.0+wasi-snapshot-preview1",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.2.2"
@@ -537,7 +556,7 @@ checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8"
 dependencies = [
  "cfg-if",
  "libc",
- "wasi",
+ "wasi 0.10.2+wasi-snapshot-preview1",
 ]
 
 [[package]]
@@ -581,6 +600,15 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "hash-roll"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9e27803a4b526df90ed2a3f60523eeec6b5ace6ba7530f9920fbee82027fa11"
+dependencies = [
+ "fmt-extra",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.9.1"
@@ -1043,6 +1071,19 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rand"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
+dependencies = [
+ "getrandom 0.1.16",
+ "libc",
+ "rand_chacha 0.2.2",
+ "rand_core 0.5.1",
+ "rand_hc 0.2.0",
+]
+
 [[package]]
 name = "rand"
 version = "0.8.3"
@@ -1050,9 +1091,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
 dependencies = [
  "libc",
- "rand_chacha",
- "rand_core",
- "rand_hc",
+ "rand_chacha 0.3.0",
+ "rand_core 0.6.2",
+ "rand_hc 0.3.0",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.5.1",
 ]
 
 [[package]]
@@ -1062,7 +1113,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d"
 dependencies = [
  "ppv-lite86",
- "rand_core",
+ "rand_core 0.6.2",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
+dependencies = [
+ "getrandom 0.1.16",
 ]
 
 [[package]]
@@ -1071,7 +1131,16 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
 dependencies = [
- "getrandom",
+ "getrandom 0.2.2",
+]
+
+[[package]]
+name = "rand_hc"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
+dependencies = [
+ "rand_core 0.5.1",
 ]
 
 [[package]]
@@ -1080,7 +1149,7 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73"
 dependencies = [
- "rand_core",
+ "rand_core 0.6.2",
 ]
 
 [[package]]
@@ -1581,6 +1650,12 @@ dependencies = [
  "try-lock",
 ]
 
+[[package]]
+name = "wasi"
+version = "0.9.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
+
 [[package]]
 name = "wasi"
 version = "0.10.2+wasi-snapshot-preview1"
diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml
index 0b824ca3..5bc170a3 100644
--- a/src/api/Cargo.toml
+++ b/src/api/Cargo.toml
@@ -22,10 +22,12 @@ bytes = "1.0"
 chrono = "0.4"
 crypto-mac = "0.10"
 err-derive = "0.3"
+hash-roll = "0.3.0"
 hex = "0.4"
 hmac = "0.10"
 log = "0.4"
 md-5 = "0.9"
+rand = "0.7"
 sha2 = "0.9"
 
 futures = "0.3"
diff --git a/src/api/s3_put.rs b/src/api/s3_put.rs
index c4e3b818..e2a1b54d 100644
--- a/src/api/s3_put.rs
+++ b/src/api/s3_put.rs
@@ -3,6 +3,7 @@ use std::fmt::Write;
 use std::sync::Arc;
 
 use futures::stream::*;
+use hash_roll::{ChunkIncr, fastcdc::{FastCdc, FastCdcIncr}, gear_table::GEAR_64};
 use hyper::{Body, Request, Response};
 use md5::{digest::generic_array::*, Digest as Md5Digest, Md5};
 use sha2::Sha256;
@@ -268,21 +269,26 @@ async fn put_block_meta(
 struct BodyChunker {
 	body: Body,
 	read_all: bool,
-	block_size: usize,
+	max_block_size: usize,
 	buf: VecDeque<u8>,
+	chunker: FastCdcIncr<'static>,
 }
 
 impl BodyChunker {
 	fn new(body: Body, block_size: usize) -> Self {
+		let max_block_size = block_size * 2;
+		let chunker = FastCdc::new(&GEAR_64, block_size as u64 / 2, block_size as u64, max_block_size as u64);
+		let chunker = (&chunker).into();
 		Self {
 			body,
 			read_all: false,
-			block_size,
-			buf: VecDeque::with_capacity(2 * block_size),
+			max_block_size,
+			buf: VecDeque::with_capacity(2 * max_block_size),
+			chunker,
 		}
 	}
 	async fn next(&mut self) -> Result<Option<Vec<u8>>, GarageError> {
-		while !self.read_all && self.buf.len() < self.block_size {
+		while !self.read_all && self.buf.len() < self.max_block_size {
 			if let Some(block) = self.body.next().await {
 				let bytes = block?;
 				trace!("Body next: {} bytes", bytes.len());
@@ -293,11 +299,11 @@ impl BodyChunker {
 		}
 		if self.buf.len() == 0 {
 			Ok(None)
-		} else if self.buf.len() <= self.block_size {
-			let block = self.buf.drain(..).collect::<Vec<u8>>();
+		} else if let Some(index) = self.chunker.push(self.buf.make_contiguous()) {
+			let block = self.buf.drain(..index).collect::<Vec<u8>>();
 			Ok(Some(block))
 		} else {
-			let block = self.buf.drain(..self.block_size).collect::<Vec<u8>>();
+			let block = self.buf.drain(..).collect::<Vec<u8>>();
 			Ok(Some(block))
 		}
 	}