garage/src/model/s3/object_table.rs

338 lines
8.5 KiB
Rust
Raw Normal View History

use serde::{Deserialize, Serialize};
2020-07-08 15:33:14 +00:00
use std::collections::BTreeMap;
2020-07-08 15:34:37 +00:00
use std::sync::Arc;
2020-04-09 15:32:28 +00:00
2020-04-24 10:10:01 +00:00
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
2020-04-23 17:05:46 +00:00
use garage_table::crdt::*;
2021-03-26 18:41:46 +00:00
use garage_table::replication::TableShardedReplication;
2020-04-24 10:10:01 +00:00
use garage_table::*;
2022-04-13 12:02:53 +00:00
use crate::s3::version_table::*;
2020-04-09 15:32:28 +00:00
use garage_model_050::object_table as old;
2021-03-26 20:53:28 +00:00
/// An object
2020-04-09 21:45:07 +00:00
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
2020-04-09 15:32:28 +00:00
pub struct Object {
2021-04-06 03:25:28 +00:00
/// The bucket in which the object is stored, used as partition key
2021-12-14 12:55:11 +00:00
pub bucket_id: Uuid,
2020-04-09 21:45:07 +00:00
2021-04-06 03:25:28 +00:00
/// The key at which the object is stored in its bucket, used as sorting key
2020-04-09 15:32:28 +00:00
pub key: String,
2021-04-06 03:25:28 +00:00
/// The list of currenty stored versions of the object
versions: Vec<ObjectVersion>,
}
impl Object {
2021-04-08 13:13:02 +00:00
/// Initialize an Object struct from parts
2021-12-14 12:55:11 +00:00
pub fn new(bucket_id: Uuid, key: String, versions: Vec<ObjectVersion>) -> Self {
let mut ret = Self {
2021-12-14 12:55:11 +00:00
bucket_id,
key,
versions: vec![],
};
for v in versions {
ret.add_version(v)
.expect("Twice the same ObjectVersion in Object constructor");
}
ret
}
2021-03-26 20:53:28 +00:00
/// Adds a version if it wasn't already present
2021-04-23 19:57:32 +00:00
#[allow(clippy::result_unit_err)]
pub fn add_version(&mut self, new: ObjectVersion) -> Result<(), ()> {
match self
.versions
.binary_search_by(|v| v.cmp_key().cmp(&new.cmp_key()))
{
Err(i) => {
self.versions.insert(i, new);
Ok(())
}
Ok(_) => Err(()),
}
}
2021-03-26 20:53:28 +00:00
2021-04-06 03:25:28 +00:00
/// Get a list of currently stored versions of `Object`
pub fn versions(&self) -> &[ObjectVersion] {
&self.versions[..]
}
2020-04-09 15:32:28 +00:00
}
2021-03-26 20:53:28 +00:00
/// Informations about a version of an object
2020-04-09 21:45:07 +00:00
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersion {
2021-03-26 20:53:28 +00:00
/// Id of the version
2021-05-02 21:13:08 +00:00
pub uuid: Uuid,
2021-03-26 20:53:28 +00:00
/// Timestamp of when the object was created
2020-04-09 15:32:28 +00:00
pub timestamp: u64,
2021-03-26 20:53:28 +00:00
/// State of the version
2020-04-26 18:55:13 +00:00
pub state: ObjectVersionState,
2020-04-09 15:32:28 +00:00
}
2021-03-26 20:53:28 +00:00
/// State of an object version
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
2020-04-26 18:55:13 +00:00
pub enum ObjectVersionState {
2021-03-26 20:53:28 +00:00
/// The version is being received
2020-07-08 15:33:14 +00:00
Uploading(ObjectVersionHeaders),
2021-03-26 20:53:28 +00:00
/// The version is fully received
Complete(ObjectVersionData),
2021-04-06 03:25:28 +00:00
/// The version uploaded containded errors or the upload was explicitly aborted
2020-04-26 18:55:13 +00:00
Aborted,
}
2021-05-02 21:13:08 +00:00
impl Crdt for ObjectVersionState {
fn merge(&mut self, other: &Self) {
2020-04-26 18:55:13 +00:00
use ObjectVersionState::*;
2020-07-08 15:34:37 +00:00
match other {
Aborted => {
*self = Aborted;
}
Complete(b) => match self {
Aborted => {}
Complete(a) => {
a.merge(b);
}
Uploading(_) => {
*self = Complete(b.clone());
}
},
Uploading(_) => {}
}
2020-04-26 18:55:13 +00:00
}
}
2021-04-08 13:13:02 +00:00
/// Data stored in object version
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
2020-04-09 21:45:07 +00:00
pub enum ObjectVersionData {
2021-04-06 03:25:28 +00:00
/// The object was deleted, this Version is a tombstone to mark it as such
2020-04-09 15:32:28 +00:00
DeleteMarker,
2021-03-26 20:53:28 +00:00
/// The object is short, it's stored inlined
Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
2021-03-26 20:53:28 +00:00
/// The object is not short, Hash of first block is stored here, next segments hashes are
/// stored in the version table
FirstBlock(ObjectVersionMeta, Hash),
}
2021-05-02 21:13:08 +00:00
/// Opts `ObjectVersionData` into the `AutoCrdt` merge behaviour.
// NOTE(review): `WARN_IF_DIFFERENT = true` presumably asks the generic merge to
// log a warning when two replicas hold different values — confirm against the
// `AutoCrdt` definition in `garage_table::crdt`.
impl AutoCrdt for ObjectVersionData {
    const WARN_IF_DIFFERENT: bool = true;
}
2021-03-26 20:53:28 +00:00
/// Metadata about the object version
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersionMeta {
2021-03-26 20:53:28 +00:00
/// Headers to send to the client
2020-07-08 15:34:37 +00:00
pub headers: ObjectVersionHeaders,
2021-03-26 20:53:28 +00:00
/// Size of the object
pub size: u64,
2021-03-26 20:53:28 +00:00
/// etag of the object
2020-07-08 15:34:37 +00:00
pub etag: String,
}
2021-03-26 20:53:28 +00:00
/// Additional headers for an object
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
2020-07-08 15:33:14 +00:00
pub struct ObjectVersionHeaders {
2021-03-26 20:53:28 +00:00
/// Content type of the object
2020-07-08 15:34:37 +00:00
pub content_type: String,
2021-03-26 20:53:28 +00:00
/// Any other http headers to send
2020-07-08 15:34:37 +00:00
pub other: BTreeMap<String, String>,
2020-07-08 15:33:14 +00:00
}
impl ObjectVersion {
2021-05-02 21:13:08 +00:00
fn cmp_key(&self) -> (u64, Uuid) {
2020-04-26 18:55:13 +00:00
(self.timestamp, self.uuid)
}
2021-03-26 20:53:28 +00:00
/// Is the object version currently being uploaded
2020-07-08 15:34:37 +00:00
pub fn is_uploading(&self) -> bool {
2021-04-23 19:57:32 +00:00
matches!(self.state, ObjectVersionState::Uploading(_))
2020-07-08 15:34:37 +00:00
}
2021-03-26 20:53:28 +00:00
/// Is the object version completely received
2020-04-26 18:55:13 +00:00
pub fn is_complete(&self) -> bool {
2021-04-23 19:57:32 +00:00
matches!(self.state, ObjectVersionState::Complete(_))
2020-04-26 18:55:13 +00:00
}
2021-03-26 20:53:28 +00:00
2021-04-06 03:25:28 +00:00
/// Is the object version available (received and not a tombstone)
2020-04-26 18:55:13 +00:00
pub fn is_data(&self) -> bool {
2020-07-08 15:34:37 +00:00
match self.state {
ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) => false,
ObjectVersionState::Complete(_) => true,
_ => false,
}
}
}
2021-12-14 12:55:11 +00:00
impl Entry<Uuid, String> for Object {
fn partition_key(&self) -> &Uuid {
&self.bucket_id
2020-04-09 15:32:28 +00:00
}
fn sort_key(&self) -> &String {
&self.key
}
2021-03-16 15:51:15 +00:00
fn is_tombstone(&self) -> bool {
self.versions.len() == 1
&& self.versions[0].state
== ObjectVersionState::Complete(ObjectVersionData::DeleteMarker)
2021-03-16 15:51:15 +00:00
}
}
2020-04-09 15:32:28 +00:00
2021-05-02 21:13:08 +00:00
impl Crdt for Object {
    /// Merges the versions of `other` into `self`.
    ///
    /// Versions are matched on their `(timestamp, uuid)` key: a version known
    /// on both sides has its state merged, a version only present in `other`
    /// is cloned into `self` at its sorted position. Afterwards every version
    /// that precedes the last complete one is discarded as obsolete.
    fn merge(&mut self, other: &Self) {
        // Merge versions from other into here
        for other_v in other.versions.iter() {
            match self
                .versions
                .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key()))
            {
                Ok(i) => self.versions[i].state.merge(&other_v.state),
                Err(i) => self.versions.insert(i, other_v.clone()),
            }
        }

        // Remove versions which are obsolete, i.e. those that come
        // before the last version which .is_complete().
        // The prefix is dropped in place rather than rebuilding the vector
        // from its tail, which avoids an extra allocation.
        if let Some(last_vi) = self.versions.iter().rposition(|v| v.is_complete()) {
            self.versions.drain(..last_vi);
        }
    }
}
2020-04-09 21:45:07 +00:00
pub struct ObjectTable {
2020-04-12 11:03:55 +00:00
pub background: Arc<BackgroundRunner>,
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
2020-04-09 21:45:07 +00:00
}
Implement ListMultipartUploads (#171) Implement ListMultipartUploads, also refactor ListObjects and ListObjectsV2. It took me some times as I wanted to propose the following things: - Using an iterator instead of the loop+goto pattern. I find it easier to read and it should enable some optimizations. For example, when consuming keys of a common prefix, we do many [redundant checks](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/main/src/api/s3_list.rs#L125-L156) while the only thing to do is to [check if the following key is still part of the common prefix](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/feature/s3-multipart-compat/src/api/s3_list.rs#L476). - Try to name things (see ExtractionResult and RangeBegin enums) and to separate concerns (see ListQuery and Accumulator) - An IO closure to make unit tests possibles. - Unit tests, to track regressions and document how to interact with the code - Integration tests with `s3api`. In the future, I would like to move them in Rust with the aws rust SDK. Merging of the logic of ListMultipartUploads and ListObjects was not a goal but a consequence of the previous modifications. Some points that we might want to discuss: - ListObjectsV1, when using pagination and delimiters, has a weird behavior (it lists multiple times the same prefix) with `aws s3api` due to the fact that it can not use our optimization to skip the whole prefix. It is independant from my refactor and can be tested with the commented `s3api` tests in `test-smoke.sh`. It probably has the same weird behavior on the official AWS S3 implementation. - Considering ListMultipartUploads, I had to "abuse" upload id marker to support prefix skipping. I send an `upload-id-marker` with the hardcoded value `include` to emulate your "including" token. - Some ways to test ListMultipartUploads with existing software (my tests are limited to s3api for now). 
Co-authored-by: Quentin Dufour <quentin@deuxfleurs.fr> Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/171 Co-authored-by: Quentin <quentin@dufour.io> Co-committed-by: Quentin <quentin@dufour.io>
2022-01-12 18:04:55 +00:00
/// Filters that can be applied to entries of the object table.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub enum ObjectFilter {
    /// Keep objects that have at least one version holding data.
    IsData,
    /// Keep objects that have at least one version still being uploaded.
    IsUploading,
}
2020-04-12 20:24:53 +00:00
impl TableSchema for ObjectTable {
2021-12-14 11:34:01 +00:00
const TABLE_NAME: &'static str = "object";
2021-12-14 12:55:11 +00:00
type P = Uuid;
2020-04-09 15:32:28 +00:00
type S = String;
type E = Object;
Implement ListMultipartUploads (#171) Implement ListMultipartUploads, also refactor ListObjects and ListObjectsV2. It took me some times as I wanted to propose the following things: - Using an iterator instead of the loop+goto pattern. I find it easier to read and it should enable some optimizations. For example, when consuming keys of a common prefix, we do many [redundant checks](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/main/src/api/s3_list.rs#L125-L156) while the only thing to do is to [check if the following key is still part of the common prefix](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/feature/s3-multipart-compat/src/api/s3_list.rs#L476). - Try to name things (see ExtractionResult and RangeBegin enums) and to separate concerns (see ListQuery and Accumulator) - An IO closure to make unit tests possibles. - Unit tests, to track regressions and document how to interact with the code - Integration tests with `s3api`. In the future, I would like to move them in Rust with the aws rust SDK. Merging of the logic of ListMultipartUploads and ListObjects was not a goal but a consequence of the previous modifications. Some points that we might want to discuss: - ListObjectsV1, when using pagination and delimiters, has a weird behavior (it lists multiple times the same prefix) with `aws s3api` due to the fact that it can not use our optimization to skip the whole prefix. It is independant from my refactor and can be tested with the commented `s3api` tests in `test-smoke.sh`. It probably has the same weird behavior on the official AWS S3 implementation. - Considering ListMultipartUploads, I had to "abuse" upload id marker to support prefix skipping. I send an `upload-id-marker` with the hardcoded value `include` to emulate your "including" token. - Some ways to test ListMultipartUploads with existing software (my tests are limited to s3api for now). 
Co-authored-by: Quentin Dufour <quentin@deuxfleurs.fr> Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/171 Co-authored-by: Quentin <quentin@dufour.io> Co-committed-by: Quentin <quentin@dufour.io>
2022-01-12 18:04:55 +00:00
type Filter = ObjectFilter;
2020-04-09 15:32:28 +00:00
fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) {
2020-04-12 11:03:55 +00:00
let version_table = self.version_table.clone();
let old = old.cloned();
let new = new.cloned();
self.background.spawn(async move {
if let (Some(old_v), Some(new_v)) = (old, new) {
// Propagate deletion of old versions
for v in old_v.versions.iter() {
let newly_deleted = match new_v
.versions
.binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
{
Err(_) => true,
Ok(i) => {
new_v.versions[i].state == ObjectVersionState::Aborted
&& v.state != ObjectVersionState::Aborted
}
};
if newly_deleted {
let deleted_version =
2021-12-14 12:55:11 +00:00
Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
version_table.insert(&deleted_version).await?;
2020-04-26 18:59:17 +00:00
}
}
}
Ok(())
})
2020-04-09 15:32:28 +00:00
}
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
Implement ListMultipartUploads (#171) Implement ListMultipartUploads, also refactor ListObjects and ListObjectsV2. It took me some times as I wanted to propose the following things: - Using an iterator instead of the loop+goto pattern. I find it easier to read and it should enable some optimizations. For example, when consuming keys of a common prefix, we do many [redundant checks](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/main/src/api/s3_list.rs#L125-L156) while the only thing to do is to [check if the following key is still part of the common prefix](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/feature/s3-multipart-compat/src/api/s3_list.rs#L476). - Try to name things (see ExtractionResult and RangeBegin enums) and to separate concerns (see ListQuery and Accumulator) - An IO closure to make unit tests possibles. - Unit tests, to track regressions and document how to interact with the code - Integration tests with `s3api`. In the future, I would like to move them in Rust with the aws rust SDK. Merging of the logic of ListMultipartUploads and ListObjects was not a goal but a consequence of the previous modifications. Some points that we might want to discuss: - ListObjectsV1, when using pagination and delimiters, has a weird behavior (it lists multiple times the same prefix) with `aws s3api` due to the fact that it can not use our optimization to skip the whole prefix. It is independant from my refactor and can be tested with the commented `s3api` tests in `test-smoke.sh`. It probably has the same weird behavior on the official AWS S3 implementation. - Considering ListMultipartUploads, I had to "abuse" upload id marker to support prefix skipping. I send an `upload-id-marker` with the hardcoded value `include` to emulate your "including" token. - Some ways to test ListMultipartUploads with existing software (my tests are limited to s3api for now). 
Co-authored-by: Quentin Dufour <quentin@deuxfleurs.fr> Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/171 Co-authored-by: Quentin <quentin@dufour.io> Co-committed-by: Quentin <quentin@dufour.io>
2022-01-12 18:04:55 +00:00
match filter {
ObjectFilter::IsData => entry.versions.iter().any(|v| v.is_data()),
ObjectFilter::IsUploading => entry.versions.iter().any(|v| v.is_uploading()),
}
}
fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
let old_obj = rmp_serde::decode::from_read_ref::<_, old::Object>(bytes).ok()?;
Some(migrate_object(old_obj))
}
}
// vvvvvvvv migration code from the old object table format vvvvvvvvvvvv
// (the only real change is that `bucket` becomes `bucket_id`, obtained by hashing the bucket name)
fn migrate_object(o: old::Object) -> Object {
let versions = o
.versions()
.iter()
.cloned()
.map(migrate_object_version)
.collect();
Object {
bucket_id: blake2sum(o.bucket.as_bytes()),
key: o.key,
versions,
}
}
/// Converts an object version in the old format into the current `ObjectVersion`.
///
/// # Panics
///
/// Panics if the old uuid bytes cannot be converted into a `Uuid`.
fn migrate_object_version(v: old::ObjectVersion) -> ObjectVersion {
    let uuid = Uuid::try_from(v.uuid.as_slice()).unwrap();
    let timestamp = v.timestamp;
    let state = match v.state {
        old::ObjectVersionState::Aborted => ObjectVersionState::Aborted,
        old::ObjectVersionState::Uploading(headers) => {
            ObjectVersionState::Uploading(migrate_object_version_headers(headers))
        }
        old::ObjectVersionState::Complete(data) => {
            ObjectVersionState::Complete(migrate_object_version_data(data))
        }
    };
    ObjectVersion {
        uuid,
        timestamp,
        state,
    }
}
/// Converts headers in the old format into the current `ObjectVersionHeaders`
/// by moving both fields over unchanged.
fn migrate_object_version_headers(h: old::ObjectVersionHeaders) -> ObjectVersionHeaders {
    let content_type = h.content_type;
    let other = h.other;
    ObjectVersionHeaders {
        content_type,
        other,
    }
}
/// Converts version data in the old format into the current `ObjectVersionData`.
///
/// # Panics
///
/// Panics if the old first-block hash bytes cannot be converted into a `Hash`.
fn migrate_object_version_data(d: old::ObjectVersionData) -> ObjectVersionData {
    match d {
        old::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker,
        old::ObjectVersionData::Inline(old_meta, bytes) => {
            let meta = migrate_object_version_meta(old_meta);
            ObjectVersionData::Inline(meta, bytes)
        }
        old::ObjectVersionData::FirstBlock(old_meta, old_hash) => {
            let meta = migrate_object_version_meta(old_meta);
            let first_block = Hash::try_from(old_hash.as_slice()).unwrap();
            ObjectVersionData::FirstBlock(meta, first_block)
        }
    }
}
/// Converts version metadata in the old format into the current
/// `ObjectVersionMeta`; only the headers need an actual conversion.
fn migrate_object_version_meta(m: old::ObjectVersionMeta) -> ObjectVersionMeta {
    let headers = migrate_object_version_headers(m.headers);
    let size = m.size;
    let etag = m.etag;
    ObjectVersionMeta {
        headers,
        size,
        etag,
    }
}