diff --git a/doc/logo/garage_hires_crop.png b/doc/logo/garage_hires_crop.png new file mode 100644 index 00000000..2fd0babc Binary files /dev/null and b/doc/logo/garage_hires_crop.png differ diff --git a/doc/talks/2023-01-18-tocatta/.gitignore b/doc/talks/2023-01-18-tocatta/.gitignore new file mode 100644 index 00000000..9f1f00e6 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/.gitignore @@ -0,0 +1,17 @@ +* + +!*.txt +!*.md + +!assets + +!.gitignore +!*.svg +!*.png +!*.jpg +!*.tex +!Makefile +!.gitignore +!assets/*.drawio.pdf + +!talk.pdf diff --git a/doc/talks/2023-01-18-tocatta/Makefile b/doc/talks/2023-01-18-tocatta/Makefile new file mode 100644 index 00000000..4d600178 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/Makefile @@ -0,0 +1,12 @@ +ASSETS=assets/consistent_hashing_1.pdf \ + assets/consistent_hashing_2.pdf \ + assets/consistent_hashing_3.pdf \ + assets/consistent_hashing_4.pdf \ + assets/garage_tables.pdf \ + assets/deuxfleurs.pdf + +talk.pdf: talk.tex $(ASSETS) + pdflatex talk.tex + +assets/%.pdf: assets/%.svg + inkscape -D -z --file=$^ --export-pdf=$@ diff --git a/doc/talks/2023-01-18-tocatta/abstract.md b/doc/talks/2023-01-18-tocatta/abstract.md new file mode 100644 index 00000000..b2658868 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/abstract.md @@ -0,0 +1,39 @@ +### (fr) Garage, un système de stockage de données géo-distribué léger et robuste + +Garage est un système de stockage de données léger, géo-distribué, qui +implémente le protocole de stockage S3 de Amazon. Garage est destiné +principalement à l'auto-hébergement sur du matériel courant d'occasion. À ce +titre, il doit tolérer un grand nombre de pannes: coupures de courant, coupures +de connexion Internet, pannes de machines, ... Il doit également être facile à +déployer et à maintenir, afin de pouvoir être facilement utilisé par des +amateurs ou des petites organisations. + +Cette présentation vous proposera un aperçu de Garage et du choix technique +principal qui rend un système comme Garage possible: le refus d'utiliser des +algorithmes de consensus, remplacés avantageusement par des méthodes à +cohérence faible. Notre modèle est fortement inspiré de la base de donnée +Dynamo (DeCandia et al, 2007), et fait usage des types de données CRDT (Shapiro +et al, 2011). Nous exploreront comment ces méthodes s'appliquent à la +construction de l'abstraction "stockage objet" dans un système distribué, et +quelles autres abstractions peuvent ou ne peuvent pas être construites dans ce +modèle. + +### (en) Garage, a lightweight and robust geo-distributed data storage system + +Garage is a lightweight geo-distributed data store that implements the Amazon +S3 object storage protocol. Garage is meant primarily for self-hosting at home +on second-hand commodity hardware, meaning it has to tolerate a wide variety of +failure scenarios such as power cuts, Internet disconnections and machine +crashes. It also has to be easy to deploy and maintain, so that hobbyists and +small organizations can use it without trouble. + +This talk will present Garage and the key technical choice that made Garage +possible: refusing to use consensus algorithms and using instead weak +consistency methods, with a model that is loosely based on that of the Dynamo +database (DeCandia et al, 2007) and that makes heavy use of conflict-free +replicated data types (Shapiro et al, 2011). We will explore how these methods +are suited to building the "object store" abstraction in a distributed system, +and what other abstractions are possible or impossible to build in this model. + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/AGPLv3_Logo.png b/doc/talks/2023-01-18-tocatta/assets/AGPLv3_Logo.png new file mode 100644 index 00000000..445284a3 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/AGPLv3_Logo.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme.png new file mode 100644 index 00000000..3aabe3ad Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme.svg b/doc/talks/2023-01-18-tocatta/assets/aerogramme.svg new file mode 100644 index 00000000..0c1ee127 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/aerogramme.svg @@ -0,0 +1,1241 @@ + + + + + + K2V APIS3 APIAerogramme + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +IMAPIMAPIMAPMessageindexMessagebodies diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf new file mode 100644 index 00000000..71a90f26 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.png new file mode 100644 index 00000000..fb81b460 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf new file mode 100644 index 00000000..87e42eed Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.png new file mode 100644 index 00000000..f9e2df14 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf new file mode 100644 index 00000000..0606e059 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.png new file mode 100644 index 00000000..c3b015a1 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf new file mode 100644 index 00000000..8fea81c7 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.png new file mode 100644 index 00000000..ed2077d9 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/alex.jpg b/doc/talks/2023-01-18-tocatta/assets/alex.jpg new file mode 100644 index 00000000..eac0f0a9 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/alex.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/atuin.jpg b/doc/talks/2023-01-18-tocatta/assets/atuin.jpg new file mode 100644 index 00000000..f2fbd61d Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/atuin.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/compatibility.png b/doc/talks/2023-01-18-tocatta/assets/compatibility.png new file mode 100644 index 00000000..ce364a9b Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/compatibility.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_1.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_1.svg new file mode 100644 index 00000000..f8d24fd8 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_1.svg @@ -0,0 +1,301 @@ + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_2.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_2.svg new file mode 100644 index 00000000..5ac8faf6 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_2.svg @@ -0,0 +1,334 @@ + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_3.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_3.svg new file mode 100644 index 00000000..fdfd3efc --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_3.svg @@ -0,0 +1,358 @@ + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_4.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_4.svg new file mode 100644 index 00000000..95ed0e02 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_4.svg @@ -0,0 +1,377 @@ + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + + + + + + + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg b/doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg new file mode 100644 index 00000000..c298c22b --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + D + F + diff --git a/doc/talks/2023-01-18-tocatta/assets/endpoint-latency-dc.png b/doc/talks/2023-01-18-tocatta/assets/endpoint-latency-dc.png new file mode 100644 index 00000000..7c7411cd Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/endpoint-latency-dc.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf new file mode 100644 index 00000000..a54a163c Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png new file mode 100644 index 00000000..386dd862 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage2.drawio.png b/doc/talks/2023-01-18-tocatta/assets/garage2.drawio.png new file mode 100644 index 00000000..8562fbcf Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage2.drawio.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage2a.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage2a.drawio.pdf new file mode 100644 index 00000000..422c9343 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage2a.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage2b.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage2b.drawio.pdf new file mode 100644 index 00000000..05a9710e Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage2b.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage_tables.svg b/doc/talks/2023-01-18-tocatta/assets/garage_tables.svg new file mode 100644 index 00000000..c7172713 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/garage_tables.svg @@ -0,0 +1,537 @@ + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + Object + + + + + + bucket + + + + + + file path + + = partition key + + = sort key + + + + + + Version 1 + deleted + + + + + + + Version 2 + id + + size + MIME type + ... + + + + + + Version + + id + h(block 1) + h(block 2) + ... + + + + + Data block + + hash + data + + + + Objects table + Versions table + Blocks table + + diff --git a/doc/talks/2023-01-18-tocatta/assets/inframap.jpg b/doc/talks/2023-01-18-tocatta/assets/inframap.jpg new file mode 100644 index 00000000..19905a99 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/inframap.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/location-aware.png b/doc/talks/2023-01-18-tocatta/assets/location-aware.png new file mode 100644 index 00000000..f5966865 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/location-aware.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/logo_chatons.png b/doc/talks/2023-01-18-tocatta/assets/logo_chatons.png new file mode 100644 index 00000000..890cf17e Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/logo_chatons.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/map.png b/doc/talks/2023-01-18-tocatta/assets/map.png new file mode 100644 index 00000000..1dff3ab6 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/map.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/minio.png b/doc/talks/2023-01-18-tocatta/assets/minio.png new file mode 100644 index 00000000..a71e9ccc Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/minio.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/neptune.jpg b/doc/talks/2023-01-18-tocatta/assets/neptune.jpg new file mode 100644 index 00000000..e59f0bfa Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/neptune.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/quentin.jpg b/doc/talks/2023-01-18-tocatta/assets/quentin.jpg new file mode 100644 index 00000000..d9a7b1e7 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/quentin.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/rust_logo.png b/doc/talks/2023-01-18-tocatta/assets/rust_logo.png new file mode 100644 index 00000000..0e4809ec Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/rust_logo.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slide1.png b/doc/talks/2023-01-18-tocatta/assets/slide1.png new file mode 100644 index 00000000..eb2e67a0 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slide1.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slide2.png b/doc/talks/2023-01-18-tocatta/assets/slide2.png new file mode 100644 index 00000000..126a39b8 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slide2.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slide3.png b/doc/talks/2023-01-18-tocatta/assets/slide3.png new file mode 100644 index 00000000..a39f96bf Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slide3.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slideB1.png b/doc/talks/2023-01-18-tocatta/assets/slideB1.png new file mode 100644 index 00000000..b14b6070 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slideB1.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slideB2.png b/doc/talks/2023-01-18-tocatta/assets/slideB2.png new file mode 100644 index 00000000..a881a796 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slideB2.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slideB3.png b/doc/talks/2023-01-18-tocatta/assets/slideB3.png new file mode 100644 index 00000000..830709d2 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slideB3.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slides.svg b/doc/talks/2023-01-18-tocatta/assets/slides.svg new file mode 100644 index 00000000..9946c6fb --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/slides.svg @@ -0,0 +1,4326 @@ + + + + + + + + + + + + + + + + + + + + + + User-facing application + Database + Filesystem + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/slidesB.svg b/doc/talks/2023-01-18-tocatta/assets/slidesB.svg new file mode 100644 index 00000000..c0a6e97c --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/slidesB.svg @@ -0,0 +1,444 @@ + + + +User-facing applicationDatabase*K2VObject storage*(not really a database)Database diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf new file mode 100644 index 00000000..5acb9198 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex new file mode 100644 index 00000000..566f56ec --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -0,0 +1,623 @@ +%\nonstopmode +\documentclass[aspectratio=169]{beamer} +\usepackage[utf8]{inputenc} +% \usepackage[frenchb]{babel} +\usepackage{amsmath} +\usepackage{mathtools} +\usepackage{breqn} +\usepackage{multirow} +\usetheme{boxes} +\usepackage{graphicx} +\usepackage{adjustbox} +%\useoutertheme[footline=authortitle,subsection=false]{miniframes} +%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes} +\useoutertheme{infolines} +\setbeamertemplate{headline}{} + +\beamertemplatenavigationsymbolsempty + +\definecolor{TitleOrange}{RGB}{255,137,0} +\setbeamercolor{title}{fg=TitleOrange} +\setbeamercolor{frametitle}{fg=TitleOrange} + +\definecolor{ListOrange}{RGB}{255,145,5} +\setbeamertemplate{itemize item}{\color{ListOrange}$\blacktriangleright$} + +\definecolor{verygrey}{RGB}{70,70,70} +\setbeamercolor{normal text}{fg=verygrey} + + +\usepackage{tabu} +\usepackage{multicol} +\usepackage{vwcol} +\usepackage{stmaryrd} +\usepackage{graphicx} + +\usepackage[normalem]{ulem} + +\AtBeginSection[]{ + \begin{frame} + \vfill + \centering + \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} + \usebeamerfont{title}\insertsectionhead\par% + \end{beamercolorbox} + \vfill + \end{frame} +} + +\title{Garage} +\subtitle{a lightweight and robust geo-distributed data storage system} +\author{Deuxfleurs Association} +\date{Inria, 2023-01-18} + +\begin{document} + +\begin{frame} + \centering + \includegraphics[width=.3\linewidth]{../../sticker/Garage.pdf} + \vspace{1em} + + {\large\bf Deuxfleurs Association} + \vspace{1em} + + \url{https://garagehq.deuxfleurs.fr/} + + Matrix channel: \texttt{\#garage:deuxfleurs.fr} +\end{frame} + +\begin{frame} + \frametitle{Who we are} + \begin{columns}[t] + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.4\linewidth, valign=t]{assets/alex.jpg} + \end{column} + \begin{column}{.6\textwidth} + \textbf{Alex Auvolat}\\ + PhD; co-founder of Deuxfleurs + \end{column} + \begin{column}{.2\textwidth} + ~ + \end{column} + \end{columns} + \vspace{1em} + + \begin{columns}[t] + \begin{column}{.2\textwidth} + ~ + \end{column} + \begin{column}{.6\textwidth} + \textbf{Quentin Dufour}\\ + PhD; co-founder of Deuxfleurs + \end{column} + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.5\linewidth, valign=t]{assets/quentin.jpg} + \end{column} + \end{columns} + \vspace{2em} + + \begin{columns}[t] + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.5\linewidth, valign=t]{assets/deuxfleurs.pdf} + \end{column} + \begin{column}{.6\textwidth} + \textbf{Deuxfleurs}\\ + A non-profit self-hosting collective,\\ + member of the CHATONS network + \end{column} + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.7\linewidth, valign=t]{assets/logo_chatons.png} + \end{column} + \end{columns} + +\end{frame} + +\begin{frame} + \frametitle{Our objective at Deuxfleurs} + + \begin{center} + \textbf{Promote self-hosting and small-scale hosting\\ + as an alternative to large cloud providers} + \end{center} + \vspace{2em} + \visible<2->{ + Why is it hard? + } + \visible<3->{ + \vspace{2em} + \begin{center} + \textbf{\underline{Resilience}}\\ + {\footnotesize (we want good uptime/availability with low supervision)} + \end{center} + } +\end{frame} + +\begin{frame} + \frametitle{How to make a \underline{stable} system} + + Enterprise-grade systems typically employ: + \vspace{1em} + \begin{itemize} + \item RAID + \item Redundant power grid + UPS + \item Redundant Internet connections + \item Low-latency links + \item ... + \end{itemize} + \vspace{1em} + $\to$ it's costly and only worth it at DC scale +\end{frame} + +\begin{frame} + \frametitle{How to make a \underline{resilient} system} + + \only<1,4-5>{ + Instead, we use: + \vspace{1em} + \begin{itemize} + \item \textcolor<2->{gray}{Commodity hardware (e.g. old desktop PCs)} + \vspace{.5em} + \item<4-> \textcolor<5->{gray}{Commodity Internet (e.g. FTTB, FTTH) and power grid} + \vspace{.5em} + \item<5-> \textcolor<6->{gray}{\textbf{Geographical redundancy} (multi-site replication)} + \end{itemize} + } + \only<2>{ + \begin{center} + \includegraphics[width=.8\linewidth]{assets/atuin.jpg} + \end{center} + } + \only<3>{ + \begin{center} + \includegraphics[width=.8\linewidth]{assets/neptune.jpg} + \end{center} + } + \only<6>{ + \begin{center} + \includegraphics[width=.5\linewidth]{assets/inframap.jpg} + \end{center} + } +\end{frame} + +\begin{frame} + \frametitle{How to make this happen} + \begin{center} + \only<1>{\includegraphics[width=.8\linewidth]{assets/slide1.png}}% + \only<2>{\includegraphics[width=.8\linewidth]{assets/slide2.png}}% + \only<3>{\includegraphics[width=.8\linewidth]{assets/slide3.png}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Distributed file systems are slow} + File systems are complex, for example: + \vspace{1em} + \begin{itemize} + \item Concurrent modification by several processes + \vspace{1em} + \item Folder hierarchies + \vspace{1em} + \item Other requirements of the POSIX spec + \end{itemize} + \vspace{1em} + Coordination in a distributed system is costly + + \vspace{1em} + Costs explode with commodity hardware / Internet connections\\ + {\small (we experienced this!)} +\end{frame} + +\begin{frame} + \frametitle{A simpler solution: object storage} + Only two operations: + \vspace{1em} + \begin{itemize} + \item Put an object at a key + \vspace{1em} + \item Retrieve an object from its key + \end{itemize} + \vspace{1em} + {\footnotesize (and a few others)} + + \vspace{1em} + Sufficient for many applications! +\end{frame} + +\begin{frame} + \frametitle{A simpler solution: object storage} + \begin{center} + \includegraphics[height=6em]{../2020-12-02_wide-team/img/Amazon-S3.jpg} + \hspace{3em} + \includegraphics[height=5em]{assets/minio.png} + \hspace{3em} + \includegraphics[height=6em]{../../logo/garage_hires_crop.png} + \end{center} + \vspace{1em} + S3: a de-facto standard, many compatible applications + + \vspace{1em} + + MinIO is self-hostable but not suited for geo-distributed deployments + + \vspace{1em} + + \textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store} +\end{frame} + + +\begin{frame} + \frametitle{The data model of object storage} + Object storage is basically a key-value store: + \vspace{1em} + + \begin{center} + \begin{tabular}{|l|p{8cm}|} + \hline + \textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\ + \hline + \hline + \texttt{index.html} & + \texttt{Content-Type: text/html; charset=utf-8} \newline + \texttt{Content-Length: 24929} \newline + \texttt{} \\ + \hline + \texttt{img/logo.svg} & + \texttt{Content-Type: text/svg+xml} \newline + \texttt{Content-Length: 13429} \newline + \texttt{} \\ + \hline + \texttt{download/index.html} & + \texttt{Content-Type: text/html; charset=utf-8} \newline + \texttt{Content-Length: 26563} \newline + \texttt{} \\ + \hline + \end{tabular} + \end{center} + +\end{frame} + +\begin{frame} + \frametitle{Two big problems} + \begin{enumerate} + \item \textbf{How to place data on different nodes?}\\ + \vspace{1em} + \underline{Constraints:} heterogeneous hardware\\ + \underline{Objective:} $n$ copies of everything, maximize usable capacity, maximize resilience\\ + \vspace{1em} + $\to$ the Dynamo model + optimization algorithms + \vspace{2em} + \item<2-> \textbf{How to guarantee consistency?}\\ + \vspace{1em} + \underline{Constraints:} slow network (geographical distance), node unavailability/crashes\\ + \underline{Objective:} maximize availability, read-after-write guarantee\\ + \vspace{1em} + $\to$ CRDTs, monotonicity, read and write quorums + \end{enumerate} +\end{frame} + +\section{Problem 1: placing data} + +\begin{frame} + \frametitle{Key-value stores, upgraded: the Dynamo model} + \textbf{Two keys:} + \begin{itemize} + \item Partition key: used to divide data into partitions (shards) + \item Sort key: used to identify items inside a partition + \end{itemize} + + \vspace{1em} + + \begin{center} + \begin{tabular}{|l|l|p{3cm}|} + \hline + \textbf{Partition key: bucket} & \textbf{Sort key: filename} & \textbf{Value} \\ + \hline + \hline + \texttt{website} & \texttt{index.html} & (file data) \\ + \hline + \texttt{website} & \texttt{img/logo.svg} & (file data) \\ + \hline + \texttt{website} & \texttt{download/index.html} & (file data) \\ + \hline + \hline + \texttt{backup} & \texttt{borg/index.2822} & (file data) \\ + \hline + \texttt{backup} & \texttt{borg/data/2/2329} & (file data) \\ + \hline + \texttt{backup} & \texttt{borg/data/2/2680} & (file data) \\ + \hline + \hline + \texttt{private} & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\ + \hline + \end{tabular} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Key-value stores, upgraded: the Dynamo model} + \begin{itemize} + \item Data with different partition keys is stored independantly,\\ + on a different set of nodes\\ + \vspace{.5em} + $\to$ no easy way to list all partition keys\\ + $\to$ no cross-shard transactions\\ + \vspace{2em} + \item Placing data: hash the partition key, select nodes accordingly\\ + \vspace{.5em} + $\to$ distributed hash table (DHT) + \vspace{2em} + \item For a given value of the partition key, items can be listed using their sort keys + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{How to spread files over different cluster nodes?} + \textbf{Consistent hashing (Dynamo):} + \vspace{1em} + + \begin{center} + \only<1>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_1.pdf}}% + \only<2>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_2.pdf}}% + \only<3>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_3.pdf}}% + \only<4>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_4.pdf}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Constraint: location-awareness} + \begin{center} + \includegraphics[width=\linewidth]{assets/location-aware.png} + \end{center} + \vspace{2em} + Garage replicates data on different zones when possible +\end{frame} + +\begin{frame} + \frametitle{Constraint: location-awareness} + \begin{center} + \includegraphics[width=.8\linewidth]{assets/map.png} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Issues with consistent hashing} + \begin{itemize} + \item Consistent hashing doesn't dispatch data based on geographical location of nodes + \vspace{1em} + \item<2-> Geographically aware adaptation, try 1:\\ + data quantities not well balanced between nodes + \vspace{1em} + \item<3-> Geographically aware adaptation, try 2:\\ + too many reshuffles when adding/removing nodes + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{How to spread files over different cluster nodes?} + \textbf{Garage's method: build an index table} + \vspace{1em} + + Realization: we can actually precompute an optimal solution + \vspace{1em} + + \visible<2->{ + \begin{center} + \begin{tabular}{|l|l|l|l|} + \hline + \textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\ + \hline + \hline + Partition 0 & Io (jupiter) & Drosera (atuin) & Courgette (neptune) \\ + \hline + Partition 1 & Datura (atuin) & Courgette (neptune) & Io (jupiter) \\ + \hline + Partition 2 & Io(jupiter) & Celeri (neptune) & Drosera (atuin) \\ + \hline + \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\ + \hline + Partition 255 & Concombre (neptune) & Io (jupiter) & Drosera (atuin) \\ + \hline + \end{tabular} + \end{center} + } + \vspace{1em} + \visible<3->{ + The index table is built centrally using an optimal algorithm,\\ + then propagated to all nodes + } +\end{frame} + +\begin{frame} + \frametitle{The relationship between \emph{partition} and \emph{partition key}} + \begin{center} + \begin{tabular}{|l|l|l|l|} + \hline + \textbf{Partition key} & \textbf{Partition} & \textbf{Sort key} & \textbf{Value} \\ + \hline + \hline + \texttt{website} & Partition 12 & \texttt{index.html} & (file data) \\ + \hline + \texttt{website} & Partition 12 & \texttt{img/logo.svg} & (file data) \\ + \hline + \texttt{website} & Partition 12 &\texttt{download/index.html} & (file data) \\ + \hline + \hline + \texttt{backup} & Partition 42 & \texttt{borg/index.2822} & (file data) \\ + \hline + \texttt{backup} & Partition 42 & \texttt{borg/data/2/2329} & (file data) \\ + \hline + \texttt{backup} & Partition 42 & \texttt{borg/data/2/2680} & (file data) \\ + \hline + \hline + \texttt{private} & Partition 42 & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\ + \hline + \end{tabular} + \end{center} + \vspace{1em} + \textbf{To read or write an item:} hash partition key + \\ \hspace{5cm} $\to$ determine partition number (first 8 bits) + \\ \hspace{5cm} $\to$ find associated nodes +\end{frame} + +\begin{frame} + \frametitle{Garage's internal data structures} + \centering + \includegraphics[width=.75\columnwidth]{assets/garage_tables.pdf} +\end{frame} + +\begin{frame} + \frametitle{Storing and retrieving files} + \begin{center} + \only<1>{\includegraphics[width=.45\linewidth]{assets/garage2a.drawio.pdf}}% + \only<2>{\includegraphics[width=.45\linewidth]{assets/garage2b.drawio.pdf}}% + \end{center} +\end{frame} + +\section{Problem 2: ensuring consistency} + +%\begin{frame} +% \frametitle{Garage's architecture} +% \begin{center} +% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf} +% \end{center} +%\end{frame} + +\begin{frame} + \frametitle{Garage is \emph{coordination-free}:} + \begin{itemize} + \item No Raft or Paxos + \vspace{1em} + \item Internal data types are CRDTs + \vspace{1em} + \item All nodes are equivalent (no master/leader/index node) + \end{itemize} + \vspace{2em} + $\to$ less sensitive to higher latencies between nodes +\end{frame} + +\begin{frame} + \frametitle{Consistency model} + \begin{itemize} + \item Not ACID (not required by S3 spec) / not linearizable + \vspace{1em} + \item \textbf{Read-after-write consistency}\\ + {\footnotesize (stronger than eventual consistency)} + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Impact on performances} + \begin{center} + \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png} + \end{center} +\end{frame} + + +\begin{frame} + \frametitle{An ever-increasing compatibility list} + \begin{center} + \includegraphics[width=.7\linewidth]{assets/compatibility.png} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Further plans for Garage} + \begin{center} + \only<1>{\includegraphics[width=.8\linewidth]{assets/slideB1.png}}% + \only<2>{\includegraphics[width=.8\linewidth]{assets/slideB2.png}}% + \only<3>{\includegraphics[width=.8\linewidth]{assets/slideB3.png}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{K2V Design} + \begin{itemize} + \item A new, custom, minimal API + \vspace{1em} + \item<2-> Exposes the partitoning mechanism of Garage\\ + K2V = partition key / sort key / value (like Dynamo) + \vspace{1em} + \item<3-> Coordination-free, CRDT-friendly (inspired by Riak)\\ + \vspace{1em} + \item<4-> Cryptography-friendly: values are binary blobs + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Application: an e-mail storage server} + \begin{center} + \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme.png}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Aerogramme data model} + \begin{center} + \only<1>{\includegraphics[width=.4\linewidth]{assets/aerogramme_datatype.drawio.pdf}}% + \only<2->{\includegraphics[width=.9\linewidth]{assets/aerogramme_keys.drawio.pdf}\vspace{1em}}% + \end{center} + \visible<3->{Aerogramme encrypts all stored values for privacy\\ + (Garage server administrators can't read your mail)} +\end{frame} + +\begin{frame} + \frametitle{Different deployment scenarios} + \begin{center} + \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components1.drawio.pdf}}% + \only<2>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components2.drawio.pdf}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{A new model for building resilient software} + \begin{itemize} + \item Design a data model suited to K2V\\ + {\footnotesize (see Cassandra docs on porting SQL data models to Cassandra)} + \vspace{1em} + \begin{itemize} + \item Use CRDTs or other eventually consistent data types (see e.g. Bayou) + \vspace{1em} + \item Store opaque binary blobs to provide End-to-End Encryption\\ + \end{itemize} + \vspace{1em} + \item Store big blobs (files) in S3 + \vspace{1em} + \item Let Garage manage sharding, replication, failover, etc. + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Research perspectives} + \begin{itemize} + \item Write about Garage's global architecture \emph{(paper in progress)} + \vspace{1em} + \item Measure and improve Garage's performances + \vspace{1em} + \item Discuss the optimal layout algorithm, provide proofs + \vspace{1em} + \item Write about our proposed architecture for (E2EE) apps over K2V+S3 + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Where to find us} + \begin{center} + \includegraphics[width=.25\linewidth]{../../logo/garage_hires.png}\\ + \vspace{-1em} + \url{https://garagehq.deuxfleurs.fr/}\\ + \url{mailto:garagehq@deuxfleurs.fr}\\ + \texttt{\#garage:deuxfleurs.fr} on Matrix + + \vspace{1.5em} + \includegraphics[width=.06\linewidth]{assets/rust_logo.png} + \includegraphics[width=.13\linewidth]{assets/AGPLv3_Logo.png} + \end{center} +\end{frame} + +\end{document} + +%% vim: set ts=4 sw=4 tw=0 noet spelllang=en :