diff --git a/doc/talks/2024-01-12-seed/.gitignore b/doc/talks/2024-01-12-seed/.gitignore new file mode 100644 index 000000000..9f1f00e6e --- /dev/null +++ b/doc/talks/2024-01-12-seed/.gitignore @@ -0,0 +1,17 @@ +* + +!*.txt +!*.md + +!assets + +!.gitignore +!*.svg +!*.png +!*.jpg +!*.tex +!Makefile +!.gitignore +!assets/*.drawio.pdf + +!talk.pdf diff --git a/doc/talks/2024-01-12-seed/Makefile b/doc/talks/2024-01-12-seed/Makefile new file mode 100644 index 000000000..033a8af94 --- /dev/null +++ b/doc/talks/2024-01-12-seed/Makefile @@ -0,0 +1,10 @@ +ASSETS=assets/deuxfleurs.pdf + +talk.pdf: talk.tex $(ASSETS) + pdflatex talk.tex + +assets/%.pdf: assets/%.svg + inkscape -D -z --file=$^ --export-pdf=$@ + +assets/%.pdf_tex: assets/%.svg + inkscape -D -z --file=$^ --export-pdf=$@ --export-latex diff --git a/doc/talks/2024-01-12-seed/abstract.md b/doc/talks/2024-01-12-seed/abstract.md new file mode 100644 index 000000000..b26588686 --- /dev/null +++ b/doc/talks/2024-01-12-seed/abstract.md @@ -0,0 +1,39 @@ +### (fr) Garage, un système de stockage de données géo-distribué léger et robuste + +Garage est un système de stockage de données léger, géo-distribué, qui +implémente le protocole de stockage S3 d'Amazon. Garage est destiné +principalement à l'auto-hébergement sur du matériel courant d'occasion. À ce +titre, il doit tolérer un grand nombre de pannes: coupures de courant, coupures +de connexion Internet, pannes de machines, ... Il doit également être facile à +déployer et à maintenir, afin de pouvoir être facilement utilisé par des +amateurs ou des petites organisations. + +Cette présentation vous proposera un aperçu de Garage et du choix technique +principal qui rend un système comme Garage possible: le refus d'utiliser des +algorithmes de consensus, remplacés avantageusement par des méthodes à +cohérence faible. Notre modèle est fortement inspiré de la base de données +Dynamo (DeCandia et al, 2007), et fait usage des types de données CRDT (Shapiro +et al, 2011). 
Nous explorerons comment ces méthodes s'appliquent à la +construction de l'abstraction "stockage objet" dans un système distribué, et +quelles autres abstractions peuvent ou ne peuvent pas être construites dans ce +modèle. + +### (en) Garage, a lightweight and robust geo-distributed data storage system + +Garage is a lightweight geo-distributed data store that implements the Amazon +S3 object storage protocol. Garage is meant primarily for self-hosting at home +on second-hand commodity hardware, meaning it has to tolerate a wide variety of +failure scenarios such as power cuts, Internet disconnections and machine +crashes. It also has to be easy to deploy and maintain, so that hobbyists and +small organizations can use it without trouble. + +This talk will present Garage and the key technical choice that made Garage +possible: refusing to use consensus algorithms and using instead weak +consistency methods, with a model that is loosely based on that of the Dynamo +database (DeCandia et al, 2007) and that makes heavy use of conflict-free +replicated data types (Shapiro et al, 2011). We will explore how these methods +are suited to building the "object store" abstraction in a distributed system, +and what other abstractions are possible or impossible to build in this model. 
+ + + diff --git a/doc/talks/2024-01-12-seed/assets/AGPLv3_Logo.png b/doc/talks/2024-01-12-seed/assets/AGPLv3_Logo.png new file mode 100644 index 000000000..445284a37 Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/AGPLv3_Logo.png differ diff --git a/doc/talks/2024-01-12-seed/assets/alex.jpg b/doc/talks/2024-01-12-seed/assets/alex.jpg new file mode 100644 index 000000000..eac0f0a94 Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/alex.jpg differ diff --git a/doc/talks/2024-01-12-seed/assets/atuin.jpg b/doc/talks/2024-01-12-seed/assets/atuin.jpg new file mode 100644 index 000000000..f2fbd61db Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/atuin.jpg differ diff --git a/doc/talks/2024-01-12-seed/assets/deuxfleurs.svg b/doc/talks/2024-01-12-seed/assets/deuxfleurs.svg new file mode 100644 index 000000000..c298c22bb --- /dev/null +++ b/doc/talks/2024-01-12-seed/assets/deuxfleurs.svg @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + D + F + diff --git a/doc/talks/2024-01-12-seed/assets/garage2.drawio.png b/doc/talks/2024-01-12-seed/assets/garage2.drawio.png new file mode 100644 index 000000000..8562fbcfa Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/garage2.drawio.png differ diff --git a/doc/talks/2024-01-12-seed/assets/logo_chatons.png b/doc/talks/2024-01-12-seed/assets/logo_chatons.png new file mode 100644 index 000000000..890cf17ee Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/logo_chatons.png differ diff --git a/doc/talks/2024-01-12-seed/assets/map.png b/doc/talks/2024-01-12-seed/assets/map.png new file mode 100644 index 000000000..1dff3ab68 Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/map.png differ diff --git a/doc/talks/2024-01-12-seed/assets/minio.png b/doc/talks/2024-01-12-seed/assets/minio.png new file mode 100644 index 000000000..a71e9ccc8 Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/minio.png differ diff --git a/doc/talks/2024-01-12-seed/assets/neptune.jpg 
b/doc/talks/2024-01-12-seed/assets/neptune.jpg new file mode 100644 index 000000000..61fcbff63 Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/neptune.jpg differ diff --git a/doc/talks/2024-01-12-seed/assets/rust_logo.png b/doc/talks/2024-01-12-seed/assets/rust_logo.png new file mode 100644 index 000000000..0e4809ec3 Binary files /dev/null and b/doc/talks/2024-01-12-seed/assets/rust_logo.png differ diff --git a/doc/talks/2024-01-12-seed/talk.pdf b/doc/talks/2024-01-12-seed/talk.pdf new file mode 100644 index 000000000..b48497a70 Binary files /dev/null and b/doc/talks/2024-01-12-seed/talk.pdf differ diff --git a/doc/talks/2024-01-12-seed/talk.tex b/doc/talks/2024-01-12-seed/talk.tex new file mode 100644 index 000000000..e7b4e2c27 --- /dev/null +++ b/doc/talks/2024-01-12-seed/talk.tex @@ -0,0 +1,370 @@ +\nonstopmode +\documentclass[aspectratio=169]{beamer} +\usepackage[utf8]{inputenc} +% \usepackage[frenchb]{babel} +\usepackage{amsmath} +\usepackage{mathtools} +\usepackage{breqn} +\usepackage{multirow} +\usetheme{boxes} +\usepackage{graphicx} +\usepackage{import} +\usepackage{adjustbox} +%\useoutertheme[footline=authortitle,subsection=false]{miniframes} +%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes} +\useoutertheme{infolines} +\setbeamertemplate{headline}{} + +\beamertemplatenavigationsymbolsempty + +\definecolor{TitleOrange}{RGB}{255,137,0} +\setbeamercolor{title}{fg=TitleOrange} +\setbeamercolor{frametitle}{fg=TitleOrange} + +\definecolor{ListOrange}{RGB}{255,145,5} +\setbeamertemplate{itemize item}{\color{ListOrange}$\blacktriangleright$} + +\definecolor{verygrey}{RGB}{70,70,70} +\setbeamercolor{normal text}{fg=verygrey} + + +\usepackage{tabu} +\usepackage{multicol} +\usepackage{vwcol} +\usepackage{stmaryrd} +\usepackage{graphicx} + +\usepackage[normalem]{ulem} + +\AtBeginSection[]{ + \begin{frame} + \vfill + \centering + \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} + 
\usebeamerfont{title}\insertsectionhead\par% + \end{beamercolorbox} + \vfill + \end{frame} +} + +\title{Garage} +\subtitle{a lightweight and robust geo-distributed data storage system} +\author{Alex Auvolat, Deuxfleurs} +\date{SEED webinar, 2024-01-12} + +\begin{document} + +% \begin{frame} +% \centering +% \includegraphics[width=.3\linewidth]{../../sticker/Garage.png} +% \vspace{1em} +% +% {\large\bf Alex Auvolat, Deuxfleurs Association} +% \vspace{1em} +% +% \url{https://garagehq.deuxfleurs.fr/} +% +% %Matrix channel: \texttt{\#garage:deuxfleurs.fr} +% \end{frame} + +\begin{frame} + %\frametitle{Who I am} + \begin{columns}[t] + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.4\linewidth, valign=t]{assets/alex.jpg} + \end{column} + \begin{column}{.6\textwidth} + \textbf{Alex Auvolat}\\ + Member of Deuxfleurs, lead developer of Garage + \end{column} + \begin{column}{.2\textwidth} + ~ + \end{column} + \end{columns} + \vspace{.5em} + + \begin{columns}[t] + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.6\linewidth, valign=t]{../../logo/garage-notext.png} + \end{column} + \begin{column}{.6\textwidth} + \\\textbf{Garage}\\ + A self-hosted alternative to S3 for object storage + \end{column} + \begin{column}{.2\textwidth} + ~ + \end{column} + \end{columns} + \vspace{2em} + + \begin{columns}[t] + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.5\linewidth, valign=t]{assets/deuxfleurs.pdf} + \end{column} + \begin{column}{.6\textwidth} + \textbf{Deuxfleurs}\\ + A non-profit self-hosting collective,\\ + member of the CHATONS network + \end{column} + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.7\linewidth, valign=t]{assets/logo_chatons.png} + \end{column} + \end{columns} + +\end{frame} + +\begin{frame} + \frametitle{Stable vs Resilient} + + \hspace{1em} + \begin{minipage}{7cm} + \textbf{Building a "stable" system:} + \vspace{1em} + + Enterprise-grade systems typically employ: + 
\vspace{1em} + \begin{itemize} + \item RAID + \item Redundant power grid + UPS + \item Redundant Internet connections + \item Low-latency links + \item ... + \end{itemize} + \vspace{1em} + $\to$ costly, only worth it at DC scale\\ + $\to$ still risk of DC-level incident... + \end{minipage} + \hfill + \begin{minipage}{7cm} + \textbf{Building a \underline{resilient} system:} + \vspace{1em} + + An alternative, cheaper way: + \vspace{1em} + \begin{itemize} + \item Commodity hardware \\(e.g. old desktop PCs) + \vspace{.5em} + \item Commodity Internet \\(e.g. FTTB, FTTH) and power grid + \vspace{.5em} + \item \textbf{Geographical redundancy} \\(multi-site replication) + \end{itemize} + \vspace{1.5em} + \end{minipage} + \hspace{1em} +\end{frame} + +\begin{frame} + \frametitle{Example: our infrastructure at Deuxfleurs} + \only<1>{ + \begin{center} + \includegraphics[width=.8\linewidth]{assets/neptune.jpg} + \end{center} + } + \only<2>{ + \begin{center} + \includegraphics[width=.8\linewidth]{assets/atuin.jpg} + \end{center} + } + \only<3>{ + \begin{center} + \includegraphics[width=.8\linewidth]{assets/inframap_jdll2023.pdf} + \end{center} + } +\end{frame} + +\begin{frame} + \frametitle{Object storage: simpler than file systems} + + \begin{minipage}{6cm} + Only two operations: + \vspace{1em} + \begin{itemize} + \item Put an object at a key + \vspace{1em} + \item Retrieve an object from its key + \end{itemize} + \vspace{1em} + {\footnotesize (and a few others)} + + \vspace{1em} + Sufficient for many applications! 
+ \end{minipage} + \hfill + \begin{minipage}{8cm} + \begin{center} + \vspace{2em} + \includegraphics[height=6em]{../2020-12-02_wide-team/img/Amazon-S3.jpg} + \hspace{2em} + \includegraphics[height=5em]{assets/minio.png} + + \vspace{2em} + \includegraphics[height=6em]{../../logo/garage_hires_crop.png} + \end{center} + \vspace{1em} + \end{minipage} +\end{frame} + + +\begin{frame} + \frametitle{The data model of object storage} + Object storage is basically a key-value store: + \vspace{1em} + + \begin{center} + \begin{tabular}{|l|p{8cm}|} + \hline + \textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\ + \hline + \hline + \texttt{index.html} & + \texttt{Content-Type: text/html; charset=utf-8} \newline + \texttt{Content-Length: 24929} \newline + \texttt{} \\ + \hline + \texttt{img/logo.svg} & + \texttt{Content-Type: text/svg+xml} \newline + \texttt{Content-Length: 13429} \newline + \texttt{} \\ + \hline + \texttt{download/index.html} & + \texttt{Content-Type: text/html; charset=utf-8} \newline + \texttt{Content-Length: 26563} \newline + \texttt{} \\ + \hline + \end{tabular} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Implementation: consensus vs weak consistency} + + \hspace{1em} + \begin{minipage}{7cm} + \textbf{Consensus-based systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Leader-based:} a leader is elected to coordinate + all reads and writes + \vspace{1em} + \item Allows for \textbf{sequential reasoning}: + program as if running on a single machine + \vspace{1em} + \item Serializability is one of the \\ + \textbf{strongest consistency guarantees} + \vspace{1em} + \item \textbf{Costly}, the leader is a bottleneck; + leader elections on failure take time + \end{itemize} + \end{minipage} + \hfill + \begin{minipage}{7cm} \visible<2->{ + \textbf{Weakly consistent systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Nodes are equivalent}, any node + can originate a read or write operation + \vspace{1em} + \item 
\textbf{Operations must be independent}, + conflicts are resolved after the fact + \vspace{1em} + \item Strongest achievable consistency:\\ + \textbf{read-after-write consistency}\\(using quorums) + \vspace{1em} + \item \textbf{Fast}, no single bottleneck;\\ + works transparently with offline nodes + \end{itemize} + } \end{minipage} + \hspace{1em} +\end{frame} + +\begin{frame} + \frametitle{Why avoid consensus?} + Consensus can be implemented reasonably well in practice, so why avoid it? + \vspace{2em} + \begin{itemize} + \item \textbf{Software complexity:} RAFT and PAXOS are complex beasts;\\ + harder to prove, harder to reason about + \vspace{1.5em} + \item \textbf{Performance issues:} + \vspace{1em} + \begin{itemize} + \item Taking a decision may take an \textbf{arbitrary number of steps} (in adverse scenarios) + \vspace{1em} + \item The leader is a \textbf{bottleneck} for all requests;\\ + even in leaderless approaches, \textbf{all nodes must process all operations in order} + \vspace{1em} + \item Particularly \textbf{sensitive to higher latency} between nodes + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Objective: the right level of consistency for Garage} + + \underline{Constraints:} slow network (geographical distance), node unavailability/crashes\\ + \underline{Objective:} maximize availability, maintain an \emph{appropriate level of consistency}\\ + \vspace{1em} + \begin{enumerate} + \item<2-> \textbf{Weak consistency for most things}\\ + \vspace{1em} + \underline{Example:} \texttt{PutObject}\\ + \vspace{.5em} + If two clients write the same + object at the same time, one of the two is implicitly overwritten. + No need to coordinate, use a \emph{last-writer-wins register}. 
+ \vspace{1em} + \item<3-> \textbf{Stronger consistency only when necessary}\\ + \vspace{1em} + \underline{Example:} \texttt{CreateBucket}\\ + \vspace{.5em} + A bucket is a reserved name in a shared namespace, + two clients should be prevented from both creating the same bucket + (\emph{mutual exclusion}). + \end{enumerate} +\end{frame} + +\begin{frame} + \frametitle{The possibility of \emph{leaderless consensus}} + Currently, Garage \emph{only has weak consistency}. It is fast, but \texttt{CreateBucket} is broken! + + \visible<2->{ + \vspace{1em} + Leaderless consensus (Antoniadis et al., 2023) alleviates issues with RAFT and PAXOS: + \vspace{1em} + \begin{itemize} + \item \textbf{No leader.} All nodes participate equally at each time step, + and different nodes can be unavailable at different times without issues. + \\ \vspace{.5em} $\to$ better tolerance to high latency (removes the bottleneck issue) + \\ $\to$ tolerates crashes transparently + \vspace{1em} + \item \textbf{Simpler formalization.} The algorithm is very simple to express and to analyze in mathematical terms. + \end{itemize} + } + \visible<3->{ + \vspace{1em} + One of the possible subjects for this PhD: + \\$\to$ \emph{integration of leaderless consensus in Garage} + testing + perf eval, etc. + } +\end{frame} + +\begin{frame} + \begin{center} + \includegraphics[width=.25\linewidth]{../../logo/garage_hires.png}\\ + \vspace{-1em} + \url{https://garagehq.deuxfleurs.fr/}\\ + \url{mailto:garagehq@deuxfleurs.fr}\\ + \texttt{\#garage:deuxfleurs.fr} on Matrix + + \vspace{1.5em} + \includegraphics[width=.06\linewidth]{assets/rust_logo.png} + \includegraphics[width=.13\linewidth]{assets/AGPLv3_Logo.png} + \end{center} +\end{frame} + +\end{document} + +%% vim: set ts=4 sw=4 tw=0 noet spelllang=en :