% Listing metadata: 841 lines, 23 KiB, TeX
\nonstopmode
|
|
\documentclass[aspectratio=169,xcolor={svgnames}]{beamer}
|
|
\usepackage[utf8]{inputenc}
|
|
% \usepackage[frenchb]{babel}
|
|
\usepackage{amsmath}
|
|
\usepackage{mathtools}
|
|
\usepackage{breqn}
|
|
\usepackage{multirow}
|
|
\usetheme{boxes}
|
|
\usepackage{graphicx}
|
|
\usepackage{import}
|
|
\usepackage{adjustbox}
|
|
\usepackage[absolute,overlay]{textpos}
|
|
%\useoutertheme[footline=authortitle,subsection=false]{miniframes}
|
|
%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes}
|
|
\useoutertheme{infolines}
|
|
\setbeamertemplate{headline}{}
|
|
|
|
\beamertemplatenavigationsymbolsempty
|
|
|
|
\definecolor{TitleOrange}{RGB}{255,137,0}
|
|
\setbeamercolor{title}{fg=TitleOrange}
|
|
\setbeamercolor{frametitle}{fg=TitleOrange}
|
|
|
|
\definecolor{ListOrange}{RGB}{255,145,5}
|
|
\setbeamertemplate{itemize item}{\color{ListOrange}$\blacktriangleright$}
|
|
|
|
\definecolor{verygrey}{RGB}{70,70,70}
|
|
\setbeamercolor{normal text}{fg=verygrey}
|
|
|
|
|
|
\usepackage{tabu}
|
|
\usepackage{multicol}
|
|
\usepackage{vwcol}
|
|
\usepackage{stmaryrd}
|
|
\usepackage{graphicx}
|
|
|
|
\usepackage[normalem]{ulem}
|
|
|
|
% Section divider: at the start of every \section, insert a frame that shows
% only the section heading, vertically centred (\vfill above and below) in a
% rounded, shadowed box using the "title" beamer colours and font.
% The empty optional argument [] is what beamer inserts for starred sections
% (\section*), i.e. nothing.
\AtBeginSection[]{
  \begin{frame}
    \vfill
    \centering
    \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
      % \par ends the heading paragraph while the title font is still active.
      \usebeamerfont{title}\insertsectionhead\par%
    \end{beamercolorbox}
    \vfill
  \end{frame}
}
|
|
|
|
\title{Deuxfleurs : une infrastructure numérique distribuée low-tech}
|
|
\author{Armaël Guéneau, Baptiste Jonglez}
|
|
\date{2025-04-03}
|
|
|
|
\begin{document}
|
|
|
|
% Title slide: logo at the top, then the talk title and one line per speaker
% (name, Inria team), all centred.
\begin{frame}
  \centering
  \includegraphics[width=.2\linewidth]{assets/logos/deuxfleurs.pdf}
  \vfill

  % \bfseries replaces the obsolete two-letter font switch \bf.
  {\large\bfseries Deuxfleurs : une infrastructure numérique distribuée low-tech}
  \vspace{1em}

  % No \\ after the last speaker line: a line break at the very end of a
  % paragraph only produces an empty, underfull line.
  {Armaël Guéneau, Inria Toccata} \\
  {Baptiste Jonglez, Inria Stack}
\end{frame}
|
|
|
|
\begin{frame}{Qui sommes-nous ?}
|
|
|
|
\end{frame}
|
|
|
|
\begin{frame}{L'association Deuxfleurs}
|
|
|
|
|
|
|
|
Deuxfleurs est une association loi 1901 qui héberge des services numériques
|
|
|
|
\vfill
|
|
|
|
Objectif : \\ \textbf{Construire une infrastructure numérique robuste,
|
|
utile, et gérée en commun}
|
|
|
|
\vfill
|
|
|
|
membre du collectif CHATONS (hébergeurs associatifs)
|
|
\end{frame}
|
|
|
|
\begin{frame}{Choix politiques}
|
|
|
|
\begin{enumerate}
|
|
\setlength\itemsep{1.5em}
|
|
\item \textbf{Réduire les dépendances} aux acteurs dominants, maîtriser les dépendances
|
|
inévitables (électricité, réseau). Jusqu'à maîtriser toute la pile
|
|
logicielle, matérielle et hébergement.
|
|
|
|
\item \textbf{Sobriété radicale :} empreinte matérielle minimale et fixe. Adapter les
|
|
usages aux ressources et non l'inverse.
|
|
|
|
\item \textbf{Bonne qualité de service :} alternative crédible aux GAFAM (CHATONS
|
|
souvent critiqués pour leur qualité de service)
|
|
\end{enumerate}
|
|
|
|
\end{frame}
|
|
|
|
\begin{frame}{Conséquences de ces choix}
|
|
|
|
\begin{itemize}
|
|
\setlength\itemsep{1.5em}
|
|
\item \textbf{hébergement hors datacenters}, à la maison \\
|
|
$\Longrightarrow$ électricité et réseau domestique pas fiables
|
|
\item \textbf{matériel reconditionné peu puissant}, de capacité fixe \\
|
|
$\Longrightarrow$
|
|
adapter les services et usages en fonction des limites (RAM, CPU, stockage)
|
|
\item \textbf{infrastructure géo-distribuée et tolérante aux fautes} pour assurer une bonne
|
|
disponibilité \\
|
|
$\Longrightarrow$ besoin de logiciels adaptés
|
|
\end{itemize}
|
|
|
|
\end{frame}
|
|
|
|
\begin{frame}[plain]
|
|
\includegraphics[width=\textwidth]{infra.jpg}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\begin{center}
|
|
\textbf{
|
|
Qu'est-ce qui rend l'infra Deuxfleurs intéressante dans un contexte recherche ?
|
|
}
|
|
\end{center}
|
|
|
|
\vfill
|
|
|
|
\begin{itemize}
|
|
\setlength\itemsep{1em}
|
|
\item R\&D sur de nouveaux logiciels car souvent rien sur l'étagère qui répond à nos besoins
|
|
\item Peu de contraintes
|
|
|
|
Approche greenfield : on choisit et conçoit les services que l'on fournit
|
|
\end{itemize}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
\begin{frame}
|
|
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Infrastructure et logiciels actuels}
|
|
|
|
{\Large Choix radicaux $\Longrightarrow$ infrastructure spécifique}
|
|
|
|
\vfill
|
|
|
|
{\large Qu'est-ce qui fonctionne actuellement ?}
|
|
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Infrastructure et logiciels actuels}
|
|
\begin{itemize}
|
|
\item Des machines en nombre limité et peu puissantes
|
|
\item Plusieurs zones géographiques
|
|
\item Un orchestrateur distribué ``off the shelf'' (Nomad + Consul)
|
|
\item Un logiciel de stockage objet distribué ``maison'' (Garage)
|
|
\item Des boucles de rétroaction
|
|
\item Des services majoritairement distribués
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}[plain]
|
|
\begin{center}
|
|
\includegraphics[height=\textheight]{assets/deuxfleurs-nomad-20241022.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Nomad}
|
|
\begin{itemize}
|
|
\item Orchestrateur développé par HashiCorp / IBM
|
|
\item Définition déclarative des ``jobs'' avec contraintes
|
|
\item Fonctions : ordonnancement et maintien en condition
|
|
\item Control plane hautement disponible (clustering Raft)
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
% Simplified Nomad job definition shown as a verbatim listing.
% The frame is [fragile] because it contains verbatim material; for fragile
% frames \end{frame} has to stay alone at the very start of its line, and
% \end{verbatim} is kept unindented so no extra blank line is printed.
% Everything between \begin{verbatim} and \end{verbatim} is printed literally,
% so the listing must contain nothing but the job definition itself.
\begin{frame}[fragile]
\frametitle{Exemple de job Nomad (simplifié)}
\begin{footnotesize}
\begin{verbatim}
job "jitsi" {
  datacenters = ["neptune", "scorpio", "corrin"]
  type = "service"
  task "front" {
    driver = "docker"
    config { image = "superboum/amd64_jitsi_meet:v7"
             volumes = ["secrets/certs/jitsi.key:/etc/nginx/jitsi.key"] }
    template { data = "{{ key \"secrets/jitsi/jitsi.key\" }}"
               destination = "secrets/certs/jitsi.key" }
    resources { cpu = 300, memory = 200 }
    service {
      port = "https_port"
      name = "https-jitsi"
      check { type = "tcp"
              port = "https_port"
              interval = "60s"
              timeout = "5s" }
    }
  }
}
\end{verbatim}
\end{footnotesize}
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Consul}
|
|
\begin{itemize}
|
|
\item \textbf{Outil de coordination distribuée} développé par HashiCorp
|
|
\item Base de données clé-valeur distribuée (similaire à etcd)
|
|
\item Stocke la configuration utile à Nomad + contenu arbitraire
|
|
\item Service discovery (via intégration Nomad + API dédiée + DNS)
|
|
\item Hautement disponible (clustering Raft)
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}[plain]
|
|
\begin{center}
|
|
\includegraphics[height=\textheight]{infra_services_control_loop.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Object storage: a crucial component}
|
|
\begin{center}
|
|
\includegraphics[height=6em]{assets/logos/Amazon-S3.jpg}
|
|
\hspace{3em}
|
|
\visible<2->{\includegraphics[height=5em]{assets/logos/minio.png}}
|
|
\hspace{3em}
|
|
\visible<3>{\includegraphics[height=6em]{logo/garage_hires_crop.png}}
|
|
\end{center}
|
|
\vspace{1em}
|
|
S3: a de-facto standard, many compatible applications
|
|
|
|
\vspace{1em}
|
|
\visible<2->{MinIO is self-hostable but not suited for geo-distributed deployments}
|
|
|
|
\vspace{1em}
|
|
\visible<3->{\textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store}}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{CRDTs / weak consistency instead of consensus}
|
|
|
|
\underline{Internally, Garage uses only CRDTs} (conflict-free replicated data types)
|
|
|
|
\vspace{2em}
|
|
Why not Raft, Paxos, ...? Issues of consensus algorithms:
|
|
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item<2-> \textbf{Software complexity}
|
|
\vspace{1em}
|
|
\item<3-> \textbf{Performance issues:}
|
|
\vspace{.5em}
|
|
\begin{itemize}
|
|
\item<4-> The leader is a \textbf{bottleneck} for all requests\\
|
|
\vspace{.5em}
|
|
\item<5-> \textbf{Sensitive to higher latency} between nodes
|
|
\vspace{.5em}
|
|
\item<6-> \textbf{Takes time to reconverge} when disrupted (e.g. node going down)
|
|
\end{itemize}
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{The data model of object storage}
|
|
Object storage is basically a \textbf{key-value store}:
|
|
\vspace{.5em}
|
|
|
|
{\scriptsize
|
|
\begin{center}
|
|
\begin{tabular}{|l|p{7cm}|}
|
|
\hline
|
|
\textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\
|
|
\hline
|
|
\hline
|
|
\texttt{index.html} &
|
|
\texttt{Content-Type: text/html; charset=utf-8} \newline
|
|
\texttt{Content-Length: 24929} \newline
|
|
\texttt{<binary blob>} \\
|
|
\hline
|
|
\texttt{img/logo.svg} &
|
|
\texttt{Content-Type: image/svg+xml} \newline
|
|
\texttt{Content-Length: 13429} \newline
|
|
\texttt{<binary blob>} \\
|
|
\hline
|
|
\texttt{download/index.html} &
|
|
\texttt{Content-Type: text/html; charset=utf-8} \newline
|
|
\texttt{Content-Length: 26563} \newline
|
|
\texttt{<binary blob>} \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{center}
|
|
}
|
|
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item<2> Maps well to CRDT data types
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Performance gains in practice}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/perf/endpoint_latency_0.7_0.8_minio.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
|
|
% ======================================== TIMELINE
|
|
% ======================================== TIMELINE
|
|
% ======================================== TIMELINE
|
|
|
|
\section{Recent developments}
|
|
|
|
% ====================== v0.7.0 ===============================
|
|
|
|
\begin{frame}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/timeline-22-24.pdf}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{April 2022 - Garage v0.7.0}
|
|
Focus on \underline{observability and ecosystem integration}
|
|
\vspace{2em}
|
|
\begin{itemize}
|
|
\item \textbf{Monitoring:} metrics and traces, using OpenTelemetry
|
|
\vspace{1em}
|
|
\item Replication modes with 1 or 2 copies / weaker consistency
|
|
\vspace{1em}
|
|
\item Kubernetes integration for node discovery
|
|
\vspace{1em}
|
|
\item Admin API (v0.7.2)
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Metrics (Prometheus + Grafana)}
|
|
\begin{center}
|
|
\includegraphics[width=.9\linewidth]{assets/screenshots/grafana_dashboard.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Traces (Jaeger)}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/screenshots/jaeger_listobjects.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
% ====================== v0.8.0 ===============================
|
|
|
|
\begin{frame}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/timeline-22-24.pdf}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{November 2022 - Garage v0.8.0}
|
|
Focus on \underline{performance}
|
|
\vspace{2em}
|
|
\begin{itemize}
|
|
\item \textbf{Alternative metadata DB engines} (LMDB, SQLite)
|
|
\vspace{1em}
|
|
\item \textbf{Performance improvements:} block streaming, various optimizations...
|
|
\vspace{1em}
|
|
\item Bucket quotas (max size, max \#objects)
|
|
\vspace{1em}
|
|
\item Quality of life improvements, observability, etc.
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{About metadata DB engines}
|
|
\textbf{Issues with Sled:}
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item Huge files on disk
|
|
\vspace{.5em}
|
|
\item Unpredictable performance, especially on HDD
|
|
\vspace{.5em}
|
|
\item API limitations
|
|
\vspace{.5em}
|
|
\item Not actively maintained
|
|
\end{itemize}
|
|
|
|
\vspace{2em}
|
|
\textbf{LMDB:} very stable, good performance, file size is reasonable\\
|
|
\textbf{SQLite} also available as a second choice
|
|
|
|
\vspace{1em}
|
|
Sled will be removed in Garage v1.0
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{DB engine performance comparison}
|
|
\begin{center}
|
|
\includegraphics[width=.6\linewidth]{assets/perf/db_engine.png}
|
|
\end{center}
|
|
NB: SQLite was slow due to synchronous mode, now configurable
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Block streaming}
|
|
\begin{center}
|
|
\only<1>{\includegraphics[width=.8\linewidth]{assets/schema-streaming-1.png}}
|
|
\only<2>{\includegraphics[width=.8\linewidth]{assets/schema-streaming-2.png}}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{TTFB benchmark}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/perf/ttfb.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Throughput benchmark}
|
|
\begin{center}
|
|
\includegraphics[width=.7\linewidth]{assets/perf/io-0.7-0.8-minio.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
% ====================== v0.9.0 ===============================
|
|
|
|
\begin{frame}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/timeline-22-24.pdf}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{October 2023 - Garage v0.9.0}
|
|
Focus on \underline{streamlining \& usability}
|
|
\vspace{2em}
|
|
\begin{itemize}
|
|
\item Support multiple HDDs per node
|
|
\vspace{1em}
|
|
\item S3 compatibility:
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item support basic lifecycle configurations
|
|
\vspace{.5em}
|
|
\item allow for multipart upload part retries
|
|
\end{itemize}
|
|
\vspace{1em}
|
|
\item LMDB by default, deprecation of Sled
|
|
\vspace{1em}
|
|
\item New layout computation algorithm
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
\begin{frame}
|
|
\frametitle{Layout computation}
|
|
\begin{overprint}
|
|
\onslide<1>
|
|
\begin{center}
|
|
\includegraphics[width=\linewidth, trim=0 0 0 -4cm]{assets/screenshots/garage_status_0.9_prod_zonehl.png}
|
|
\end{center}
|
|
\onslide<2>
|
|
\begin{center}
|
|
\includegraphics[width=.7\linewidth]{assets/map.png}
|
|
\end{center}
|
|
\end{overprint}
|
|
\vspace{1em}
|
|
Garage stores replicas on different zones when possible
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{What a ``layout'' is}
|
|
\textbf{A layout is a precomputed index table:}
|
|
\vspace{1em}
|
|
|
|
{\footnotesize
|
|
\begin{center}
|
|
\begin{tabular}{|l|l|l|l|}
|
|
\hline
|
|
\textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\
|
|
\hline
|
|
\hline
|
|
Partition 0 & df-ymk (bespin) & Abricot (scorpio) & Courgette (neptune) \\
|
|
\hline
|
|
Partition 1 & Ananas (scorpio) & Courgette (neptune) & df-ykl (bespin) \\
|
|
\hline
|
|
Partition 2 & df-ymf (bespin) & Celeri (neptune) & Abricot (scorpio) \\
|
|
\hline
|
|
\hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\
|
|
\hline
|
|
Partition 255 & Concombre (neptune) & df-ykl (bespin) & Abricot (scorpio) \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{center}
|
|
}
|
|
|
|
\vspace{2em}
|
|
\visible<2->{
|
|
The index table is built centrally using an optimal algorithm,\\
|
|
then propagated to all nodes
|
|
}
|
|
|
|
\vspace{1em}
|
|
\visible<3->{
|
|
\footnotesize
|
|
Oulamara, M., \& Auvolat, A. (2023). \emph{An algorithm for geo-distributed and redundant storage in Garage}.\\ arXiv preprint arXiv:2302.13798.
|
|
}
|
|
\end{frame}
|
|
|
|
|
|
|
|
% ====================== v0.10.0 ===============================
|
|
|
|
\begin{frame}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/timeline-22-24.pdf}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{October 2023 - Garage v0.10.0 beta}
|
|
Focus on \underline{consistency}
|
|
\vspace{2em}
|
|
\begin{itemize}
|
|
\item Fix consistency issues when reshuffling data
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Working with weak consistency}
|
|
Not using consensus limits us to the following:
|
|
\vspace{2em}
|
|
\begin{itemize}
|
|
\item<2-> \textbf{Conflict-free replicated data types} (CRDT)\\
|
|
\vspace{1em}
|
|
{\footnotesize Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\
|
|
a map of \textbf{last-writer-wins registers} (each key is its own CRDT)}
|
|
\vspace{1.5em}
|
|
\item<3-> \textbf{Read-after-write consistency}\\
|
|
\vspace{1em}
|
|
{\footnotesize Can be implemented using quorums on read and write operations}
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}[t]
|
|
\frametitle{CRDT read-after-write consistency using quorums}
|
|
|
|
\vspace{1em}
|
|
{\small
|
|
\textbf{Property:} If client 1 did an operation $write(x)$ and received an OK response,\\
|
|
\hspace{2cm} and client 2 starts an operation $read()$ after client 1 received OK,\\
|
|
\hspace{2cm} then client 2 will read a value $x' \sqsupseteq x$.
|
|
}
|
|
|
|
\vspace{1.5em}
|
|
\begin{overprint}
|
|
\onslide<2-9>
|
|
\begin{figure}
|
|
\centering
|
|
\footnotesize
|
|
\def\svgwidth{.7\textwidth}
|
|
\only<2>{\import{assets/lattice/}{lattice1.pdf_tex}}%
|
|
\only<3>{\import{assets/lattice/}{lattice2.pdf_tex}}%
|
|
\only<4>{\import{assets/lattice/}{lattice3.pdf_tex}}%
|
|
\only<5>{\import{assets/lattice/}{lattice4.pdf_tex}}%
|
|
\only<6>{\import{assets/lattice/}{lattice5.pdf_tex}}%
|
|
\only<7>{\import{assets/lattice/}{lattice6.pdf_tex}}%
|
|
\only<8>{\import{assets/lattice/}{lattice7.pdf_tex}}%
|
|
\only<9>{\import{assets/lattice/}{lattice8.pdf_tex}}%
|
|
\end{figure}
|
|
|
|
\onslide<10>
|
|
\begin{minipage}{.10\textwidth}
|
|
~
|
|
\end{minipage}
|
|
\begin{minipage}{.40\textwidth}
|
|
\footnotesize
|
|
\textbf{Algorithm $write(x)$:}
|
|
\begin{enumerate}
|
|
\item Broadcast $write(x)$ to all nodes
|
|
\item Wait for $k > n/2$ nodes to reply OK
|
|
\item Return OK
|
|
\end{enumerate}
|
|
\end{minipage}
|
|
\begin{minipage}{.40\textwidth}
|
|
\footnotesize
|
|
\vspace{1em}
|
|
\textbf{Algorithm $read()$:}
|
|
\begin{enumerate}
|
|
\item Broadcast $read()$ to all nodes
|
|
\item Wait for $k > n/2$ nodes to reply\\
|
|
with values $x_1, \dots, x_k$
|
|
\item Return $x_1 \sqcup \dots \sqcup x_k$
|
|
\end{enumerate}
|
|
\end{minipage}
|
|
\end{overprint}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{A hard problem: layout changes}
|
|
\begin{itemize}
|
|
\item We rely on quorums $k > n/2$ within each partition:
\[ n=3, \qquad k\ge 2 \]
|
|
\item<2-> When rebalancing, the set of nodes responsible for a partition can change:\\
|
|
|
|
\vspace{1em}
|
|
\begin{minipage}{.04\linewidth}~
|
|
\end{minipage}
|
|
\begin{minipage}{.40\linewidth}
|
|
{\tiny
|
|
\begin{tabular}{|l|l|l|l|}
|
|
\hline
|
|
\textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\
|
|
\hline
|
|
\hline
|
|
Partition 0 & \textcolor{Crimson}{df-ymk} & Abricot & \textcolor{Crimson}{Courgette} \\
|
|
\hline
|
|
Partition 1 & Ananas & \textcolor{Crimson}{Courgette} & \textcolor{Crimson}{df-ykl} \\
|
|
\hline
|
|
Partition 2 & \textcolor{Crimson}{df-ymf} & \textcolor{Crimson}{Celeri} & Abricot \\
|
|
\hline
|
|
\hspace{1em}$\dots$ & \hspace{1em}$\dots$ & \hspace{1em}$\dots$ & \hspace{1em}$\dots$ \\
|
|
\hline
|
|
\end{tabular}
|
|
}
|
|
\end{minipage}
|
|
\begin{minipage}{.04\linewidth}
|
|
$\to$
|
|
\end{minipage}
|
|
\begin{minipage}{.40\linewidth}
|
|
{\tiny
|
|
\begin{tabular}{|l|l|l|l|}
|
|
\hline
|
|
\textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\
|
|
\hline
|
|
\hline
|
|
Partition 0 & \textcolor{ForestGreen}{Dahlia} & Abricot & \textcolor{ForestGreen}{Eucalyptus} \\
|
|
\hline
|
|
Partition 1 & Ananas & \textcolor{ForestGreen}{Euphorbe} & \textcolor{ForestGreen}{Doradille} \\
|
|
\hline
|
|
Partition 2 & \textcolor{ForestGreen}{Dahlia} & \textcolor{ForestGreen}{Echinops} & Abricot \\
|
|
\hline
|
|
\hspace{1em}$\dots$ & \hspace{1em}$\dots$ & \hspace{1em}$\dots$ & \hspace{1em}$\dots$ \\
|
|
\hline
|
|
\end{tabular}
|
|
}
|
|
\end{minipage}
|
|
|
|
\vspace{2em}
|
|
\item<3-> During the rebalancing, new nodes don't yet have the data,\\
|
|
~~~~~~~~~~~~~~~~~~~and old nodes want to get rid of the data to free up space\\
|
|
\vspace{1.2em}
|
|
$\to$ risk of inconsistency, \textbf{how to coordinate?}
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Handling layout changes without losing consistency}
|
|
\begin{minipage}{.55\textwidth}
|
|
\begin{itemize}
|
|
\item \textbf{Solution:}\\
|
|
\vspace{.5em}
|
|
\begin{itemize}
|
|
\item keep track of data transfer to new nodes
|
|
\vspace{.5em}
|
|
\item use multiple write quorums\\
|
|
(new nodes + old nodes\\
|
|
while data transfer is in progress)
|
|
\vspace{.5em}
|
|
\item switching reads to new nodes\\
|
|
only once copy is finished
|
|
\end{itemize}
|
|
\vspace{1em}
|
|
\item \textbf{Implemented} in v0.10
|
|
\vspace{1em}
|
|
\item \textbf{Validated} with Jepsen testing
|
|
\end{itemize}
|
|
\end{minipage}
|
|
\begin{minipage}{.23\textwidth}
|
|
\includegraphics[width=3cm]{assets/jepsen-0.9.png}\\
|
|
{\footnotesize Garage v0.9.0}
|
|
\end{minipage}
|
|
\begin{minipage}{.2\textwidth}
|
|
\includegraphics[width=3cm]{assets/jepsen-0.10.png}\\
|
|
{\footnotesize Garage v0.10 beta}
|
|
\end{minipage}
|
|
\end{frame}
|
|
|
|
% ====================== v1.0 ===============================
|
|
|
|
\begin{frame}
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{assets/timeline-22-24.pdf}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Towards v1.0...}
|
|
Focus on \underline{security \& stability}
|
|
\vspace{2em}
|
|
\begin{itemize}
|
|
\item \textbf{Security audit} in progress by Radically Open Security
|
|
\vspace{1em}
|
|
\item Misc. S3 features (SSE-C, ...) and compatibility fixes
|
|
\vspace{1em}
|
|
\item Improve UX
|
|
\vspace{1em}
|
|
\item Fix bugs
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{...and beyond!}
|
|
\begin{center}
|
|
\includegraphics[width=.6\linewidth]{assets/survey_requested_features.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
% ======================================== OPERATING
|
|
% ======================================== OPERATING
|
|
% ======================================== OPERATING
|
|
|
|
|
|
\section{Operating big Garage clusters}
|
|
|
|
\begin{frame}
|
|
\frametitle{Operating Garage}
|
|
\begin{center}
|
|
\only<1-2>{
|
|
\includegraphics[width=.9\linewidth]{assets/screenshots/garage_status_0.10.png}
|
|
\\\vspace{1em}
|
|
\visible<2>{\includegraphics[width=.9\linewidth]{assets/screenshots/garage_status_unhealthy_0.10.png}}
|
|
}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Garage's architecture}
|
|
\begin{center}
|
|
\only<1>{\includegraphics[width=.45\linewidth]{assets/garage.drawio.pdf}}%
|
|
\only<2>{\includegraphics[width=.6\linewidth]{assets/garage_sync.drawio.pdf}}%
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Digging deeper}
|
|
\begin{center}
|
|
\only<1>{\includegraphics[width=.9\linewidth]{assets/screenshots/garage_stats_0.10.png}}
|
|
\only<2>{\includegraphics[width=.5\linewidth]{assets/screenshots/garage_worker_list_0.10.png}}
|
|
\only<3>{\includegraphics[width=.6\linewidth]{assets/screenshots/garage_worker_param_0.10.png}}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Potential limitations and bottlenecks}
|
|
\begin{itemize}
|
|
\item Global:
|
|
\begin{itemize}
|
|
\item Max. $\sim$100 nodes per cluster (excluding gateways)
|
|
\end{itemize}
|
|
\vspace{1em}
|
|
\item Metadata:
|
|
\begin{itemize}
|
|
\item One big bucket = bottleneck, object list on 3 nodes only
|
|
\end{itemize}
|
|
\vspace{1em}
|
|
\item Block manager:
|
|
\begin{itemize}
|
|
\item Lots of small files on disk
|
|
\item Processing the resync queue can be slow
|
|
\end{itemize}
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Deployment advice for very large clusters}
|
|
\begin{itemize}
|
|
\item Metadata storage:
|
|
\begin{itemize}
|
|
\item ZFS mirror (x2) on fast NVMe
|
|
\item Use LMDB storage engine
|
|
\end{itemize}
|
|
\vspace{.5em}
|
|
\item Data block storage:
|
|
\begin{itemize}
|
|
\item Use Garage's native multi-HDD support
|
|
\item XFS on individual drives
|
|
\item Increase block size (1MB $\to$ 10MB, requires more RAM and good networking)
|
|
\item Tune \texttt{resync-tranquility} and \texttt{resync-worker-count} dynamically
|
|
\end{itemize}
|
|
\vspace{.5em}
|
|
\item Other:
|
|
\begin{itemize}
|
|
\item Split data over several buckets
|
|
\item Use less than 100 storage nodes
|
|
\item Use gateway nodes
|
|
\end{itemize}
|
|
\vspace{.5em}
|
|
\end{itemize}
|
|
Our deployments: $< 10$ TB. Some people have done more!
|
|
\end{frame}
|
|
|
|
|
|
% ======================================== END
|
|
% ======================================== END
|
|
% ======================================== END
|
|
|
|
\begin{frame}
|
|
\frametitle{Where to find us}
|
|
\begin{center}
|
|
\includegraphics[width=.25\linewidth]{logo/garage_hires.png}\\
|
|
\vspace{-1em}
|
|
\url{https://garagehq.deuxfleurs.fr/}\\
|
|
\url{mailto:garagehq@deuxfleurs.fr}\\
|
|
\texttt{\#garage:deuxfleurs.fr} on Matrix
|
|
|
|
\vspace{1.5em}
|
|
\includegraphics[width=.06\linewidth]{assets/logos/rust_logo.png}
|
|
\includegraphics[width=.13\linewidth]{assets/logos/AGPLv3_Logo.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\end{document}
|
|
|
|
%% vim: set ts=4 sw=4 tw=0 noet spelllang=en :
|