forked from Deuxfleurs/garage
Talk 2023-01-18: some WIP talking about consensus
This commit is contained in:
parent
7416ba97ef
commit
fe850f62c9
6 changed files with 315 additions and 21 deletions
|
@ -3,6 +3,7 @@ ASSETS=assets/consistent_hashing_1.pdf \
|
||||||
assets/consistent_hashing_3.pdf \
|
assets/consistent_hashing_3.pdf \
|
||||||
assets/consistent_hashing_4.pdf \
|
assets/consistent_hashing_4.pdf \
|
||||||
assets/garage_tables.pdf \
|
assets/garage_tables.pdf \
|
||||||
|
assets/consensus.pdf_tex \
|
||||||
assets/deuxfleurs.pdf
|
assets/deuxfleurs.pdf
|
||||||
|
|
||||||
talk.pdf: talk.tex $(ASSETS)
|
talk.pdf: talk.tex $(ASSETS)
|
||||||
|
@ -10,3 +11,6 @@ talk.pdf: talk.tex $(ASSETS)
|
||||||
|
|
||||||
assets/%.pdf: assets/%.svg
|
assets/%.pdf: assets/%.svg
|
||||||
inkscape -D -z --file=$^ --export-pdf=$@
|
inkscape -D -z --file=$^ --export-pdf=$@
|
||||||
|
|
||||||
|
assets/%.pdf_tex: assets/%.svg
|
||||||
|
inkscape -D -z --file=$^ --export-pdf=$@ --export-latex
|
||||||
|
|
137
doc/talks/2023-01-18-tocatta/assets/consensus.svg
Normal file
137
doc/talks/2023-01-18-tocatta/assets/consensus.svg
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||||
|
|
||||||
|
<svg
|
||||||
|
width="800"
|
||||||
|
height="300"
|
||||||
|
viewBox="0 0 211.66666 79.374999"
|
||||||
|
version="1.1"
|
||||||
|
id="svg5"
|
||||||
|
inkscape:version="1.2.2 (b0a8486541, 2022-12-01)"
|
||||||
|
sodipodi:docname="consensus.svg"
|
||||||
|
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||||
|
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||||
|
xmlns="http://www.w3.org/2000/svg"
|
||||||
|
xmlns:svg="http://www.w3.org/2000/svg">
|
||||||
|
<sodipodi:namedview
|
||||||
|
id="namedview7"
|
||||||
|
pagecolor="#ffffff"
|
||||||
|
bordercolor="#666666"
|
||||||
|
borderopacity="1.0"
|
||||||
|
inkscape:showpageshadow="2"
|
||||||
|
inkscape:pageopacity="0.0"
|
||||||
|
inkscape:pagecheckerboard="0"
|
||||||
|
inkscape:deskcolor="#d1d1d1"
|
||||||
|
inkscape:document-units="mm"
|
||||||
|
showgrid="false"
|
||||||
|
inkscape:zoom="1.4734708"
|
||||||
|
inkscape:cx="310.49139"
|
||||||
|
inkscape:cy="179.1688"
|
||||||
|
inkscape:window-width="1920"
|
||||||
|
inkscape:window-height="999"
|
||||||
|
inkscape:window-x="0"
|
||||||
|
inkscape:window-y="0"
|
||||||
|
inkscape:window-maximized="1"
|
||||||
|
inkscape:current-layer="layer1" />
|
||||||
|
<defs
|
||||||
|
id="defs2">
|
||||||
|
<marker
|
||||||
|
style="overflow:visible"
|
||||||
|
id="Arrow2"
|
||||||
|
refX="0"
|
||||||
|
refY="0"
|
||||||
|
orient="auto-start-reverse"
|
||||||
|
inkscape:stockid="Arrow2"
|
||||||
|
markerWidth="7.6999998"
|
||||||
|
markerHeight="5.5999999"
|
||||||
|
viewBox="0 0 7.7 5.6"
|
||||||
|
inkscape:isstock="true"
|
||||||
|
inkscape:collect="always"
|
||||||
|
preserveAspectRatio="xMidYMid">
|
||||||
|
<path
|
||||||
|
transform="scale(0.7)"
|
||||||
|
d="M -2,-4 9,0 -2,4 c 2,-2.33 2,-5.66 0,-8 z"
|
||||||
|
style="fill:context-stroke;fill-rule:evenodd;stroke:none"
|
||||||
|
id="arrow2L" />
|
||||||
|
</marker>
|
||||||
|
</defs>
|
||||||
|
<g
|
||||||
|
inkscape:label="Layer 1"
|
||||||
|
inkscape:groupmode="layer"
|
||||||
|
id="layer1">
|
||||||
|
<g
|
||||||
|
id="g1218"
|
||||||
|
transform="translate(-8.9161476,-12.502301)">
|
||||||
|
<circle
|
||||||
|
style="fill:#ffffff;stroke:#000000;stroke-width:1;stroke-dasharray:none;stroke-opacity:1"
|
||||||
|
id="path111"
|
||||||
|
cx="38.904896"
|
||||||
|
cy="37.936272"
|
||||||
|
r="13.474442" />
|
||||||
|
<text
|
||||||
|
xml:space="preserve"
|
||||||
|
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||||
|
x="38.879501"
|
||||||
|
y="40.908073"
|
||||||
|
id="text1105"><tspan
|
||||||
|
sodipodi:role="line"
|
||||||
|
id="tspan1103"
|
||||||
|
style="stroke-width:0.264583;fill:#000000"
|
||||||
|
x="38.879501"
|
||||||
|
y="40.908073">$\bot$</tspan></text>
|
||||||
|
</g>
|
||||||
|
<g
|
||||||
|
id="g1218-3"
|
||||||
|
transform="translate(127.41938,-12.502301)">
|
||||||
|
<circle
|
||||||
|
style="fill:#ffffff;stroke:#000000;stroke-width:1;stroke-dasharray:none;stroke-opacity:1"
|
||||||
|
id="path111-5"
|
||||||
|
cx="38.904896"
|
||||||
|
cy="37.936272"
|
||||||
|
r="13.474442" />
|
||||||
|
<text
|
||||||
|
xml:space="preserve"
|
||||||
|
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||||
|
x="38.879501"
|
||||||
|
y="40.908073"
|
||||||
|
id="text1105-6"><tspan
|
||||||
|
sodipodi:role="line"
|
||||||
|
id="tspan1103-2"
|
||||||
|
style="stroke-width:0.264583;fill:#000000"
|
||||||
|
x="38.879501"
|
||||||
|
y="40.908073">$x$</tspan></text>
|
||||||
|
</g>
|
||||||
|
<path
|
||||||
|
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#Arrow2)"
|
||||||
|
d="M 44.289635,25.433971 H 145.90576"
|
||||||
|
id="path1414"
|
||||||
|
sodipodi:nodetypes="cc" />
|
||||||
|
<text
|
||||||
|
xml:space="preserve"
|
||||||
|
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||||
|
x="92.729836"
|
||||||
|
y="21.781803"
|
||||||
|
id="text2092"><tspan
|
||||||
|
sodipodi:role="line"
|
||||||
|
id="tspan2090"
|
||||||
|
style="stroke-width:0.264583;fill:#000000"
|
||||||
|
x="92.729836"
|
||||||
|
y="21.781803">$propose(x) / x$</tspan></text>
|
||||||
|
<text
|
||||||
|
xml:space="preserve"
|
||||||
|
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||||
|
x="166.29887"
|
||||||
|
y="69.89299"
|
||||||
|
id="text2092-9"><tspan
|
||||||
|
sodipodi:role="line"
|
||||||
|
id="tspan2090-1"
|
||||||
|
style="stroke-width:0.264583;fill:#000000"
|
||||||
|
x="166.29887"
|
||||||
|
y="69.89299">$propose(y) / x$</tspan></text>
|
||||||
|
<path
|
||||||
|
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#Arrow2)"
|
||||||
|
d="m 155.82329,35.899857 c -8.35129,12.319651 0.54055,24.640898 11.72797,24.072085 8.65403,-0.440005 18.59818,-11.705963 11.8146,-20.570891"
|
||||||
|
id="path2150"
|
||||||
|
sodipodi:nodetypes="csc" />
|
||||||
|
</g>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 4.7 KiB |
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 13 KiB |
Binary file not shown.
|
@ -8,6 +8,7 @@
|
||||||
\usepackage{multirow}
|
\usepackage{multirow}
|
||||||
\usetheme{boxes}
|
\usetheme{boxes}
|
||||||
\usepackage{graphicx}
|
\usepackage{graphicx}
|
||||||
|
\usepackage{import}
|
||||||
\usepackage{adjustbox}
|
\usepackage{adjustbox}
|
||||||
%\useoutertheme[footline=authortitle,subsection=false]{miniframes}
|
%\useoutertheme[footline=authortitle,subsection=false]{miniframes}
|
||||||
%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes}
|
%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes}
|
||||||
|
@ -479,33 +480,185 @@
|
||||||
|
|
||||||
\section{Problem 2: ensuring consistency}
|
\section{Problem 2: ensuring consistency}
|
||||||
|
|
||||||
%\begin{frame}
|
|
||||||
% \frametitle{Garage's architecture}
|
|
||||||
% \begin{center}
|
|
||||||
% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf}
|
|
||||||
% \end{center}
|
|
||||||
%\end{frame}
|
|
||||||
|
|
||||||
\begin{frame}
|
\begin{frame}
|
||||||
\frametitle{Garage is \emph{coordination-free}:}
|
\frametitle{Consensus vs weak consistency}
|
||||||
\begin{itemize}
|
|
||||||
\item No Raft or Paxos
|
\hspace{1em}
|
||||||
\vspace{1em}
|
\begin{minipage}{7cm}
|
||||||
\item Internal data types are CRDTs
|
\textbf{Consensus-based systems:}
|
||||||
\vspace{1em}
|
\vspace{1em}
|
||||||
\item All nodes are equivalent (no master/leader/index node)
|
\begin{itemize}
|
||||||
\end{itemize}
|
\item \textbf{Leader-based:} a leader is elected to coordinate
|
||||||
\vspace{2em}
|
all reads and writes
|
||||||
$\to$ less sensitive to higher latencies between nodes
|
\vspace{1em}
|
||||||
|
\item \textbf{Linearizability} of all operations\\
|
||||||
|
(strongest consistency guarantee)
|
||||||
|
\vspace{1em}
|
||||||
|
\item \textbf{Replicated state machines} that can implement
|
||||||
|
any sequential specification
|
||||||
|
\vspace{1em}
|
||||||
|
\item \textbf{Costly}, the leader is a bottleneck;
|
||||||
|
leader elections on failure take time
|
||||||
|
\end{itemize}
|
||||||
|
\end{minipage}
|
||||||
|
\hfill
|
||||||
|
\begin{minipage}{7cm} \visible<2->{
|
||||||
|
\textbf{Weakly consistent systems:}
|
||||||
|
\vspace{1em}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Nodes are equivalent}, any node
|
||||||
|
can originate a read or write operation
|
||||||
|
\vspace{1em}
|
||||||
|
\item \textbf{Read-after-write consistency} with quorums,
|
||||||
|
eventual consistency without
|
||||||
|
\vspace{1em}
|
||||||
|
\item \textbf{Operations have to commute}, i.e.~we
|
||||||
|
can only implement CRDTs
|
||||||
|
\vspace{1em}
|
||||||
|
\item \textbf{Fast}, no node is a bottleneck;\\
|
||||||
|
works the same with offline nodes
|
||||||
|
\end{itemize}
|
||||||
|
} \end{minipage}
|
||||||
|
\hspace{1em}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
\begin{frame}
|
\begin{frame}
|
||||||
\frametitle{Consistency model}
|
\frametitle{Consensus vs weak consistency}
|
||||||
|
\begin{center}
|
||||||
|
\textbf{The same objects cannot be implemented in both models.}
|
||||||
|
\end{center}
|
||||||
|
\vspace{2em}
|
||||||
|
|
||||||
|
\hspace{1em}
|
||||||
|
\begin{minipage}{7cm}
|
||||||
|
\underline{Consensus-based systems:}
|
||||||
|
|
||||||
|
\vspace{1em}
|
||||||
|
|
||||||
|
\textbf{Any sequential specification}\\~
|
||||||
|
\end{minipage}
|
||||||
|
\hfill
|
||||||
|
\begin{minipage}{7cm}
|
||||||
|
\underline{Weakly consistent systems:}
|
||||||
|
|
||||||
|
\vspace{1em}
|
||||||
|
|
||||||
|
\textbf{CRDTs only}\\(conflict-free replicated data types)
|
||||||
|
\end{minipage}
|
||||||
|
\hspace{1em}
|
||||||
|
|
||||||
|
\vspace{3em}
|
||||||
|
\begin{center}
|
||||||
|
Part of the complexity is \textbf{reported to the consumer of the API}
|
||||||
|
\end{center}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Consensus vs weak consistency}
|
||||||
|
\begin{center}
|
||||||
|
\textbf{From a theoretical point of view:}\\
|
||||||
|
|
||||||
|
\end{center}
|
||||||
|
\vspace{2em}
|
||||||
|
|
||||||
|
\hspace{1em}
|
||||||
|
\begin{minipage}{6.5cm}
|
||||||
|
\underline{Consensus-based systems:}
|
||||||
|
|
||||||
|
\vspace{1em}
|
||||||
|
|
||||||
|
Require \textbf{additionnal assumptions} such as a fault detector or a strong RNG\\~
|
||||||
|
\end{minipage}
|
||||||
|
\hfill
|
||||||
|
\begin{minipage}{6.5cm}
|
||||||
|
\underline{Weakly consistent systems:}
|
||||||
|
|
||||||
|
\vspace{1em}
|
||||||
|
|
||||||
|
Can be implemented in \textbf{any asynchronous message passing distributed system}
|
||||||
|
\end{minipage}
|
||||||
|
\hspace{1em}
|
||||||
|
|
||||||
|
\vspace{3em}
|
||||||
|
\begin{center}
|
||||||
|
They represent \textbf{different classes of computational capability}
|
||||||
|
\end{center}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Understanding the power of consensus}
|
||||||
|
\textbf{Consensus:} an API with a single operation, $propose(x)$
|
||||||
|
\begin{enumerate}
|
||||||
|
\item nodes all call $propose(x)$ with their proposed value;
|
||||||
|
\item nodes all receive the same value as a return value, which is one of the proposed values
|
||||||
|
\end{enumerate}
|
||||||
|
\vspace{1em}
|
||||||
|
|
||||||
|
\visible<2->{
|
||||||
|
\textbf{Equivalent to} a distributed algorithm that gives a total order on all requests
|
||||||
|
}
|
||||||
|
\vspace{1em}
|
||||||
|
|
||||||
|
\visible<3->{
|
||||||
|
\textbf{Implemented by} this simple replicated state machine:
|
||||||
|
\vspace{.5em}
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\def\svgwidth{.5\textwidth}
|
||||||
|
\large
|
||||||
|
\import{assets/}{consensus.pdf_tex}
|
||||||
|
\end{figure}
|
||||||
|
\vspace{1em}
|
||||||
|
}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Can my object be implemented without consensus?}
|
||||||
|
\underline{Given the specification of an API:}
|
||||||
|
\vspace{2em}
|
||||||
\begin{itemize}
|
\begin{itemize}
|
||||||
\item Not ACID (not required by S3 spec) / not linearizable
|
\item \textbf{Using this API, we can implement the consensus object} (the $propose$ function)\\
|
||||||
|
$\to$ the API is equivalent to consensus/total ordering of messages\\
|
||||||
|
$\to$ the API cannot be implemented in a weakly consistent system
|
||||||
|
\vspace{2em}
|
||||||
|
\item \textbf{This API can be implemented using only weak primitives}\\
|
||||||
|
(e.g. a bunch of atomic registers)\\
|
||||||
|
$\to$ the API is strictly weaker than consensus\\
|
||||||
|
$\to$ we can implement it in Garage!
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Why avoid consensus?}
|
||||||
|
Consensus can be implemented reasonably well in practice, so why avoid it?
|
||||||
|
\vspace{2em}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Software complexity:} RAFT and PAXOS are complex beasts;\\
|
||||||
|
harder to prove, harder to reason about
|
||||||
|
\vspace{1.5em}
|
||||||
|
\item \textbf{Performance issues:}
|
||||||
\vspace{1em}
|
\vspace{1em}
|
||||||
\item \textbf{Read-after-write consistency}\\
|
\begin{itemize}
|
||||||
{\footnotesize (stronger than eventual consistency)}
|
\item The leader is a \textbf{bottleneck} for all requests
|
||||||
|
\vspace{1em}
|
||||||
|
\item Particularly \textbf{sensitive to higher latency} between nodes
|
||||||
|
\end{itemize}
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{What can we implement without consensus?}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Any \textbf{conflict-free replicated data type} (CRDT)
|
||||||
|
\vspace{1em}
|
||||||
|
\item Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\
|
||||||
|
a \textbf{last-writer-wins registry}
|
||||||
|
\vspace{1em}
|
||||||
|
\item \textbf{Read-after-write consistency} can be implemented
|
||||||
|
using quorums on read and write operations
|
||||||
|
\vspace{1em}
|
||||||
|
\item \textbf{Monotonicity of reads} can be implemented with repair-on-read\\
|
||||||
|
(makes reads more costly, not implemented in Garage)
|
||||||
\end{itemize}
|
\end{itemize}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue