diff options
author | Alex Auvolat <alex@adnab.me> | 2023-01-12 16:27:02 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-01-12 16:27:02 +0100 |
commit | fe850f62c908492d2c3cbe4c55c3cf3b3d097de0 (patch) | |
tree | 468a071ea00206ee6a700cb3a953032f9b8313e0 /doc/talks/2023-01-18-tocatta | |
parent | 7416ba97ef8e5f9592c32dae6caaf46c1dbd7610 (diff) | |
download | garage-fe850f62c908492d2c3cbe4c55c3cf3b3d097de0.tar.gz garage-fe850f62c908492d2c3cbe4c55c3cf3b3d097de0.zip |
Talk 2023-01-18: some WIP talking about consensus
Diffstat (limited to 'doc/talks/2023-01-18-tocatta')
-rw-r--r-- | doc/talks/2023-01-18-tocatta/Makefile | 4 | ||||
-rw-r--r-- | doc/talks/2023-01-18-tocatta/assets/consensus.svg | 137 | ||||
-rw-r--r-- | doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf | bin | 26098 -> 0 bytes | |||
-rw-r--r-- | doc/talks/2023-01-18-tocatta/assets/garage.drawio.png | bin | 13463 -> 0 bytes | |||
-rw-r--r-- | doc/talks/2023-01-18-tocatta/talk.pdf | bin | 2572439 -> 2594434 bytes | |||
-rw-r--r-- | doc/talks/2023-01-18-tocatta/talk.tex | 187 |
6 files changed, 311 insertions, 17 deletions
diff --git a/doc/talks/2023-01-18-tocatta/Makefile b/doc/talks/2023-01-18-tocatta/Makefile index 4d600178..4a967d24 100644 --- a/doc/talks/2023-01-18-tocatta/Makefile +++ b/doc/talks/2023-01-18-tocatta/Makefile @@ -3,6 +3,7 @@ ASSETS=assets/consistent_hashing_1.pdf \ assets/consistent_hashing_3.pdf \ assets/consistent_hashing_4.pdf \ assets/garage_tables.pdf \ + assets/consensus.pdf_tex \ assets/deuxfleurs.pdf talk.pdf: talk.tex $(ASSETS) @@ -10,3 +11,6 @@ talk.pdf: talk.tex $(ASSETS) assets/%.pdf: assets/%.svg inkscape -D -z --file=$^ --export-pdf=$@ + +assets/%.pdf_tex: assets/%.svg + inkscape -D -z --file=$^ --export-pdf=$@ --export-latex diff --git a/doc/talks/2023-01-18-tocatta/assets/consensus.svg b/doc/talks/2023-01-18-tocatta/assets/consensus.svg new file mode 100644 index 00000000..8321e383 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consensus.svg @@ -0,0 +1,137 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + width="800" + height="300" + viewBox="0 0 211.66666 79.374999" + version="1.1" + id="svg5" + inkscape:version="1.2.2 (b0a8486541, 2022-12-01)" + sodipodi:docname="consensus.svg" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns="http://www.w3.org/2000/svg" + xmlns:svg="http://www.w3.org/2000/svg"> + <sodipodi:namedview + id="namedview7" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:showpageshadow="2" + inkscape:pageopacity="0.0" + inkscape:pagecheckerboard="0" + inkscape:deskcolor="#d1d1d1" + inkscape:document-units="mm" + showgrid="false" + inkscape:zoom="1.4734708" + inkscape:cx="310.49139" + inkscape:cy="179.1688" + inkscape:window-width="1920" + inkscape:window-height="999" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" + inkscape:current-layer="layer1" /> + <defs + id="defs2"> + <marker + style="overflow:visible" + id="Arrow2" + refX="0" + refY="0" + orient="auto-start-reverse" + inkscape:stockid="Arrow2" + markerWidth="7.6999998" + markerHeight="5.5999999" + viewBox="0 0 7.7 5.6" + inkscape:isstock="true" + inkscape:collect="always" + preserveAspectRatio="xMidYMid"> + <path + transform="scale(0.7)" + d="M -2,-4 9,0 -2,4 c 2,-2.33 2,-5.66 0,-8 z" + style="fill:context-stroke;fill-rule:evenodd;stroke:none" + id="arrow2L" /> + </marker> + </defs> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1"> + <g + id="g1218" + transform="translate(-8.9161476,-12.502301)"> + <circle + style="fill:#ffffff;stroke:#000000;stroke-width:1;stroke-dasharray:none;stroke-opacity:1" + id="path111" + cx="38.904896" + cy="37.936272" + r="13.474442" /> + <text + xml:space="preserve" + style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000" + x="38.879501" + y="40.908073" + id="text1105"><tspan + sodipodi:role="line" + id="tspan1103" + style="stroke-width:0.264583;fill:#000000" + x="38.879501" + y="40.908073">$\bot$</tspan></text> + </g> + <g + id="g1218-3" + transform="translate(127.41938,-12.502301)"> + <circle + style="fill:#ffffff;stroke:#000000;stroke-width:1;stroke-dasharray:none;stroke-opacity:1" + id="path111-5" + cx="38.904896" + cy="37.936272" + r="13.474442" /> + <text + xml:space="preserve" + style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000" + x="38.879501" + y="40.908073" + id="text1105-6"><tspan + sodipodi:role="line" + id="tspan1103-2" + style="stroke-width:0.264583;fill:#000000" + x="38.879501" + y="40.908073">$x$</tspan></text> + </g> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#Arrow2)" + d="M 44.289635,25.433971 H 145.90576" + id="path1414" + sodipodi:nodetypes="cc" /> + <text + xml:space="preserve" + style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000" + x="92.729836" + y="21.781803" + id="text2092"><tspan + sodipodi:role="line" + id="tspan2090" + style="stroke-width:0.264583;fill:#000000" + x="92.729836" + y="21.781803">$propose(x) / x$</tspan></text> + <text + xml:space="preserve" + style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000" + x="166.29887" + y="69.89299" + id="text2092-9"><tspan + sodipodi:role="line" + id="tspan2090-1" + style="stroke-width:0.264583;fill:#000000" + x="166.29887" + y="69.89299">$propose(y) / x$</tspan></text> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#Arrow2)" + d="m 155.82329,35.899857 c -8.35129,12.319651 0.54055,24.640898 11.72797,24.072085 8.65403,-0.440005 18.59818,-11.705963 11.8146,-20.570891" + id="path2150" + sodipodi:nodetypes="csc" /> + </g> +</svg> diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf Binary files differdeleted file mode 100644 index a54a163c..00000000 --- a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf +++ /dev/null diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png Binary files differdeleted file mode 100644 index 386dd862..00000000 --- a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png +++ /dev/null diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf Binary files differindex 5acb9198..ba9bde3d 100644 --- a/doc/talks/2023-01-18-tocatta/talk.pdf +++ b/doc/talks/2023-01-18-tocatta/talk.pdf diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index 566f56ec..ac9b4077 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -8,6 +8,7 @@ \usepackage{multirow} \usetheme{boxes} \usepackage{graphicx} +\usepackage{import} \usepackage{adjustbox} %\useoutertheme[footline=authortitle,subsection=false]{miniframes} %\useoutertheme[footline=authorinstitute,subsection=false]{miniframes} @@ -479,33 +480,185 @@ \section{Problem 2: ensuring consistency} -%\begin{frame} -% \frametitle{Garage's architecture} -% \begin{center} -% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf} -% \end{center} -%\end{frame} +\begin{frame} + \frametitle{Consensus vs weak consistency} + + \hspace{1em} + \begin{minipage}{7cm} + \textbf{Consensus-based systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Leader-based:} a leader is elected to coordinate + all reads and writes + \vspace{1em} + \item \textbf{Linearizability} of all operations\\ + (strongest consistency guarantee) + \vspace{1em} + \item \textbf{Replicated state machines} that can implement + any sequential specification + \vspace{1em} + \item \textbf{Costly}, the leader is a bottleneck; + leader elections on failure take time + \end{itemize} + \end{minipage} + \hfill + \begin{minipage}{7cm} \visible<2->{ + \textbf{Weakly consistent systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Nodes are equivalent}, any node + can originate a read or write operation + \vspace{1em} + \item \textbf{Read-after-write consistency} with quorums, + eventual consistency without + \vspace{1em} + \item \textbf{Operations have to commute}, i.e.~we + can only implement CRDTs + \vspace{1em} + \item \textbf{Fast}, no node is a bottleneck;\\ + works the same with offline nodes + \end{itemize} + } \end{minipage} + \hspace{1em} +\end{frame} + +\begin{frame} + \frametitle{Consensus vs weak consistency} + \begin{center} + \textbf{The same objects cannot be implemented in both models.} + \end{center} + \vspace{2em} + + \hspace{1em} + \begin{minipage}{7cm} + \underline{Consensus-based systems:} + + \vspace{1em} + + \textbf{Any sequential specification}\\~ + \end{minipage} + \hfill + \begin{minipage}{7cm} + \underline{Weakly consistent systems:} + + \vspace{1em} + + \textbf{CRDTs only}\\(conflict-free replicated data types) + \end{minipage} + \hspace{1em} + + \vspace{3em} + \begin{center} + Part of the complexity is \textbf{reported to the consumer of the API} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Consensus vs weak consistency} + \begin{center} + \textbf{From a theoretical point of view:}\\ + + \end{center} + \vspace{2em} + + \hspace{1em} + \begin{minipage}{6.5cm} + \underline{Consensus-based systems:} + + \vspace{1em} + + Require \textbf{additionnal assumptions} such as a fault detector or a strong RNG\\~ + \end{minipage} + \hfill + \begin{minipage}{6.5cm} + \underline{Weakly consistent systems:} + + \vspace{1em} + + Can be implemented in \textbf{any asynchronous message passing distributed system} + \end{minipage} + \hspace{1em} + + \vspace{3em} + \begin{center} + They represent \textbf{different classes of computational capability} + \end{center} +\end{frame} \begin{frame} - \frametitle{Garage is \emph{coordination-free}:} + \frametitle{Understanding the power of consensus} + \textbf{Consensus:} an API with a single operation, $propose(x)$ + \begin{enumerate} + \item nodes all call $propose(x)$ with their proposed value; + \item nodes all receive the same value as a return value, which is one of the proposed values + \end{enumerate} + \vspace{1em} + + \visible<2->{ + \textbf{Equivalent to} a distributed algorithm that gives a total order on all requests + } + \vspace{1em} + + \visible<3->{ + \textbf{Implemented by} this simple replicated state machine: + \vspace{.5em} + \begin{figure} + \centering + \def\svgwidth{.5\textwidth} + \large + \import{assets/}{consensus.pdf_tex} + \end{figure} + \vspace{1em} + } +\end{frame} + +\begin{frame} + \frametitle{Can my object be implemented without consensus?} + \underline{Given the specification of an API:} + \vspace{2em} \begin{itemize} - \item No Raft or Paxos - \vspace{1em} - \item Internal data types are CRDTs + \item \textbf{Using this API, we can implement the consensus object} (the $propose$ function)\\ + $\to$ the API is equivalent to consensus/total ordering of messages\\ + $\to$ the API cannot be implemented in a weakly consistent system + \vspace{2em} + \item \textbf{This API can be implemented using only weak primitives}\\ + (e.g. a bunch of atomic registers)\\ + $\to$ the API is strictly weaker than consensus\\ + $\to$ we can implement it in Garage! + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Why avoid consensus?} + Consensus can be implemented reasonably well in practice, so why avoid it? + \vspace{2em} + \begin{itemize} + \item \textbf{Software complexity:} RAFT and PAXOS are complex beasts;\\ + harder to prove, harder to reason about + \vspace{1.5em} + \item \textbf{Performance issues:} \vspace{1em} - \item All nodes are equivalent (no master/leader/index node) + \begin{itemize} + \item The leader is a \textbf{bottleneck} for all requests + \vspace{1em} + \item Particularly \textbf{sensitive to higher latency} between nodes + \end{itemize} \end{itemize} - \vspace{2em} - $\to$ less sensitive to higher latencies between nodes \end{frame} \begin{frame} - \frametitle{Consistency model} + \frametitle{What can we implement without consensus?} \begin{itemize} - \item Not ACID (not required by S3 spec) / not linearizable + \item Any \textbf{conflict-free replicated data type} (CRDT) + \vspace{1em} + \item Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\ + a \textbf{last-writer-wins registry} + \vspace{1em} + \item \textbf{Read-after-write consistency} can be implemented + using quorums on read and write operations \vspace{1em} - \item \textbf{Read-after-write consistency}\\ - {\footnotesize (stronger than eventual consistency)} + \item \textbf{Monotonicity of reads} can be implemented with repair-on-read\\ + (makes reads more costly, not implemented in Garage) \end{itemize} \end{frame} |