From fe850f62c908492d2c3cbe4c55c3cf3b3d097de0 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 12 Jan 2023 16:27:02 +0100 Subject: Talk 2023-01-18: some WIP talking about consensus --- doc/talks/2023-01-18-tocatta/Makefile | 4 + doc/talks/2023-01-18-tocatta/assets/consensus.svg | 137 +++++++++++++++ .../2023-01-18-tocatta/assets/garage.drawio.pdf | Bin 26098 -> 0 bytes .../2023-01-18-tocatta/assets/garage.drawio.png | Bin 13463 -> 0 bytes doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2572439 -> 2594434 bytes doc/talks/2023-01-18-tocatta/talk.tex | 187 +++++++++++++++++++-- 6 files changed, 311 insertions(+), 17 deletions(-) create mode 100644 doc/talks/2023-01-18-tocatta/assets/consensus.svg delete mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf delete mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.png (limited to 'doc/talks') diff --git a/doc/talks/2023-01-18-tocatta/Makefile b/doc/talks/2023-01-18-tocatta/Makefile index 4d600178..4a967d24 100644 --- a/doc/talks/2023-01-18-tocatta/Makefile +++ b/doc/talks/2023-01-18-tocatta/Makefile @@ -3,6 +3,7 @@ ASSETS=assets/consistent_hashing_1.pdf \ assets/consistent_hashing_3.pdf \ assets/consistent_hashing_4.pdf \ assets/garage_tables.pdf \ + assets/consensus.pdf_tex \ assets/deuxfleurs.pdf talk.pdf: talk.tex $(ASSETS) @@ -10,3 +11,6 @@ talk.pdf: talk.tex $(ASSETS) assets/%.pdf: assets/%.svg inkscape -D -z --file=$^ --export-pdf=$@ + +assets/%.pdf_tex: assets/%.svg + inkscape -D -z --file=$^ --export-pdf=$@ --export-latex diff --git a/doc/talks/2023-01-18-tocatta/assets/consensus.svg b/doc/talks/2023-01-18-tocatta/assets/consensus.svg new file mode 100644 index 00000000..8321e383 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consensus.svg @@ -0,0 +1,137 @@ + + + + + + + + + + + + + + $\bot$ + + + + $x$ + + + $propose(x) / x$ + $propose(y) / x$ + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf deleted file mode 100644 index a54a163c..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png deleted file mode 100644 index 386dd862..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index 5acb9198..ba9bde3d 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index 566f56ec..ac9b4077 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -8,6 +8,7 @@ \usepackage{multirow} \usetheme{boxes} \usepackage{graphicx} +\usepackage{import} \usepackage{adjustbox} %\useoutertheme[footline=authortitle,subsection=false]{miniframes} %\useoutertheme[footline=authorinstitute,subsection=false]{miniframes} @@ -479,33 +480,185 @@ \section{Problem 2: ensuring consistency} -%\begin{frame} -% \frametitle{Garage's architecture} -% \begin{center} -% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf} -% \end{center} -%\end{frame} +\begin{frame} + \frametitle{Consensus vs weak consistency} + + \hspace{1em} + \begin{minipage}{7cm} + \textbf{Consensus-based systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Leader-based:} a leader is elected to coordinate + all reads and writes + \vspace{1em} + \item \textbf{Linearizability} of all operations\\ + (strongest consistency guarantee) + \vspace{1em} + \item \textbf{Replicated state machines} that can implement + any sequential specification + \vspace{1em} + \item \textbf{Costly}, the leader is a bottleneck; + leader elections on failure take time + \end{itemize} + \end{minipage} + \hfill + \begin{minipage}{7cm} \visible<2->{ + \textbf{Weakly consistent systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Nodes are equivalent}, any node + can originate a read or write operation + \vspace{1em} + \item \textbf{Read-after-write consistency} with quorums, + eventual consistency without + \vspace{1em} + \item \textbf{Operations have to commute}, i.e.~we + can only implement CRDTs + \vspace{1em} + \item \textbf{Fast}, no node is a bottleneck;\\ + works the same with offline nodes + \end{itemize} + } \end{minipage} + \hspace{1em} +\end{frame} + +\begin{frame} + \frametitle{Consensus vs weak consistency} + \begin{center} + \textbf{The same objects cannot be implemented in both models.} + \end{center} + \vspace{2em} + + \hspace{1em} + \begin{minipage}{7cm} + \underline{Consensus-based systems:} + + \vspace{1em} + + \textbf{Any sequential specification}\\~ + \end{minipage} + \hfill + \begin{minipage}{7cm} + \underline{Weakly consistent systems:} + + \vspace{1em} + + \textbf{CRDTs only}\\(conflict-free replicated data types) + \end{minipage} + \hspace{1em} + + \vspace{3em} + \begin{center} + Part of the complexity is \textbf{reported to the consumer of the API} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Consensus vs weak consistency} + \begin{center} + \textbf{From a theoretical point of view:}\\ + + \end{center} + \vspace{2em} + + \hspace{1em} + \begin{minipage}{6.5cm} + \underline{Consensus-based systems:} + + \vspace{1em} + + Require \textbf{additionnal assumptions} such as a fault detector or a strong RNG\\~ + \end{minipage} + \hfill + \begin{minipage}{6.5cm} + \underline{Weakly consistent systems:} + + \vspace{1em} + + Can be implemented in \textbf{any asynchronous message passing distributed system} + \end{minipage} + \hspace{1em} + + \vspace{3em} + \begin{center} + They represent \textbf{different classes of computational capability} + \end{center} +\end{frame} \begin{frame} - \frametitle{Garage is \emph{coordination-free}:} + \frametitle{Understanding the power of consensus} + \textbf{Consensus:} an API with a single operation, $propose(x)$ + \begin{enumerate} + \item nodes all call $propose(x)$ with their proposed value; + \item nodes all receive the same value as a return value, which is one of the proposed values + \end{enumerate} + \vspace{1em} + + \visible<2->{ + \textbf{Equivalent to} a distributed algorithm that gives a total order on all requests + } + \vspace{1em} + + \visible<3->{ + \textbf{Implemented by} this simple replicated state machine: + \vspace{.5em} + \begin{figure} + \centering + \def\svgwidth{.5\textwidth} + \large + \import{assets/}{consensus.pdf_tex} + \end{figure} + \vspace{1em} + } +\end{frame} + +\begin{frame} + \frametitle{Can my object be implemented without consensus?} + \underline{Given the specification of an API:} + \vspace{2em} \begin{itemize} - \item No Raft or Paxos - \vspace{1em} - \item Internal data types are CRDTs + \item \textbf{Using this API, we can implement the consensus object} (the $propose$ function)\\ + $\to$ the API is equivalent to consensus/total ordering of messages\\ + $\to$ the API cannot be implemented in a weakly consistent system + \vspace{2em} + \item \textbf{This API can be implemented using only weak primitives}\\ + (e.g. a bunch of atomic registers)\\ + $\to$ the API is strictly weaker than consensus\\ + $\to$ we can implement it in Garage! + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Why avoid consensus?} + Consensus can be implemented reasonably well in practice, so why avoid it? + \vspace{2em} + \begin{itemize} + \item \textbf{Software complexity:} RAFT and PAXOS are complex beasts;\\ + harder to prove, harder to reason about + \vspace{1.5em} + \item \textbf{Performance issues:} \vspace{1em} - \item All nodes are equivalent (no master/leader/index node) + \begin{itemize} + \item The leader is a \textbf{bottleneck} for all requests + \vspace{1em} + \item Particularly \textbf{sensitive to higher latency} between nodes + \end{itemize} \end{itemize} - \vspace{2em} - $\to$ less sensitive to higher latencies between nodes \end{frame} \begin{frame} - \frametitle{Consistency model} + \frametitle{What can we implement without consensus?} \begin{itemize} - \item Not ACID (not required by S3 spec) / not linearizable + \item Any \textbf{conflict-free replicated data type} (CRDT) + \vspace{1em} + \item Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\ + a \textbf{last-writer-wins registry} + \vspace{1em} + \item \textbf{Read-after-write consistency} can be implemented + using quorums on read and write operations \vspace{1em} - \item \textbf{Read-after-write consistency}\\ - {\footnotesize (stronger than eventual consistency)} + \item \textbf{Monotonicity of reads} can be implemented with repair-on-read\\ + (makes reads more costly, not implemented in Garage) \end{itemize} \end{frame} -- cgit v1.2.3