diff options
Diffstat (limited to 'doc/talks/2021-04-28_spirals-team/sota.tex')
-rw-r--r-- | doc/talks/2021-04-28_spirals-team/sota.tex | 323 |
1 files changed, 323 insertions, 0 deletions
diff --git a/doc/talks/2021-04-28_spirals-team/sota.tex b/doc/talks/2021-04-28_spirals-team/sota.tex new file mode 100644 index 00000000..23b9087a --- /dev/null +++ b/doc/talks/2021-04-28_spirals-team/sota.tex @@ -0,0 +1,323 @@ +\section{State of the art} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}{The CAP theorem}{Consistency vs. Availability} + +\begin{block}{Eric Brewer's theorem} +``A shared-state system can have \textbf{at most two} of the following properties at any given time: + +\begin{itemize} + \item \textbf{C}onsistency + \item \textbf{A}vailability + \item \textbf{P}artition tolerance'' +\end{itemize} +\end{block} + + +\begin{center} +\Large +Under network partitions, a distributed data store has to sacrifice either availability or consistency. +\end{center} +\vfill + +\begin{itemize} + \item \textbf{Consistency-first}: Abort incoming queries; + \item \textbf{Availability-first}: Return possibly stale data. +\end{itemize} + +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}{Consistency-first: the ACID model}{Consistency vs. Availability} + +\textbf{Transaction}: unit of work within an ACID data store. +%Comprises multiple operations. +%E.g. bank transfer. +%E.g. a bank transfer from A to B is a transaction involving two operations: withdraw money from A & credit B with the same money amount. +\vfill + +\begin{itemize} + \item \textbf{\underline{A}tomicity}: Transactions either complete entirely or fail. + + No transaction ever seen as in-progress. + + \item \textbf{\underline{C}onsistency}: Transactions always generate a valid state. + + The database maintains its invariants across transactions. + + \item \textbf{\underline{I}solation}: Concurrent transactions are seen as sequential. + + Transactions are serializable, or sequentially consistent. + + \item \textbf{\underline{D}urability}: Committed transactions are never forgotten. +\end{itemize} +\vfill\centering + +Reads are fast, writes are slow. + +\vfill\raggedright + +Example: relational databases. +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile]{Concurrent writes in ACID}{Consistency vs. Availability} + + +\begin{columns} +\column{.5\columnwidth} + \begin{block}{} + \begin{lstlisting} +transaction AcqDoses(y): + x <- SELECT #vaccines; + UPDATE #vaccines = (x + y); + \end{lstlisting} + \end{block} + \vspace{5ex} + + Supports compound operations. +\column{.5\columnwidth} +\centering +\includegraphics[width=\columnwidth]{figures/conflict_acid.pdf} +\end{columns} + +\end{frame} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}{Availability-first: the BASE model}{Consistency vs. Availability} + + +Some apps prefer availability, e.g. Amazon products' reviews. +\vfill + +The BASE model trades Consistency \& Isolation for Availability. + + +%Some applications do not care about strong consistency and prefer being highly available (e.g. Amazon's product reviews). + +%In order to achieve higher availability, the BASE model relaxes consistency constraints of the ACID model: "eventual consistency". +\vfill + +\begin{itemize} + \item \textbf{\underline{B}asic \underline{A}vailability}: + The data store thrives to be available. + + \item \textbf{\underline{S}oft-state}: + Replicas can disagree on the valid state. + + \item \textbf{\underline{E}ventual consistency}: + In the absence of write queries, + the data store will eventually converge to a single valid state. +\end{itemize} +\vfill\centering + +Writes are fast, reads are slow. + +\vfill\raggedright + +Examples: key-value \& object stores. + +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}{Concurrent writes in BASE}{Consistency vs. Availability} + +\begin{columns} +\column{.5\columnwidth} + \begin{block}{Object} + \begin{itemize} + \item Unique key + \item Arbitrary value + \item Metadata + \end{itemize} + \end{block} + \vspace{5ex} + + Conflict resolution = client's job! + \vspace{5ex} + + No compound operations. +\column{.5\columnwidth} + \centering + \includegraphics[width=\columnwidth]{figures/conflict_base.pdf} +\end{columns} + +% KV storage is another example, distinction is minor here + +% Object = unique key, arbitrary value, metadata. + +% Object storage only provides semantics to investigate causal order of queries *for individual objects*. No compound operations, no transactions. + +% Much easier to distribute, and "scale-out". + +% Write is fast, read is slow (gotta collect all object versions). + +% \todo{vaccines example with BASE model} + +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}{Strong Eventual Consistency w/ CRDTs}{Consistency vs. Availability} + +\centering\small + +\fullcite{defago_conflict-free_2011} + +\vfill\raggedright\normalsize + +\begin{block}{Strong Eventual Consistency (SEC)} + \begin{itemize} + \item CRDTs specify distributed operations + \item Conflicts will be solved according to specification + \item Proven \& bound eventual convergence + \end{itemize} +\end{block} + +\vfill\centering +\includegraphics[width=.5\columnwidth]{figures/crdt.pdf} + +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile]{Concurrent writes with CRDTs}{Consistency vs. Availability} + +\begin{columns} +\column{.5\columnwidth} + \begin{block}{} + \begin{lstlisting} +CRDT Counter(x0): + history = {} + op. incr(y): + history U= {(UUID(), y)} + op. decr(y): + history U= {(UUID(), -y)} + op. read(): + x = x0 + for (_, y) in history: + x += y + return x + \end{lstlisting} + \end{block} + \vspace{2ex} + + Operations commute? + + $\implies$ screw total order! +\column{.5\columnwidth} + \centering + \includegraphics[width=\columnwidth]{figures/conflict_crdt.pdf} +\end{columns} + +\end{frame} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}{A complex CRDT: the DAG}{Consistency vs. Availability} + +\centering +\only<1>{\includegraphics[height=\textheight]{figures/dag_crdt.png}}% +\only<2>{ + Just to say I swept a lot under the rug. + \vfill + + For details, go read: + + \fullcite{defago_conflict-free_2011} + \vfill + + For an implementation, check \textbf{AntidoteDB}. +} + +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}{State of the practice}{Path dependency to the ``cloud''} + +\begin{block}{The BASE model is fashionable because} +\centering + +``\emph{High-performance} object storage for \emph{AI analytics} with PBs of \emph{IoT data streams} at the \emph{edge}, using \emph{5G}.'' + % \begin{itemize} + % \item Highest performance + % \item IoT data streams are inherently distributed + % \end{itemize} +\end{block} + +\vfill\centering + +\includegraphics[width=.9\columnwidth]{figures/minio_edge.png} + +\vfill\raggedright + + +%\begin{block}{} +\begin{itemize} + \item Always backed by cloud: high performance network links. + \item Edge nodes always seen as clients or data sources, not peers. +\end{itemize} +%\end{block} + +% There is \textbf{always a central cloud cluster} in these use-cases. + +% Hidden constraint: \textbf{high performance inter-node connectivity}. + + + +\end{frame} + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \begin{frame}{A brief history of storage} + +% We keep it short because we'll follow chronological order in the next section too. + +% \end{frame} + + +% \begin{frame}{In the beginning, there were \emph{monoliths}} + +% \includegraphics[width=.5\columnwidth]{figures/stonehenge.jpg} + +% Web applications used to be monolithic: + +% \begin{itemize} +% \item One or two servers; +% \item Availability was not an obsession; +% \item Latency was acceptable. +% \end{itemize} + +% Relational databases were queens. + +% \end{frame} + + +% \begin{frame}{Then came \emph{expectations}} +% Then, the whole world went online, and suddenly: expectations! + +% \begin{itemize} +% \item ``Milliseconds matter.'' (Algolia slogan) +% \item Critical networked services (healthcare, logistics) need 100\% availability +% \end{itemize} + +% $\implies$ Microservices \& horizontal scalability. + +% \todo{Develop on the `herd not sheep' paradigm a bit.} + +% \end{frame} + + +% \begin{frame}{Distributing state/storage: the remaining unknown} + +% The microservices orchestration game works well for \emph{stateless} services. + +% However, any application requires \emph{state}, persistent data. + +% And this is tough. As we will now see. + +% (Not that it's not well studied: distributed storage has always been fashionable.) + +% \end{frame}
\ No newline at end of file |