aboutsummaryrefslogtreecommitdiff
path: root/doc/talks/2024-02-03-fosdem/talk.tex
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2024-01-23 16:50:30 +0100
committer Alex Auvolat <alex@adnab.me> 2024-01-23 16:50:30 +0100
commit c2541f280c0b267bbaf71702b2966c91a4c5105b (patch)
tree 01280a5b2673b91e4f0b34869a319ab8cd9c5b23 /doc/talks/2024-02-03-fosdem/talk.tex
parent 4de7ac60232d521d7b31bddc0768002894cecf9d (diff)
download garage-c2541f280c0b267bbaf71702b2966c91a4c5105b.tar.gz
garage-c2541f280c0b267bbaf71702b2966c91a4c5105b.zip
[talk-fosdem-24] WIP, write talk, modify lots of assets
Diffstat (limited to 'doc/talks/2024-02-03-fosdem/talk.tex')
-rw-r--r-- doc/talks/2024-02-03-fosdem/talk.tex 844
1 files changed, 227 insertions, 617 deletions
diff --git a/doc/talks/2024-02-03-fosdem/talk.tex b/doc/talks/2024-02-03-fosdem/talk.tex
index 4c4dcdbc..b7254dc2 100644
--- a/doc/talks/2024-02-03-fosdem/talk.tex
+++ b/doc/talks/2024-02-03-fosdem/talk.tex
@@ -10,6 +10,7 @@
\usepackage{graphicx}
\usepackage{import}
\usepackage{adjustbox}
+\usepackage[absolute,overlay]{textpos}
%\useoutertheme[footline=authortitle,subsection=false]{miniframes}
%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes}
\useoutertheme{infolines}
@@ -86,7 +87,7 @@
\begin{columns}[t]
\begin{column}{.2\textwidth}
\centering
- \adjincludegraphics[width=.5\linewidth, valign=t]{../assets/deuxfleurs.pdf}
+ \adjincludegraphics[width=.5\linewidth, valign=t]{../assets/logos/deuxfleurs.pdf}
\end{column}
\begin{column}{.6\textwidth}
\textbf{Deuxfleurs}\\
@@ -95,7 +96,7 @@
\end{column}
\begin{column}{.2\textwidth}
\centering
- \adjincludegraphics[width=.7\linewidth, valign=t]{../assets/logo_chatons.png}
+ \adjincludegraphics[width=.7\linewidth, valign=t]{../assets/logos/logo_chatons.png}
\end{column}
\end{columns}
@@ -116,39 +117,25 @@
\vspace{2em}
\begin{center}
\textbf{\underline{Resilience}}\\
- {\footnotesize (we want good uptime/availability with low supervision)}
+ {\footnotesize we want good uptime/availability with low supervision}
\end{center}
}
\end{frame}
\begin{frame}
- \frametitle{How to make a \underline{stable} system}
+ \frametitle{Building a resilient system with cheap stuff}
- Enterprise-grade systems typically employ:
- \vspace{1em}
- \begin{itemize}
- \item RAID
- \item Redundant power grid + UPS
- \item Redundant Internet connections
- \item Low-latency links
- \item ...
- \end{itemize}
- \vspace{1em}
- $\to$ it's costly and only worth it at DC scale
-\end{frame}
-
-\begin{frame}
- \frametitle{How to make a \underline{resilient} system}
-
- \only<1,4-5>{
- Instead, we use:
- \vspace{1em}
+ \only<1,4-7>{
\begin{itemize}
- \item \textcolor<2->{gray}{Commodity hardware (e.g. old desktop PCs)}
+ \item \textcolor<5->{gray}{Commodity hardware (e.g. old desktop PCs)\\
\vspace{.5em}
- \item<4-> \textcolor<5->{gray}{Commodity Internet (e.g. FTTB, FTTH) and power grid}
+ \visible<4->{{\footnotesize (can die at any time)}}}
+ \vspace{1.5em}
+ \item<5-> \textcolor<7->{gray}{Regular Internet (e.g. FTTB, FTTH) and power grid connections\\
\vspace{.5em}
- \item<5-> \textcolor<6->{gray}{\textbf{Geographical redundancy} (multi-site replication)}
+ \visible<6->{{\footnotesize (can be unavailable randomly)}}}
+ \vspace{1.5em}
+ \item<7-> \textbf{Geographical redundancy} (multi-site replication)
\end{itemize}
}
\only<2>{
@@ -161,7 +148,7 @@
\includegraphics[width=.8\linewidth]{../assets/atuin.jpg}
\end{center}
}
- \only<6>{
+ \only<8>{
\begin{center}
\includegraphics[width=.8\linewidth]{../assets/inframap_jdll2023.pdf}
\end{center}
@@ -171,22 +158,20 @@
\begin{frame}
\frametitle{Object storage: a crucial component}
\begin{center}
- \includegraphics[height=6em]{../assets/Amazon-S3.jpg}
+ \includegraphics[height=6em]{../assets/logos/Amazon-S3.jpg}
\hspace{3em}
- \includegraphics[height=5em]{../assets/minio.png}
+ \visible<2->{\includegraphics[height=5em]{../assets/logos/minio.png}}
\hspace{3em}
- \includegraphics[height=6em]{../../logo/garage_hires_crop.png}
+ \visible<3->{\includegraphics[height=6em]{../../logo/garage_hires_crop.png}}
\end{center}
\vspace{1em}
S3: a de-facto standard, many compatible applications
\vspace{1em}
-
- MinIO is self-hostable but not suited for geo-distributed deployments
+ \visible<2->{MinIO is self-hostable but not suited for geo-distributed deployments}
\vspace{1em}
-
- \textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store}
+ \visible<3->{\textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store}}
\end{frame}
\begin{frame}
@@ -194,30 +179,30 @@
Consensus can be implemented reasonably well in practice, so why avoid it?
\vspace{1em}
\begin{itemize}
- \item \textbf{Software complexity}
+ \item<2-> \textbf{Software complexity}
\vspace{1em}
- \item \textbf{Performance issues:}
+ \item<3-> \textbf{Performance issues:}
\vspace{.5em}
\begin{itemize}
- \item The leader is a \textbf{bottleneck} for all requests\\
+ \item<4-> The leader is a \textbf{bottleneck} for all requests\\
\vspace{.5em}
- \item \textbf{Sensitive to higher latency} between nodes
+ \item<5-> \textbf{Sensitive to higher latency} between nodes
\vspace{.5em}
- \item \textbf{Takes time to reconverge} when disrupted (e.g. node going down)
+ \item<6-> \textbf{Takes time to reconverge} when disrupted (e.g. node going down)
\end{itemize}
\end{itemize}
\vspace{2em}
- $\to$ Garage uses only CRDTs internally (conflict-free replicated data types)
+ \visible<7->{\underline{Internally, Garage uses only CRDTs} (conflict-free replicated data types)}
\end{frame}
\begin{frame}
\frametitle{The data model of object storage}
- Object storage is basically a key-value store:
- \vspace{1em}
+ Object storage is basically a \textbf{key-value store}:
+ \vspace{.5em}
- {\footnotesize
+ {\scriptsize
\begin{center}
- \begin{tabular}{|l|p{8cm}|}
+ \begin{tabular}{|l|p{7cm}|}
\hline
\textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\
\hline
@@ -242,28 +227,33 @@
}
\vspace{1em}
- Simple interface, compatible with many existing applications
-
- \vspace{1em}
- Maps well to CRDT data types
+ \begin{itemize}
+ \item<2> Maps well to CRDT data types
+ \end{itemize}
\end{frame}
\begin{frame}
\frametitle{Performance gains in practice}
\begin{center}
- \includegraphics[width=.8\linewidth]{../assets/endpoint_latency_0.7_0.8_minio.png}
+ \includegraphics[width=.8\linewidth]{../assets/perf/endpoint_latency_0.7_0.8_minio.png}
\end{center}
\end{frame}
+
+% ======================================== TIMELINE
+% ======================================== TIMELINE
+% ======================================== TIMELINE
+
+\section{Recent developments}
+
+% ====================== v0.7.0 ===============================
+
\begin{frame}
- \frametitle{Timeline}
\begin{center}
\includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf}
\end{center}
\end{frame}
-% ====================== v0.7.0 ===============================
-
\begin{frame}
\frametitle{April 2022 - Garage v0.7.0}
Focus on \underline{observability and ecosystem integration}
@@ -271,8 +261,7 @@
\begin{itemize}
\item \textbf{Monitoring:} metrics and traces, using OpenTelemetry
\vspace{1em}
- \item Alternative replication modes with 1 or 2 copies,\\
- modes with weaker consistency
+ \item Replication modes with 1 or 2 copies / weaker consistency
\vspace{1em}
\item Kubernetes integration
\vspace{1em}
@@ -285,20 +274,26 @@
\begin{frame}
\frametitle{Metrics (Prometheus + Grafana)}
\begin{center}
- \includegraphics[width=.9\linewidth]{../assets/grafana_dashboard.png}
+ \includegraphics[width=.9\linewidth]{../assets/screenshots/grafana_dashboard.png}
\end{center}
\end{frame}
\begin{frame}
\frametitle{Traces (Jaeger)}
\begin{center}
- \includegraphics[width=.8\linewidth]{../assets/jaeger_listobjects.png}
+ \includegraphics[width=.8\linewidth]{../assets/screenshots/jaeger_listobjects.png}
\end{center}
\end{frame}
% ====================== v0.8.0 ===============================
\begin{frame}
+ \begin{center}
+ \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf}
+ \end{center}
+\end{frame}
+
+\begin{frame}
\frametitle{November 2022 - Garage v0.8.0}
Focus on \underline{performance}
\vspace{2em}
@@ -315,8 +310,8 @@
\begin{frame}
\frametitle{About metadata DB engines}
- Issues with Sled:
- \vspace{2em}
+ \textbf{Issues with Sled:}
+ \vspace{1em}
\begin{itemize}
\item Huge files on disk
\vspace{.5em}
@@ -326,8 +321,9 @@
\vspace{.5em}
\item Not actively maintained
\end{itemize}
+
\vspace{2em}
- LMDB: very stable, good performance, reasonably small files on disk
+ \textbf{LMDB:} very stable, good performance, reasonably small files on disk
\vspace{1em}
Sled will be removed in Garage v1.0
@@ -336,7 +332,7 @@
\begin{frame}
\frametitle{DB engine performance comparison}
\begin{center}
- \includegraphics[width=.6\linewidth]{../assets/db_engine.png}
+ \includegraphics[width=.6\linewidth]{../assets/perf/db_engine.png}
\end{center}
NB: Sqlite was slow due to synchronous journaling mode, now configurable
\end{frame}
@@ -352,20 +348,26 @@
\begin{frame}
\frametitle{TTFB benchmark}
\begin{center}
- \includegraphics[width=.8\linewidth]{../assets/ttfb.png}
+ \includegraphics[width=.8\linewidth]{../assets/perf/ttfb.png}
\end{center}
\end{frame}
\begin{frame}
\frametitle{Throughput benchmark}
\begin{center}
- \includegraphics[width=.7\linewidth]{../assets/io-0.7-0.8-minio.png}
+ \includegraphics[width=.7\linewidth]{../assets/perf/io-0.7-0.8-minio.png}
\end{center}
\end{frame}
% ====================== v0.9.0 ===============================
\begin{frame}
+ \begin{center}
+ \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf}
+ \end{center}
+\end{frame}
+
+\begin{frame}
\frametitle{October 2023 - Garage v0.9.0}
Focus on \underline{streamlining \& usability}
\vspace{2em}
@@ -389,91 +391,72 @@
\begin{frame}
\frametitle{Layout computation}
- \begin{center}
- \includegraphics[width=\linewidth]{../assets/location-aware.png}
- \end{center}
- \vspace{2em}
- Garage replicates data on different zones when possible
-\end{frame}
-
-\begin{frame}
- \frametitle{Layout computation}
+ \begin{overprint}
+ \onslide<1>
\begin{center}
- \includegraphics[width=.8\linewidth]{../assets/map.png}
+ \includegraphics[width=\linewidth, trim=0 0 0 -4cm]{../assets/screenshots/garage_status_0.9_prod_zonehl.png}
\end{center}
-\end{frame}
-
-\begin{frame}
- \frametitle{Optimal layout computation}
+ \onslide<2>
\begin{center}
- \includegraphics[width=.6\linewidth]{../assets/geodistrib_paper.png}
+ \includegraphics[width=.7\linewidth]{../assets/map.png}
\end{center}
+ \end{overprint}
+ \vspace{1em}
+ Garage stores replicas on different zones when possible
\end{frame}
\begin{frame}
\frametitle{What a "layout" is}
- \textbf{A layout is a precomputed index table}
- \vspace{2em}
+ \textbf{A layout is a precomputed index table:}
+ \vspace{1em}
- \begin{center}
- \begin{tabular}{|l|l|l|l|}
- \hline
- \textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\
- \hline
- \hline
- Partition 0 & Io (jupiter) & Drosera (atuin) & Courgette (neptune) \\
- \hline
- Partition 1 & Datura (atuin) & Courgette (neptune) & Io (jupiter) \\
- \hline
- Partition 2 & Io(jupiter) & Celeri (neptune) & Drosera (atuin) \\
- \hline
- \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\
- \hline
- Partition 255 & Concombre (neptune) & Io (jupiter) & Drosera (atuin) \\
- \hline
- \end{tabular}
- \end{center}
+ {\footnotesize
+ \begin{center}
+ \begin{tabular}{|l|l|l|l|}
+ \hline
+ \textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\
+ \hline
+ \hline
+ Partition 0 & Io (jupiter) & Drosera (atuin) & Courgette (neptune) \\
+ \hline
+ Partition 1 & Datura (atuin) & Courgette (neptune) & Io (jupiter) \\
+ \hline
+ Partition 2 & Io (jupiter) & Celeri (neptune) & Drosera (atuin) \\
+ \hline
+ \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\
+ \hline
+ Partition 255 & Concombre (neptune) & Io (jupiter) & Drosera (atuin) \\
+ \hline
+ \end{tabular}
+ \end{center}
+ }
- \vspace{1em}
+ \vspace{2em}
+ \visible<2->{
The index table is built centrally using an optimal algorithm,\\
then propagated to all nodes
+ }
\end{frame}
\begin{frame}
- \frametitle{The relationship between \emph{partition} and \emph{partition key}}
- \begin{center}
- \begin{tabular}{|l|l|l|l|}
- \hline
- \textbf{Partition key} & \textbf{Partition} & \textbf{Sort key} & \textbf{Value} \\
- \hline
- \hline
- \texttt{website} & Partition 12 & \texttt{index.html} & (file data) \\
- \hline
- \texttt{website} & Partition 12 & \texttt{img/logo.svg} & (file data) \\
- \hline
- \texttt{website} & Partition 12 &\texttt{download/index.html} & (file data) \\
- \hline
- \hline
- \texttt{backup} & Partition 42 & \texttt{borg/index.2822} & (file data) \\
- \hline
- \texttt{backup} & Partition 42 & \texttt{borg/data/2/2329} & (file data) \\
- \hline
- \texttt{backup} & Partition 42 & \texttt{borg/data/2/2680} & (file data) \\
- \hline
- \hline
- \texttt{private} & Partition 42 & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\
- \hline
- \end{tabular}
- \end{center}
- \vspace{1em}
- \textbf{To read or write an item:} hash partition key
- \\ \hspace{5cm} $\to$ determine partition number (first 8 bits)
- \\ \hspace{5cm} $\to$ find associated nodes
+ \frametitle{Optimal layout computation}
+ \begin{figure}
+ \centering
+ \includegraphics[width=.6\linewidth]{../assets/geodistrib_paper.png}
+ \end{figure}
\end{frame}
+
+
% ====================== v0.10.0 ===============================
\begin{frame}
+ \begin{center}
+ \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf}
+ \end{center}
+\end{frame}
+
+\begin{frame}
\frametitle{October 2023 - Garage v0.10.0 beta}
Focus on \underline{consistency}
\vspace{2em}
@@ -482,386 +465,150 @@
\end{itemize}
\end{frame}
-
-
-
-% --------------------------------------------------------------------
-% =================================================================================
-% =================================================================================
-% --------------------------------------------------------------------
-% --------------------------------------------------------------------
-% --------------------------------------------------------------------
-% =================================================================================
-% =================================================================================
-% --------------------------------------------------------------------
-
-
-
-
\begin{frame}
- \frametitle{Two big problems}
- \begin{enumerate}
- \item \textbf{How to place data on different nodes?}\\
- \vspace{1em}
- \underline{Constraints:} heterogeneous hardware\\
- \underline{Objective:} $n$ copies of everything, maximize usable capacity, maximize resilience\\
- \vspace{1em}
- $\to$ the Dynamo model + optimization algorithms
- \vspace{2em}
- \item<2-> \textbf{How to guarantee consistency?}\\
+ \frametitle{Working with weak consistency}
+ Not using consensus limits us to the following:
+ \vspace{2em}
+ \begin{itemize}
+ \item<2-> \textbf{Conflict-free replicated data types} (CRDT)\\
\vspace{1em}
- \underline{Constraints:} slow network (geographical distance), node unavailability/crashes\\
- \underline{Objective:} maximize availability, read-after-write guarantee\\
+ {\footnotesize Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\
+ a map of \textbf{last-writer-wins registers} (each key is its own CRDT)}
+ \vspace{1.5em}
+ \item<3-> \textbf{Read-after-write consistency}\\
\vspace{1em}
- $\to$ CRDTs, monotonicity, read and write quorums
- \end{enumerate}
+ {\footnotesize Can be implemented using quorums on read and write operations}
+ \end{itemize}
\end{frame}
-\section{Problem 1: placing data}
-
-\begin{frame}
- \frametitle{Key-value stores, upgraded: the Dynamo model}
- \textbf{Two keys:}
- \begin{itemize}
- \item Partition key: used to divide data into partitions {\small (a.k.a.~shards)}
- \item Sort key: used to identify items inside a partition
- \end{itemize}
+\begin{frame}[t]
+ \frametitle{CRDT read-after-write consistency using quorums}
\vspace{1em}
+ {\small
+ \textbf{Property:} If node $A$ did an operation $write(x)$ and received an OK response,\\
+ \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received OK,\\
+ \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$.
+ }
- \begin{center}
- \begin{tabular}{|l|l|p{3cm}|}
- \hline
- \textbf{Partition key: bucket} & \textbf{Sort key: filename} & \textbf{Value} \\
- \hline
- \hline
- \texttt{website} & \texttt{index.html} & (file data) \\
- \hline
- \texttt{website} & \texttt{img/logo.svg} & (file data) \\
- \hline
- \texttt{website} & \texttt{download/index.html} & (file data) \\
- \hline
- \hline
- \texttt{backup} & \texttt{borg/index.2822} & (file data) \\
- \hline
- \texttt{backup} & \texttt{borg/data/2/2329} & (file data) \\
- \hline
- \texttt{backup} & \texttt{borg/data/2/2680} & (file data) \\
- \hline
- \hline
- \texttt{private} & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\
- \hline
- \end{tabular}
- \end{center}
-\end{frame}
-
-\begin{frame}
- \frametitle{Key-value stores, upgraded: the Dynamo model}
- \begin{itemize}
- \item Data with different partition keys is stored independently,\\
- on a different set of nodes\\
- \vspace{.5em}
- $\to$ no easy way to list all partition keys\\
- $\to$ no cross-shard transactions\\
- \vspace{2em}
- \item Placing data: hash the partition key, select nodes accordingly\\
- \vspace{.5em}
- $\to$ distributed hash table (DHT)
- \vspace{2em}
- \item For a given value of the partition key, items can be listed using their sort keys
- \end{itemize}
+ \vspace{1.5em}
+ \begin{overprint}
+ \onslide<2-9>
+ \begin{figure}
+ \centering
+ \footnotesize
+ \def\svgwidth{.7\textwidth}
+ \only<2>{\import{../assets/lattice/}{lattice1.pdf_tex}}%
+ \only<3>{\import{../assets/lattice/}{lattice2.pdf_tex}}%
+ \only<4>{\import{../assets/lattice/}{lattice3.pdf_tex}}%
+ \only<5>{\import{../assets/lattice/}{lattice4.pdf_tex}}%
+ \only<6>{\import{../assets/lattice/}{lattice5.pdf_tex}}%
+ \only<7>{\import{../assets/lattice/}{lattice6.pdf_tex}}%
+ \only<8>{\import{../assets/lattice/}{lattice7.pdf_tex}}%
+ \only<9>{\import{../assets/lattice/}{lattice8.pdf_tex}}%
+ \end{figure}
+
+ \onslide<10>
+ \begin{minipage}{.10\textwidth}
+ ~
+ \end{minipage}
+ \begin{minipage}{.40\textwidth}
+ \footnotesize
+ \textbf{Algorithm $write(x)$:}
+ \begin{enumerate}
+ \item Broadcast $write(x)$ to all nodes
+ \item Wait for $k > n/2$ nodes to reply OK
+ \item Return OK
+ \end{enumerate}
+ \end{minipage}
+ \begin{minipage}{.40\textwidth}
+ \footnotesize
+ \vspace{1em}
+ \textbf{Algorithm $read()$:}
+ \begin{enumerate}
+ \item Broadcast $read()$ to all nodes
+ \item Wait for $k > n/2$ nodes to reply\\
+ with values $x_1, \dots, x_k$
+ \item Return $x_1 \sqcup \dots \sqcup x_k$
+ \end{enumerate}
+ \end{minipage}
+ \end{overprint}
\end{frame}
-
\begin{frame}
- \frametitle{Issues with consistent hashing}
+ \frametitle{A hard problem: layout changes}
\begin{itemize}
- \item Consistent hashing doesn't dispatch data based on geographical location of nodes
- \vspace{1em}
- \item<2-> Geographically aware adaptation, try 1:\\
- data quantities not well balanced between nodes
- \vspace{1em}
- \item<3-> Geographically aware adaptation, try 2:\\
- too many reshuffles when adding/removing nodes
+ \item We rely on quorums $k > n/2$ within each partition:\\
+ $$n=3,~~~~~~~k\ge 2$$
+ \item<2-> When rebalancing, the set of nodes responsible for a partition can change:\\
+ $$\{A, B, C\} \to \{A, D, E\}$$
+ \vspace{.01em}
+ \item<3-> During the rebalancing, $D$ and $E$ don't yet have the data,\\
+ ~~~~~~~~~~~~~~~~~~~and $B$ and $C$ want to get rid of the data to free up space\\
+ \vspace{1.2em}
+ $\to$ risk of inconsistency, \textbf{how to coordinate?}
\end{itemize}
\end{frame}
-
-\section{Problem 2: ensuring consistency}
-
\begin{frame}
- \frametitle{Consensus vs weak consistency}
-
- \hspace{1em}
- \begin{minipage}{7cm}
- \textbf{Consensus-based systems:}
- \vspace{1em}
+ \frametitle{Handling layout changes without losing consistency}
+ \begin{minipage}{.55\textwidth}
\begin{itemize}
- \item \textbf{Leader-based:} a leader is elected to coordinate
- all reads and writes
- \vspace{1em}
- \item \textbf{Linearizability} of all operations\\
- (strongest consistency guarantee)
+ \item \textbf{Solution:}\\
+ \vspace{.5em}
+ \begin{itemize}
+ \item keep track of data transfer to new nodes
+ \vspace{.5em}
+ \item use multiple write quorums\\
+ (new nodes + old nodes\\
+ while data transfer is in progress)
+ \vspace{.5em}
+ \item switching reads to new nodes\\
+ only once copy is finished
+ \end{itemize}
\vspace{1em}
- \item Any sequential specification can be implemented as a \textbf{replicated state machine}
+ \item \textbf{Implemented} in v0.10
\vspace{1em}
- \item \textbf{Costly}, the leader is a bottleneck;
- leader elections on failure take time
+ \item \textbf{Validated} with Jepsen testing
\end{itemize}
\end{minipage}
- \hfill
- \begin{minipage}{7cm} \visible<2->{
- \textbf{Weakly consistent systems:}
- \vspace{1em}
- \begin{itemize}
- \item \textbf{Nodes are equivalent}, any node
- can originate a read or write operation
- \vspace{1em}
- \item \textbf{Read-after-write consistency} with quorums,
- eventual consistency without
- \vspace{1em}
- \item \textbf{Operations have to commute}, i.e.~we
- can only implement CRDTs
- \vspace{1em}
- \item \textbf{Fast}, no single bottleneck;\\
- works the same with offline nodes
- \end{itemize}
- } \end{minipage}
- \hspace{1em}
-\end{frame}
-
-\begin{frame}
- \frametitle{Consensus vs weak consistency}
- \begin{center}
- \textbf{From a theoretical point of view:}\\
-
- \end{center}
- \vspace{2em}
-
- \hspace{1em}
- \begin{minipage}{6.5cm}
- \underline{Consensus-based systems:}
-
- \vspace{1em}
-
- Require \textbf{additional assumptions} such as a fault detector or a strong RNG\\
- (FLP impossibility theorem)
+ \begin{minipage}{.23\textwidth}
+ \includegraphics[width=3cm]{../assets/jepsen-0.9.png}\\
+ {\footnotesize Garage v0.9.0}
\end{minipage}
- \hfill
- \begin{minipage}{6.5cm}
- \underline{Weakly consistent systems:}
-
- \vspace{1em}
-
- Can be implemented in \textbf{any\\asynchronous message passing\\distributed system} with node crashes
+ \begin{minipage}{.2\textwidth}
+ \includegraphics[width=3cm]{../assets/jepsen-0.10.png}\\
+ {\footnotesize Garage v0.10 beta}
\end{minipage}
- \hspace{1em}
-
- \vspace{3em}
- \begin{center}
- They represent \textbf{different classes of computational capability}\\
- \end{center}
\end{frame}
+% ====================== v1.0 ===============================
+
\begin{frame}
- \frametitle{Consensus vs weak consistency}
\begin{center}
- \textbf{The same objects cannot be implemented in both models.}
+ \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf}
\end{center}
- \vspace{2em}
-
- \hspace{1em}
- \begin{minipage}{6.5cm}
- \underline{Consensus-based systems:}
-
- \vspace{1em}
-
- \textbf{Any sequential specification}\\~
-
- \vspace{1em}
- \textbf{Easier to program for}: just write your program as if it were sequential on a single machine
-
- \end{minipage}
- \hfill
- \begin{minipage}{6.5cm}
- \underline{Weakly consistent systems:}
-
- \vspace{1em}
-
- \textbf{Only CRDTs}\\(conflict-free replicated data types)
-
- \vspace{1em}
- Part of the complexity is \textbf{reported to the consumer of the API}\\~
- \end{minipage}
- \hspace{1em}
-\end{frame}
-
-\begin{frame}
- \frametitle{Understanding the power of consensus}
- \textbf{Consensus:} an API with a single operation, $propose(x)$
- \begin{enumerate}
- \item nodes all call $propose(x)$ with their proposed value;
- \item nodes all receive the same value as a return value, which is one of the proposed values
- \end{enumerate}
- \vspace{1em}
-
- \visible<2->{
- \textbf{Equivalent to} a distributed algorithm that gives a total order on all requests
- }
- \vspace{1em}
-
- \visible<3->{
- \textbf{Implemented by} this simple replicated state machine:
- \vspace{.5em}
- \begin{figure}
- \centering
- \def\svgwidth{.5\textwidth}
- \large
- \import{assets/}{consensus.pdf_tex}
- \end{figure}
- \vspace{1em}
- }
\end{frame}
\begin{frame}
- \frametitle{Can my object be implemented without consensus?}
- \underline{Given the specification of an API:}
+ \frametitle{Towards v1.0}
+ Focus on \underline{security \& stability}
\vspace{2em}
\begin{itemize}
- \item \textbf{Using this API, we can implement the consensus object} (the $propose$ function)\\
- $\to$ the API is equivalent to consensus/total ordering of messages\\
- $\to$ the API cannot be implemented in a weakly consistent system
- \vspace{2em}
- \item<2-> \textbf{This API can be implemented using only weak primitives}\\
- (e.g. in the asynchronous message passing model with no further assumption)\\
- $\to$ the API is strictly weaker than consensus\\
- $\to$ we can implement it in Garage!
- \end{itemize}
-\end{frame}
-
-
-\begin{frame}
- \frametitle{What can we implement without consensus?}
- \begin{itemize}
- \item Any \textbf{conflict-free replicated data type} (CRDT)
+ \item \textbf{Security audit} in progress by Radically Open Security
\vspace{1em}
- \item<2-> Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\
- a map of \textbf{last-writer-wins registers} (each key is its own CRDT)
+ \item Misc.\ S3 features (SSE-C, \dots) and compatibility fixes
\vspace{1em}
- \item<3-> \textbf{Read-after-write consistency} can be implemented
- using quorums on read and write operations
+ \item Improve UX
\vspace{1em}
- \item<4-> \textbf{Monotonicity of reads} can be implemented with repair-on-read\\
- (makes reads more costly, not implemented in Garage)
+ \item Fix bugs
\end{itemize}
\end{frame}
-\begin{frame}
- \frametitle{CRDTs and quorums: read-after-write consistency}
- \begin{figure}
- \centering
- \def\svgwidth{.8\textwidth}
- \only<1>{\import{assets/}{lattice1.pdf_tex}}%
- \only<2>{\import{assets/}{lattice2.pdf_tex}}%
- \only<3>{\import{assets/}{lattice3.pdf_tex}}%
- \only<4>{\import{assets/}{lattice4.pdf_tex}}%
- \only<5>{\import{assets/}{lattice5.pdf_tex}}%
- \only<6>{\import{assets/}{lattice6.pdf_tex}}%
- \only<7>{\import{assets/}{lattice7.pdf_tex}}%
- \only<8>{\import{assets/}{lattice8.pdf_tex}}%
- \end{figure}
-\end{frame}
-
-\begin{frame}
- \frametitle{CRDTs and quorums: read-after-write consistency}
- \textbf{Property:} If node $A$ did an operation $write(x)$ and received an OK response,\\
- \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received OK,\\
- \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$.
-
- \vspace{1em}
-
- \hspace{1em}
- \begin{minipage}{6.8cm}
- \textbf{Algorithm $write(x)$:}
- \begin{enumerate}
- \item Broadcast $write(x)$ to all nodes
- \item Wait for $k > n/2$ nodes to reply OK
- \item Return OK
- \end{enumerate}
- \end{minipage}
- \hfill
- \begin{minipage}{6.8cm}
- \vspace{1em}
- \textbf{Algorithm $read()$:}
- \begin{enumerate}
- \item Broadcast $read()$ to all nodes
- \item Wait for $k > n/2$ nodes to reply\\
- with values $x_1, \dots, x_k$
- \item Return $x_1 \sqcup \dots \sqcup x_k$
- \end{enumerate}
- \end{minipage}
- \hspace{1em}
-
- \vspace{2em}
- \textbf{Why does it work?} There is at least one node at the intersection between the two sets of nodes that replied to each request, that ``saw'' $x$ before the $read()$ started ($x_i \sqsupseteq x$).
-\end{frame}
-
-\begin{frame}
- \frametitle{CRDTs and quorums: monotonic-reads consistency}
- \begin{figure}
- \centering
- \def\svgwidth{.8\textwidth}
- \only<1>{\import{assets/}{latticeB_1.pdf_tex}}%
- \only<2>{\import{assets/}{latticeB_2.pdf_tex}}%
- \only<3>{\import{assets/}{latticeB_3.pdf_tex}}%
- \only<4>{\import{assets/}{latticeB_4.pdf_tex}}%
- \only<5>{\import{assets/}{latticeB_5.pdf_tex}}%
- \only<6>{\import{assets/}{latticeB_6.pdf_tex}}%
- \only<7>{\import{assets/}{latticeB_7.pdf_tex}}%
- \only<8>{\import{assets/}{latticeB_8.pdf_tex}}%
- \only<9>{\import{assets/}{latticeB_9.pdf_tex}}%
- \only<10>{\import{assets/}{latticeB_10.pdf_tex}}%
- \end{figure}
-\end{frame}
-
-\begin{frame}
- \frametitle{CRDTs and quorums: monotonic-reads consistency}
- \textbf{Property:} If node $A$ did an operation $read()$ and received $x$ as a response,\\
- \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received $x$,\\
- \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$.
-
- \vspace{1em}
-
- \textbf{Algorithm $monotonic\_read()$:} {\small (a.k.a. repair-on-read)}
- \begin{enumerate}
- \item Broadcast $read()$ to all nodes
- \item Wait for $k > n/2$ nodes to reply with values $x_1, \dots, x_k$
- \item If $x_i \ne x_j$ for some nodes $i$ and $j$,\\
- \hspace{1cm}then call $write(x_1 \sqcup \dots \sqcup x_k)$ and wait for OK from $k' > n/2$ nodes
- \item Return $x_1 \sqcup \dots \sqcup x_k$
- \end{enumerate}
-
- \vspace{1em}
-
- This makes reads slower in some cases, and is \textbf{not implemented in Garage}.
-\end{frame}
-
-\begin{frame}
- \frametitle{A hard problem: layout changes}
- \begin{itemize}
- \item We rely on quorums $k > n/2$ within each partition:\\
- $$n=3,~~~~~~~k\ge 2$$
- \item<2-> When rebalancing, the set of nodes responsible for a partition can change:\\
- $$\{n_A, n_B, n_C\} \to \{n_A, n_D, n_E\}$$
- \vspace{.01em}
- \item<3-> During the rebalancing, $D$ and $E$ don't yet have the data,\\
- ~~~~~~~~~~~~~~~~~~~and $B$ and $C$ want to get rid of the data to free up space\\
- \vspace{.2em}
- $\to$ quorums only within the new set of nodes don't work\\
- $\to$ how to coordinate? \textbf{currently, we don't...}
+% ======================================== OPERATING
+% ======================================== OPERATING
+% ======================================== OPERATING
- \end{itemize}
-\end{frame}
\section{Operating big Garage clusters}
@@ -869,9 +616,9 @@
\frametitle{Operating Garage}
\begin{center}
\only<1-2>{
- \includegraphics[width=.9\linewidth]{assets/scr_garage_status.png}
+ \includegraphics[width=.9\linewidth]{../assets/screenshots/garage_status_0.10.png}
\\\vspace{1em}
- \visible<2>{\includegraphics[width=.85\linewidth]{assets/scr_garage_status_broken.png}}
+ \visible<2>{\includegraphics[width=.9\linewidth]{../assets/screenshots/garage_status_unhealthy_0.10.png}}
}
\end{center}
\end{frame}
@@ -879,17 +626,17 @@
\begin{frame}
\frametitle{Garage's architecture}
\begin{center}
- \only<1>{\includegraphics[width=.45\linewidth]{assets/garage.drawio.pdf}}%
- \only<2>{\includegraphics[width=.6\linewidth]{assets/garage_sync.drawio.pdf}}%
+ \only<1>{\includegraphics[width=.45\linewidth]{../assets/garage.drawio.pdf}}%
+ \only<2>{\includegraphics[width=.6\linewidth]{../assets/garage_sync.drawio.pdf}}%
\end{center}
\end{frame}
\begin{frame}
\frametitle{Digging deeper}
\begin{center}
- \only<1>{\includegraphics[width=.9\linewidth]{assets/scr_garage_stats.png}}
- \only<2>{\includegraphics[width=.6\linewidth]{assets/scr_garage_worker_list.png}}
- \only<3>{\includegraphics[width=.6\linewidth]{assets/scr_garage_worker_get.png}}
+ \only<1>{\includegraphics[width=.9\linewidth]{../assets/screenshots/garage_stats_0.10.png}}
+ \only<2>{\includegraphics[width=.5\linewidth]{../assets/screenshots/garage_worker_list_0.10.png}}
+ \only<3>{\includegraphics[width=.6\linewidth]{../assets/screenshots/garage_worker_param_0.10.png}}
\end{center}
\end{frame}
@@ -910,7 +657,6 @@
\begin{itemize}
\item Lots of small files on disk
\item Processing the resync queue can be slow
- \item Multi-HDD support not yet released (soon!)
\end{itemize}
\end{itemize}
\end{frame}
@@ -926,7 +672,7 @@
\vspace{.5em}
\item Data block storage:
\begin{itemize}
- \item Wait for v0.9 with multi-HDD support
+ \item Use Garage's native multi-HDD support
\item XFS on individual drives
\item Increase block size (1MB $\to$ 10MB, requires more RAM and good networking)
\item Tune \texttt{resync-tranquility} and \texttt{resync-worker-count} dynamically
@@ -943,146 +689,10 @@
Current deployments: $< 10$ TB, we don't have much experience with more
\end{frame}
-\section{Going further than the S3 API}
-
-\begin{frame}
- \frametitle{Using Garage for everything}
- \begin{center}
- \only<1>{\includegraphics[width=.8\linewidth]{assets/slideB1.png}}%
- \only<2>{\includegraphics[width=.8\linewidth]{assets/slideB2.png}}%
- \only<3>{\includegraphics[width=.8\linewidth]{assets/slideB3.png}}%
- \end{center}
-\end{frame}
-
-\begin{frame}
- \frametitle{K2V Design}
- \begin{itemize}
- \item A new, custom, minimal API\\
- \vspace{.5em}
- \begin{itemize}
- \item Single-item operations
- \item Operations on ranges and batches of items
- \item Polling operations to help implement a PubSub pattern
- \end{itemize}
- \vspace{1em}
- \item<2-> Exposes the partitioning mechanism of Garage\\
- K2V = partition key / sort key / value (like Dynamo)
- \vspace{1em}
- \item<3-> Weakly consistent, CRDT-friendly\\
- $\to$ no support for transactions (not ACID)
- \vspace{1em}
- \item<4-> Cryptography-friendly: values are binary blobs
- \end{itemize}
-\end{frame}
-
-\begin{frame}
- \frametitle{Handling concurrent values}
- \textbf{How to handle concurrency?} Example:
- \vspace{1em}
- \begin{enumerate}
- \item Client $A$ reads the initial value of a key, $x_0$
- \vspace{1em}
- \item<2-> Client $B$ also reads the initial value $x_0$ of that key
- \vspace{1em}
- \item<3-> Client $A$ modifies $x_0$, and writes a new value $x_1$
- \vspace{1em}
- \item<4-> Client $B$ also modifies $x_0$, and writes a new value $x'_1$,\\
- without having a chance to first read $x_1$\\
- \vspace{1em}
- $\to$ what should the final state be?
- \end{enumerate}
-\end{frame}
-
-\begin{frame}
- \frametitle{Handling concurrent values}
- \begin{itemize}
- \item If we keep only $x_1$ or $x'_1$, we risk \textbf{losing application data}
- \vspace{1.5em}
- \item<2-> Values are opaque binary blobs, \textbf{K2V cannot resolve conflicts} by itself\\
- (e.g. by implementing a CRDT)
- \vspace{1.5em}
- \item<3-> Solution: \textbf{we keep both!}\\
- $\to$ the value of the key is now $\{x_1, x'_1\}$\\
- $\to$ the client application can decide how to resolve conflicts on the next read
- \end{itemize}
-\end{frame}
-
-\begin{frame}
- \frametitle{Keeping track of causality}
- How does K2V know that $x_1$ and $x'_1$ are concurrent?
- \vspace{1em}
- \begin{itemize}
- \item $read()$ returns \textbf{a set of values} and an associated \textbf{causality token}\\
- \vspace{1.5em}
- \item<2-> When calling $write()$, the client sends \textbf{the causality token from its last read}
- \vspace{1.5em}
- \item<3-> The causality token represents the set of values \textbf{already seen by the client}\\
- $\to$ those values are the \textbf{causal past} of the write operation\\
- $\to$ K2V can keep concurrent values and overwrite all those in the causal past
- \vspace{1.5em}
- \item<4-> Internally, the causality token is \textbf{a vector clock}
- \end{itemize}
-\end{frame}
-
-\begin{frame}
- \frametitle{Application: an e-mail storage server}
- \begin{center}
- \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme.png}}%
- \end{center}
-\end{frame}
-
-\begin{frame}
- \frametitle{Aerogramme data model}
- \begin{center}
- \only<1->{\includegraphics[width=.4\linewidth]{assets/aerogramme_datatype.drawio.pdf}}%
- \end{center}
- \visible<2->{Aerogramme encrypts all stored values for privacy\\
- (Garage server administrators can't read your mail)}
-\end{frame}
-
-\begin{frame}
- \frametitle{Different deployment scenarios}
- \begin{center}
- \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components1.drawio.pdf}}%
- \only<2>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components2.drawio.pdf}}%
- \end{center}
-\end{frame}
-
-\begin{frame}
- \frametitle{A new model for building resilient software}
- How to build an application using only Garage as a data store:
- \vspace{1em}
- \begin{enumerate}
- \item Design a data model suited to K2V\\
- {\footnotesize (see Cassandra docs on porting SQL data models to Cassandra)}
- \vspace{1em}
- \begin{itemize}
- \item Use CRDTs or other eventually consistent data types (see e.g. Bayou)
- \vspace{1em}
- \item Store opaque binary blobs to provide End-to-End Encryption\\
- \end{itemize}
- \vspace{1em}
- \item<2-> Store big blobs (files) using the S3 API
- \vspace{1em}
- \item<3-> Let Garage manage sharding, replication, failover, etc.
- \end{enumerate}
-\end{frame}
-
-\section{Conclusion}
-\begin{frame}
- \frametitle{Perspectives}
- \begin{itemize}
- \item Fix the consistency issue when rebalancing
- \vspace{1em}
- \item Write about Garage's architecture and properties,\\
- and about our proposed architecture for (E2EE) apps over K2V+S3
- \vspace{1em}
- \item Continue developing Garage; finish Aerogramme; build new applications...
- \vspace{1em}
- \item Anything else?
- \end{itemize}
-\end{frame}
+% ======================================== END
+% ======================================== END
+% ======================================== END
\begin{frame}
\frametitle{Where to find us}
@@ -1094,8 +704,8 @@
\texttt{\#garage:deuxfleurs.fr} on Matrix
\vspace{1.5em}
- \includegraphics[width=.06\linewidth]{assets/rust_logo.png}
- \includegraphics[width=.13\linewidth]{assets/AGPLv3_Logo.png}
+ \includegraphics[width=.06\linewidth]{../assets/logos/rust_logo.png}
+ \includegraphics[width=.13\linewidth]{../assets/logos/AGPLv3_Logo.png}
\end{center}
\end{frame}