diff options
author | Alex Auvolat <alex@adnab.me> | 2024-01-23 16:50:30 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2024-01-23 16:50:30 +0100 |
commit | c2541f280c0b267bbaf71702b2966c91a4c5105b (patch) | |
tree | 01280a5b2673b91e4f0b34869a319ab8cd9c5b23 /doc/talks/2024-02-03-fosdem/talk.tex | |
parent | 4de7ac60232d521d7b31bddc0768002894cecf9d (diff) | |
download | garage-c2541f280c0b267bbaf71702b2966c91a4c5105b.tar.gz garage-c2541f280c0b267bbaf71702b2966c91a4c5105b.zip |
[talk-fosdem-24] WIP, write talk, modify lots of assets
Diffstat (limited to 'doc/talks/2024-02-03-fosdem/talk.tex')
-rw-r--r-- | doc/talks/2024-02-03-fosdem/talk.tex | 844 |
1 files changed, 227 insertions, 617 deletions
diff --git a/doc/talks/2024-02-03-fosdem/talk.tex b/doc/talks/2024-02-03-fosdem/talk.tex index 4c4dcdbc..b7254dc2 100644 --- a/doc/talks/2024-02-03-fosdem/talk.tex +++ b/doc/talks/2024-02-03-fosdem/talk.tex @@ -10,6 +10,7 @@ \usepackage{graphicx} \usepackage{import} \usepackage{adjustbox} +\usepackage[absolute,overlay]{textpos} %\useoutertheme[footline=authortitle,subsection=false]{miniframes} %\useoutertheme[footline=authorinstitute,subsection=false]{miniframes} \useoutertheme{infolines} @@ -86,7 +87,7 @@ \begin{columns}[t] \begin{column}{.2\textwidth} \centering - \adjincludegraphics[width=.5\linewidth, valign=t]{../assets/deuxfleurs.pdf} + \adjincludegraphics[width=.5\linewidth, valign=t]{../assets/logos/deuxfleurs.pdf} \end{column} \begin{column}{.6\textwidth} \textbf{Deuxfleurs}\\ @@ -95,7 +96,7 @@ \end{column} \begin{column}{.2\textwidth} \centering - \adjincludegraphics[width=.7\linewidth, valign=t]{../assets/logo_chatons.png} + \adjincludegraphics[width=.7\linewidth, valign=t]{../assets/logos/logo_chatons.png} \end{column} \end{columns} @@ -116,39 +117,25 @@ \vspace{2em} \begin{center} \textbf{\underline{Resilience}}\\ - {\footnotesize (we want good uptime/availability with low supervision)} + {\footnotesize we want good uptime/availability with low supervision} \end{center} } \end{frame} \begin{frame} - \frametitle{How to make a \underline{stable} system} + \frametitle{Building a resilient system with cheap stuff} - Enterprise-grade systems typically employ: - \vspace{1em} - \begin{itemize} - \item RAID - \item Redundant power grid + UPS - \item Redundant Internet connections - \item Low-latency links - \item ... - \end{itemize} - \vspace{1em} - $\to$ it's costly and only worth it at DC scale -\end{frame} - -\begin{frame} - \frametitle{How to make a \underline{resilient} system} - - \only<1,4-5>{ - Instead, we use: - \vspace{1em} + \only<1,4-7>{ \begin{itemize} - \item \textcolor<2->{gray}{Commodity hardware (e.g. old desktop PCs)} + \item \textcolor<5->{gray}{Commodity hardware (e.g. old desktop PCs)\\ \vspace{.5em} - \item<4-> \textcolor<5->{gray}{Commodity Internet (e.g. FTTB, FTTH) and power grid} + \visible<4->{{\footnotesize (can die at any time)}}} + \vspace{1.5em} + \item<5-> \textcolor<7->{gray}{Regular Internet (e.g. FTTB, FTTH) and power grid connections\\ \vspace{.5em} - \item<5-> \textcolor<6->{gray}{\textbf{Geographical redundancy} (multi-site replication)} + \visible<6->{{\footnotesize (can be unavailable randomly)}}} + \vspace{1.5em} + \item<7-> \textbf{Geographical redundancy} (multi-site replication) \end{itemize} } \only<2>{ @@ -161,7 +148,7 @@ \includegraphics[width=.8\linewidth]{../assets/atuin.jpg} \end{center} } - \only<6>{ + \only<8>{ \begin{center} \includegraphics[width=.8\linewidth]{../assets/inframap_jdll2023.pdf} \end{center} @@ -171,22 +158,20 @@ \begin{frame} \frametitle{Object storage: a crucial component} \begin{center} - \includegraphics[height=6em]{../assets/Amazon-S3.jpg} + \includegraphics[height=6em]{../assets/logos/Amazon-S3.jpg} \hspace{3em} - \includegraphics[height=5em]{../assets/minio.png} + \visible<2->{\includegraphics[height=5em]{../assets/logos/minio.png}} \hspace{3em} - \includegraphics[height=6em]{../../logo/garage_hires_crop.png} + \visible<3>{\includegraphics[height=6em]{../../logo/garage_hires_crop.png}} \end{center} \vspace{1em} S3: a de-facto standard, many compatible applications \vspace{1em} - - MinIO is self-hostable but not suited for geo-distributed deployments + \visible<2->{MinIO is self-hostable but not suited for geo-distributed deployments} \vspace{1em} - - \textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store} + \visible<3->{\textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store}} \end{frame} \begin{frame} @@ -194,30 +179,30 @@ Consensus can be implemented reasonably well in practice, so why avoid it? \vspace{1em} \begin{itemize} - \item \textbf{Software complexity} + \item<2-> \textbf{Software complexity} \vspace{1em} - \item \textbf{Performance issues:} + \item<3-> \textbf{Performance issues:} \vspace{.5em} \begin{itemize} - \item The leader is a \textbf{bottleneck} for all requests\\ + \item<4-> The leader is a \textbf{bottleneck} for all requests\\ \vspace{.5em} - \item \textbf{Sensitive to higher latency} between nodes + \item<5-> \textbf{Sensitive to higher latency} between nodes \vspace{.5em} - \item \textbf{Takes time to reconverge} when disrupted (e.g. node going down) + \item<6-> \textbf{Takes time to reconverge} when disrupted (e.g. node going down) \end{itemize} \end{itemize} \vspace{2em} - $\to$ Garage uses only CRDTs internally (conflict-free replicated data types) + \visible<7->{\underline{Internally, Garage uses only CRDTs} (conflict-free replicated data types)} \end{frame} \begin{frame} \frametitle{The data model of object storage} - Object storage is basically a key-value store: - \vspace{1em} + Object storage is basically a \textbf{key-value store}: + \vspace{.5em} - {\footnotesize + {\scriptsize \begin{center} - \begin{tabular}{|l|p{8cm}|} + \begin{tabular}{|l|p{7cm}|} \hline \textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\ \hline @@ -242,28 +227,33 @@ } \vspace{1em} - Simple interface, compatible with many existing applications - - \vspace{1em} - Maps well to CRDT data types + \begin{itemize} + \item<2> Maps well to CRDT data types + \end{itemize} \end{frame} \begin{frame} \frametitle{Performance gains in practice} \begin{center} - \includegraphics[width=.8\linewidth]{../assets/endpoint_latency_0.7_0.8_minio.png} + \includegraphics[width=.8\linewidth]{../assets/perf/endpoint_latency_0.7_0.8_minio.png} \end{center} \end{frame} + +% ======================================== TIMELINE +% ======================================== TIMELINE +% ======================================== TIMELINE + +\section{Recent developments} + +% ====================== v0.7.0 =============================== + \begin{frame} - \frametitle{Timeline} \begin{center} \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf} \end{center} \end{frame} -% ====================== v0.7.0 =============================== - \begin{frame} \frametitle{April 2022 - Garage v0.7.0} Focus on \underline{observability and ecosystem integration} @@ -271,8 +261,7 @@ \begin{itemize} \item \textbf{Monitoring:} metrics and traces, using OpenTelemetry \vspace{1em} - \item Alternative replication modes with 1 or 2 copies,\\ - modes with weaker consistency + \item Replication modes with 1 or 2 copies / weaker consistency \vspace{1em} \item Kubernetes integration \vspace{1em} @@ -285,20 +274,26 @@ \begin{frame} \frametitle{Metrics (Prometheus + Grafana)} \begin{center} - \includegraphics[width=.9\linewidth]{../assets/grafana_dashboard.png} + \includegraphics[width=.9\linewidth]{../assets/screenshots/grafana_dashboard.png} \end{center} \end{frame} \begin{frame} \frametitle{Traces (Jaeger)} \begin{center} - \includegraphics[width=.8\linewidth]{../assets/jaeger_listobjects.png} + \includegraphics[width=.8\linewidth]{../assets/screenshots/jaeger_listobjects.png} \end{center} \end{frame} % ====================== v0.8.0 =============================== \begin{frame} + \begin{center} + \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf} + \end{center} +\end{frame} + +\begin{frame} \frametitle{November 2022 - Garage v0.8.0} Focus on \underline{performance} \vspace{2em} @@ -315,8 +310,8 @@ \begin{frame} \frametitle{About metadata DB engines} - Issues with Sled: - \vspace{2em} + \textbf{Issues with Sled:} + \vspace{1em} \begin{itemize} \item Huge files on disk \vspace{.5em} @@ -326,8 +321,9 @@ \vspace{.5em} \item Not actively maintained \end{itemize} + \vspace{2em} - LMDB: very stable, good performance, reasonably small files on disk + \textbf{LMDB:} very stable, good performance, reasonably small files on disk \vspace{1em} Sled will be removed in Garage v1.0 @@ -336,7 +332,7 @@ \begin{frame} \frametitle{DB engine performance comparison} \begin{center} - \includegraphics[width=.6\linewidth]{../assets/db_engine.png} + \includegraphics[width=.6\linewidth]{../assets/perf/db_engine.png} \end{center} NB: Sqlite was slow due to synchronous journaling mode, now configurable \end{frame} @@ -352,20 +348,26 @@ \begin{frame} \frametitle{TTFB benchmark} \begin{center} - \includegraphics[width=.8\linewidth]{../assets/ttfb.png} + \includegraphics[width=.8\linewidth]{../assets/perf/ttfb.png} \end{center} \end{frame} \begin{frame} \frametitle{Throughput benchmark} \begin{center} - \includegraphics[width=.7\linewidth]{../assets/io-0.7-0.8-minio.png} + \includegraphics[width=.7\linewidth]{../assets/perf/io-0.7-0.8-minio.png} \end{center} \end{frame} % ====================== v0.9.0 =============================== \begin{frame} + \begin{center} + \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf} + \end{center} +\end{frame} + +\begin{frame} \frametitle{October 2023 - Garage v0.9.0} Focus on \underline{streamlining \& usability} \vspace{2em} @@ -389,91 +391,72 @@ \begin{frame} \frametitle{Layout computation} - \begin{center} - \includegraphics[width=\linewidth]{../assets/location-aware.png} - \end{center} - \vspace{2em} - Garage replicates data on different zones when possible -\end{frame} - -\begin{frame} - \frametitle{Layout computation} + \begin{overprint} + \onslide<1> \begin{center} - \includegraphics[width=.8\linewidth]{../assets/map.png} + \includegraphics[width=\linewidth, trim=0 0 0 -4cm]{../assets/screenshots/garage_status_0.9_prod_zonehl.png} \end{center} -\end{frame} - -\begin{frame} - \frametitle{Optimal layout computation} + \onslide<2> \begin{center} - \includegraphics[width=.6\linewidth]{../assets/geodistrib_paper.png} + \includegraphics[width=.7\linewidth]{../assets/map.png} \end{center} + \end{overprint} + \vspace{1em} + Garage stores replicas on different zones when possible \end{frame} \begin{frame} \frametitle{What a "layout" is} - \textbf{A layout is a precomputed index table} - \vspace{2em} + \textbf{A layout is a precomputed index table:} + \vspace{1em} - \begin{center} - \begin{tabular}{|l|l|l|l|} - \hline - \textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\ - \hline - \hline - Partition 0 & Io (jupiter) & Drosera (atuin) & Courgette (neptune) \\ - \hline - Partition 1 & Datura (atuin) & Courgette (neptune) & Io (jupiter) \\ - \hline - Partition 2 & Io(jupiter) & Celeri (neptune) & Drosera (atuin) \\ - \hline - \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\ - \hline - Partition 255 & Concombre (neptune) & Io (jupiter) & Drosera (atuin) \\ - \hline - \end{tabular} - \end{center} + {\footnotesize + \begin{center} + \begin{tabular}{|l|l|l|l|} + \hline + \textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\ + \hline + \hline + Partition 0 & Io (jupiter) & Drosera (atuin) & Courgette (neptune) \\ + \hline + Partition 1 & Datura (atuin) & Courgette (neptune) & Io (jupiter) \\ + \hline + Partition 2 & Io(jupiter) & Celeri (neptune) & Drosera (atuin) \\ + \hline + \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\ + \hline + Partition 255 & Concombre (neptune) & Io (jupiter) & Drosera (atuin) \\ + \hline + \end{tabular} + \end{center} + } - \vspace{1em} + \vspace{2em} + \visible<2->{ The index table is built centrally using an optimal algorithm,\\ then propagated to all nodes + } \end{frame} \begin{frame} - \frametitle{The relationship between \emph{partition} and \emph{partition key}} - \begin{center} - \begin{tabular}{|l|l|l|l|} - \hline - \textbf{Partition key} & \textbf{Partition} & \textbf{Sort key} & \textbf{Value} \\ - \hline - \hline - \texttt{website} & Partition 12 & \texttt{index.html} & (file data) \\ - \hline - \texttt{website} & Partition 12 & \texttt{img/logo.svg} & (file data) \\ - \hline - \texttt{website} & Partition 12 &\texttt{download/index.html} & (file data) \\ - \hline - \hline - \texttt{backup} & Partition 42 & \texttt{borg/index.2822} & (file data) \\ - \hline - \texttt{backup} & Partition 42 & \texttt{borg/data/2/2329} & (file data) \\ - \hline - \texttt{backup} & Partition 42 & \texttt{borg/data/2/2680} & (file data) \\ - \hline - \hline - \texttt{private} & Partition 42 & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\ - \hline - \end{tabular} - \end{center} - \vspace{1em} - \textbf{To read or write an item:} hash partition key - \\ \hspace{5cm} $\to$ determine partition number (first 8 bits) - \\ \hspace{5cm} $\to$ find associated nodes + \frametitle{Optimal layout computation} + \begin{figure} + \center + \includegraphics[width=.6\linewidth]{../assets/geodistrib_paper.png} + \end{figure} \end{frame} + + % ====================== v0.10.0 =============================== \begin{frame} + \begin{center} + \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf} + \end{center} +\end{frame} + +\begin{frame} \frametitle{October 2023 - Garage v0.10.0 beta} Focus on \underline{consistency} \vspace{2em} @@ -482,386 +465,150 @@ \end{itemize} \end{frame} - - - -% -------------------------------------------------------------------- -% ================================================================================= -% ================================================================================= -% -------------------------------------------------------------------- -% -------------------------------------------------------------------- -% -------------------------------------------------------------------- -% ================================================================================= -% ================================================================================= -% -------------------------------------------------------------------- - - - - \begin{frame} - \frametitle{Two big problems} - \begin{enumerate} - \item \textbf{How to place data on different nodes?}\\ - \vspace{1em} - \underline{Constraints:} heterogeneous hardware\\ - \underline{Objective:} $n$ copies of everything, maximize usable capacity, maximize resilience\\ - \vspace{1em} - $\to$ the Dynamo model + optimization algorithms - \vspace{2em} - \item<2-> \textbf{How to guarantee consistency?}\\ + \frametitle{Working with weak consistency} + Not using consensus limits us to the following: + \vspace{2em} + \begin{itemize} + \item<2-> \textbf{Conflict-free replicated data types} (CRDT)\\ \vspace{1em} - \underline{Constraints:} slow network (geographical distance), node unavailability/crashes\\ - \underline{Objective:} maximize availability, read-after-write guarantee\\ + {\footnotesize Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\ + a map of \textbf{last-writer-wins registers} (each key is its own CRDT)} + \vspace{1.5em} + \item<3-> \textbf{Read-after-write consistency}\\ \vspace{1em} - $\to$ CRDTs, monotonicity, read and write quorums - \end{enumerate} + {\footnotesize Can be implemented using quorums on read and write operations} + \end{itemize} \end{frame} -\section{Problem 1: placing data} - -\begin{frame} - \frametitle{Key-value stores, upgraded: the Dynamo model} - \textbf{Two keys:} - \begin{itemize} - \item Partition key: used to divide data into partitions {\small (a.k.a.~shards)} - \item Sort key: used to identify items inside a partition - \end{itemize} +\begin{frame}[t] + \frametitle{CRDT read-after-write consistency using quorums} \vspace{1em} + {\small + \textbf{Property:} If node $A$ did an operation $write(x)$ and received an OK response,\\ + \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received OK,\\ + \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$. + } - \begin{center} - \begin{tabular}{|l|l|p{3cm}|} - \hline - \textbf{Partition key: bucket} & \textbf{Sort key: filename} & \textbf{Value} \\ - \hline - \hline - \texttt{website} & \texttt{index.html} & (file data) \\ - \hline - \texttt{website} & \texttt{img/logo.svg} & (file data) \\ - \hline - \texttt{website} & \texttt{download/index.html} & (file data) \\ - \hline - \hline - \texttt{backup} & \texttt{borg/index.2822} & (file data) \\ - \hline - \texttt{backup} & \texttt{borg/data/2/2329} & (file data) \\ - \hline - \texttt{backup} & \texttt{borg/data/2/2680} & (file data) \\ - \hline - \hline - \texttt{private} & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\ - \hline - \end{tabular} - \end{center} -\end{frame} - -\begin{frame} - \frametitle{Key-value stores, upgraded: the Dynamo model} - \begin{itemize} - \item Data with different partition keys is stored independently,\\ - on a different set of nodes\\ - \vspace{.5em} - $\to$ no easy way to list all partition keys\\ - $\to$ no cross-shard transactions\\ - \vspace{2em} - \item Placing data: hash the partition key, select nodes accordingly\\ - \vspace{.5em} - $\to$ distributed hash table (DHT) - \vspace{2em} - \item For a given value of the partition key, items can be listed using their sort keys - \end{itemize} + \vspace{1.5em} + \begin{overprint} + \onslide<2-9> + \begin{figure} + \centering + \footnotesize + \def\svgwidth{.7\textwidth} + \only<2>{\import{../assets/lattice/}{lattice1.pdf_tex}}% + \only<3>{\import{../assets/lattice/}{lattice2.pdf_tex}}% + \only<4>{\import{../assets/lattice/}{lattice3.pdf_tex}}% + \only<5>{\import{../assets/lattice/}{lattice4.pdf_tex}}% + \only<6>{\import{../assets/lattice/}{lattice5.pdf_tex}}% + \only<7>{\import{../assets/lattice/}{lattice6.pdf_tex}}% + \only<8>{\import{../assets/lattice/}{lattice7.pdf_tex}}% + \only<9>{\import{../assets/lattice/}{lattice8.pdf_tex}}% + \end{figure} + + \onslide<10> + \begin{minipage}{.10\textwidth} + ~ + \end{minipage} + \begin{minipage}{.40\textwidth} + \footnotesize + \textbf{Algorithm $write(x)$:} + \begin{enumerate} + \item Broadcast $write(x)$ to all nodes + \item Wait for $k > n/2$ nodes to reply OK + \item Return OK + \end{enumerate} + \end{minipage} + \begin{minipage}{.40\textwidth} + \footnotesize + \vspace{1em} + \textbf{Algorithm $read()$:} + \begin{enumerate} + \item Broadcast $read()$ to all nodes + \item Wait for $k > n/2$ nodes to reply\\ + with values $x_1, \dots, x_k$ + \item Return $x_1 \sqcup \dots \sqcup x_k$ + \end{enumerate} + \end{minipage} + \end{overprint} \end{frame} - \begin{frame} - \frametitle{Issues with consistent hashing} + \frametitle{A hard problem: layout changes} \begin{itemize} - \item Consistent hashing doesn't dispatch data based on geographical location of nodes - \vspace{1em} - \item<2-> Geographically aware adaptation, try 1:\\ - data quantities not well balanced between nodes - \vspace{1em} - \item<3-> Geographically aware adaptation, try 2:\\ - too many reshuffles when adding/removing nodes + \item We rely on quorums $k > n/2$ within each partition:\\ + $$n=3,~~~~~~~k\ge 2$$ + \item<2-> When rebalancing, the set of nodes responsible for a partition can change:\\ + $$\{A, B, C\} \to \{A, D, E\}$$ + \vspace{.01em} + \item<3-> During the rebalancing, $D$ and $E$ don't yet have the data,\\ + ~~~~~~~~~~~~~~~~~~~and $B$ and $C$ want to get rid of the data to free up space\\ + \vspace{1.2em} + $\to$ risk of inconsistency, \textbf{how to coordinate?} \end{itemize} \end{frame} - -\section{Problem 2: ensuring consistency} - \begin{frame} - \frametitle{Consensus vs weak consistency} - - \hspace{1em} - \begin{minipage}{7cm} - \textbf{Consensus-based systems:} - \vspace{1em} + \frametitle{Handling layout changes without losing consistency} + \begin{minipage}{.55\textwidth} \begin{itemize} - \item \textbf{Leader-based:} a leader is elected to coordinate - all reads and writes - \vspace{1em} - \item \textbf{Linearizability} of all operations\\ - (strongest consistency guarantee) + \item \textbf{Solution:}\\ + \vspace{.5em} + \begin{itemize} + \item keep track of data transfer to new nodes + \vspace{.5em} + \item use multiple write quorums\\ + (new nodes + old nodes\\ + while data transfer is in progress) + \vspace{.5em} + \item switching reads to new nodes\\ + only once copy is finished + \end{itemize} \vspace{1em} - \item Any sequential specification can be implemented as a \textbf{replicated state machine} + \item \textbf{Implemented} in v0.10 \vspace{1em} - \item \textbf{Costly}, the leader is a bottleneck; - leader elections on failure take time + \item \textbf{Validated} with Jepsen testing \end{itemize} \end{minipage} - \hfill - \begin{minipage}{7cm} \visible<2->{ - \textbf{Weakly consistent systems:} - \vspace{1em} - \begin{itemize} - \item \textbf{Nodes are equivalent}, any node - can originate a read or write operation - \vspace{1em} - \item \textbf{Read-after-write consistency} with quorums, - eventual consistency without - \vspace{1em} - \item \textbf{Operations have to commute}, i.e.~we - can only implement CRDTs - \vspace{1em} - \item \textbf{Fast}, no single bottleneck;\\ - works the same with offline nodes - \end{itemize} - } \end{minipage} - \hspace{1em} -\end{frame} - -\begin{frame} - \frametitle{Consensus vs weak consistency} - \begin{center} - \textbf{From a theoretical point of view:}\\ - - \end{center} - \vspace{2em} - - \hspace{1em} - \begin{minipage}{6.5cm} - \underline{Consensus-based systems:} - - \vspace{1em} - - Require \textbf{additional assumptions} such as a fault detector or a strong RNG\\ - (FLP impossibility theorem) + \begin{minipage}{.23\textwidth} + \includegraphics[width=3cm]{../assets/jepsen-0.9.png}\\ + {\footnotesize Garage v0.9.0} \end{minipage} - \hfill - \begin{minipage}{6.5cm} - \underline{Weakly consistent systems:} - - \vspace{1em} - - Can be implemented in \textbf{any\\asynchronous message passing\\distributed system} with node crashes + \begin{minipage}{.2\textwidth} + \includegraphics[width=3cm]{../assets/jepsen-0.10.png}\\ + {\footnotesize Garage v0.10 beta} \end{minipage} - \hspace{1em} - - \vspace{3em} - \begin{center} - They represent \textbf{different classes of computational capability}\\ - \end{center} \end{frame} +% ====================== v0.10.0 =============================== + \begin{frame} - \frametitle{Consensus vs weak consistency} \begin{center} - \textbf{The same objects cannot be implemented in both models.} + \includegraphics[width=.8\linewidth]{../assets/timeline-22-24.pdf} \end{center} - \vspace{2em} - - \hspace{1em} - \begin{minipage}{6.5cm} - \underline{Consensus-based systems:} - - \vspace{1em} - - \textbf{Any sequential specification}\\~ - - \vspace{1em} - \textbf{Easier to program for}: just write your program as if it were sequential on a single machine - - \end{minipage} - \hfill - \begin{minipage}{6.5cm} - \underline{Weakly consistent systems:} - - \vspace{1em} - - \textbf{Only CRDTs}\\(conflict-free replicated data types) - - \vspace{1em} - Part of the complexity is \textbf{reported to the consumer of the API}\\~ - \end{minipage} - \hspace{1em} -\end{frame} - -\begin{frame} - \frametitle{Understanding the power of consensus} - \textbf{Consensus:} an API with a single operation, $propose(x)$ - \begin{enumerate} - \item nodes all call $propose(x)$ with their proposed value; - \item nodes all receive the same value as a return value, which is one of the proposed values - \end{enumerate} - \vspace{1em} - - \visible<2->{ - \textbf{Equivalent to} a distributed algorithm that gives a total order on all requests - } - \vspace{1em} - - \visible<3->{ - \textbf{Implemented by} this simple replicated state machine: - \vspace{.5em} - \begin{figure} - \centering - \def\svgwidth{.5\textwidth} - \large - \import{assets/}{consensus.pdf_tex} - \end{figure} - \vspace{1em} - } \end{frame} \begin{frame} - \frametitle{Can my object be implemented without consensus?} - \underline{Given the specification of an API:} + \frametitle{Towards v1.0} + Focus on \underline{security \& stability} \vspace{2em} \begin{itemize} - \item \textbf{Using this API, we can implement the consensus object} (the $propose$ function)\\ - $\to$ the API is equivalent to consensus/total ordering of messages\\ - $\to$ the API cannot be implemented in a weakly consistent system - \vspace{2em} - \item<2-> \textbf{This API can be implemented using only weak primitives}\\ - (e.g. in the asynchronous message passing model with no further assumption)\\ - $\to$ the API is strictly weaker than consensus\\ - $\to$ we can implement it in Garage! - \end{itemize} -\end{frame} - - -\begin{frame} - \frametitle{What can we implement without consensus?} - \begin{itemize} - \item Any \textbf{conflict-free replicated data type} (CRDT) + \item \textbf{Security audit} in progress by Radically Open Security \vspace{1em} - \item<2-> Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\ - a map of \textbf{last-writer-wins registers} (each key is its own CRDT) + \item Misc. S3 features (SSE-C, ...) and compatibility fixes \vspace{1em} - \item<3-> \textbf{Read-after-write consistency} can be implemented - using quorums on read and write operations + \item Improve UX \vspace{1em} - \item<4-> \textbf{Monotonicity of reads} can be implemented with repair-on-read\\ - (makes reads more costly, not implemented in Garage) + \item Fix bugs \end{itemize} \end{frame} -\begin{frame} - \frametitle{CRDTs and quorums: read-after-write consistency} - \begin{figure} - \centering - \def\svgwidth{.8\textwidth} - \only<1>{\import{assets/}{lattice1.pdf_tex}}% - \only<2>{\import{assets/}{lattice2.pdf_tex}}% - \only<3>{\import{assets/}{lattice3.pdf_tex}}% - \only<4>{\import{assets/}{lattice4.pdf_tex}}% - \only<5>{\import{assets/}{lattice5.pdf_tex}}% - \only<6>{\import{assets/}{lattice6.pdf_tex}}% - \only<7>{\import{assets/}{lattice7.pdf_tex}}% - \only<8>{\import{assets/}{lattice8.pdf_tex}}% - \end{figure} -\end{frame} - -\begin{frame} - \frametitle{CRDTs and quorums: read-after-write consistency} - \textbf{Property:} If node $A$ did an operation $write(x)$ and received an OK response,\\ - \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received OK,\\ - \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$. - - \vspace{1em} - - \hspace{1em} - \begin{minipage}{6.8cm} - \textbf{Algorithm $write(x)$:} - \begin{enumerate} - \item Broadcast $write(x)$ to all nodes - \item Wait for $k > n/2$ nodes to reply OK - \item Return OK - \end{enumerate} - \end{minipage} - \hfill - \begin{minipage}{6.8cm} - \vspace{1em} - \textbf{Algorithm $read()$:} - \begin{enumerate} - \item Broadcast $read()$ to all nodes - \item Wait for $k > n/2$ nodes to reply\\ - with values $x_1, \dots, x_k$ - \item Return $x_1 \sqcup \dots \sqcup x_k$ - \end{enumerate} - \end{minipage} - \hspace{1em} - - \vspace{2em} - \textbf{Why does it work?} There is at least one node at the intersection between the two sets of nodes that replied to each request, that ``saw'' $x$ before the $read()$ started ($x_i \sqsupseteq x$). -\end{frame} - -\begin{frame} - \frametitle{CRDTs and quorums: monotonic-reads consistency} - \begin{figure} - \centering - \def\svgwidth{.8\textwidth} - \only<1>{\import{assets/}{latticeB_1.pdf_tex}}% - \only<2>{\import{assets/}{latticeB_2.pdf_tex}}% - \only<3>{\import{assets/}{latticeB_3.pdf_tex}}% - \only<4>{\import{assets/}{latticeB_4.pdf_tex}}% - \only<5>{\import{assets/}{latticeB_5.pdf_tex}}% - \only<6>{\import{assets/}{latticeB_6.pdf_tex}}% - \only<7>{\import{assets/}{latticeB_7.pdf_tex}}% - \only<8>{\import{assets/}{latticeB_8.pdf_tex}}% - \only<9>{\import{assets/}{latticeB_9.pdf_tex}}% - \only<10>{\import{assets/}{latticeB_10.pdf_tex}}% - \end{figure} -\end{frame} - -\begin{frame} - \frametitle{CRDTs and quorums: monotonic-reads consistency} - \textbf{Property:} If node $A$ did an operation $read()$ and received $x$ as a response,\\ - \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received $x$,\\ - \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$. - - \vspace{1em} - - \textbf{Algorithm $monotonic\_read()$:} {\small (a.k.a. repair-on-read)} - \begin{enumerate} - \item Broadcast $read()$ to all nodes - \item Wait for $k > n/2$ nodes to reply with values $x_1, \dots, x_k$ - \item If $x_i \ne x_j$ for some nodes $i$ and $j$,\\ - \hspace{1cm}then call $write(x_1 \sqcup \dots \sqcup x_k)$ and wait for OK from $k' > n/2$ nodes - \item Return $x_1 \sqcup \dots \sqcup x_k$ - \end{enumerate} - - \vspace{1em} - - This makes reads slower in some cases, and is \textbf{not implemented in Garage}. -\end{frame} - -\begin{frame} - \frametitle{A hard problem: layout changes} - \begin{itemize} - \item We rely on quorums $k > n/2$ within each partition:\\ - $$n=3,~~~~~~~k\ge 2$$ - \item<2-> When rebalancing, the set of nodes responsible for a partition can change:\\ - $$\{n_A, n_B, n_C\} \to \{n_A, n_D, n_E\}$$ - \vspace{.01em} - \item<3-> During the rebalancing, $D$ and $E$ don't yet have the data,\\ - ~~~~~~~~~~~~~~~~~~~and $B$ and $C$ want to get rid of the data to free up space\\ - \vspace{.2em} - $\to$ quorums only within the new set of nodes don't work\\ - $\to$ how to coordinate? \textbf{currently, we don't...} +% ======================================== OPERATING +% ======================================== OPERATING +% ======================================== OPERATING - \end{itemize} -\end{frame} \section{Operating big Garage clusters} @@ -869,9 +616,9 @@ \frametitle{Operating Garage} \begin{center} \only<1-2>{ - \includegraphics[width=.9\linewidth]{assets/scr_garage_status.png} + \includegraphics[width=.9\linewidth]{../assets/screenshots/garage_status_0.10.png} \\\vspace{1em} - \visible<2>{\includegraphics[width=.85\linewidth]{assets/scr_garage_status_broken.png}} + \visible<2>{\includegraphics[width=.9\linewidth]{../assets/screenshots/garage_status_unhealthy_0.10.png}} } \end{center} \end{frame} @@ -879,17 +626,17 @@ \begin{frame} \frametitle{Garage's architecture} \begin{center} - \only<1>{\includegraphics[width=.45\linewidth]{assets/garage.drawio.pdf}}% - \only<2>{\includegraphics[width=.6\linewidth]{assets/garage_sync.drawio.pdf}}% + \only<1>{\includegraphics[width=.45\linewidth]{../assets/garage.drawio.pdf}}% + \only<2>{\includegraphics[width=.6\linewidth]{../assets/garage_sync.drawio.pdf}}% \end{center} \end{frame} \begin{frame} \frametitle{Digging deeper} \begin{center} - \only<1>{\includegraphics[width=.9\linewidth]{assets/scr_garage_stats.png}} - \only<2>{\includegraphics[width=.6\linewidth]{assets/scr_garage_worker_list.png}} - \only<3>{\includegraphics[width=.6\linewidth]{assets/scr_garage_worker_get.png}} + \only<1>{\includegraphics[width=.9\linewidth]{../assets/screenshots/garage_stats_0.10.png}} + \only<2>{\includegraphics[width=.5\linewidth]{../assets/screenshots/garage_worker_list_0.10.png}} + \only<3>{\includegraphics[width=.6\linewidth]{../assets/screenshots/garage_worker_param_0.10.png}} \end{center} \end{frame} @@ -910,7 +657,6 @@ \begin{itemize} \item Lots of small files on disk \item Processing the resync queue can be slow - \item Multi-HDD support not yet released (soon!) \end{itemize} \end{itemize} \end{frame} @@ -926,7 +672,7 @@ \vspace{.5em} \item Data block storage: \begin{itemize} - \item Wait for v0.9 with multi-HDD support + \item Use Garage's native multi-HDD support \item XFS on individual drives \item Increase block size (1MB $\to$ 10MB, requires more RAM and good networking) \item Tune \texttt{resync-tranquility} and \texttt{resync-worker-count} dynamically @@ -943,146 +689,10 @@ Current deployments: $< 10$ TB, we don't have much experience with more \end{frame} -\section{Going further than the S3 API} - -\begin{frame} - \frametitle{Using Garage for everything} - \begin{center} - \only<1>{\includegraphics[width=.8\linewidth]{assets/slideB1.png}}% - \only<2>{\includegraphics[width=.8\linewidth]{assets/slideB2.png}}% - \only<3>{\includegraphics[width=.8\linewidth]{assets/slideB3.png}}% - \end{center} -\end{frame} - -\begin{frame} - \frametitle{K2V Design} - \begin{itemize} - \item A new, custom, minimal API\\ - \vspace{.5em} - \begin{itemize} - \item Single-item operations - \item Operations on ranges and batches of items - \item Polling operations to help implement a PubSub pattern - \end{itemize} - \vspace{1em} - \item<2-> Exposes the partitoning mechanism of Garage\\ - K2V = partition key / sort key / value (like Dynamo) - \vspace{1em} - \item<3-> Weakly consistent, CRDT-friendly\\ - $\to$ no support for transactions (not ACID) - \vspace{1em} - \item<4-> Cryptography-friendly: values are binary blobs - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Handling concurrent values} - \textbf{How to handle concurrency?} Example: - \vspace{1em} - \begin{enumerate} - \item Client $A$ reads the initial value of a key, $x_0$ - \vspace{1em} - \item<2-> Client $B$ also reads the initial value $x_0$ of that key - \vspace{1em} - \item<3-> Client $A$ modifies $x_0$, and writes a new value $x_1$ - \vspace{1em} - \item<4-> Client $B$ also modifies $x_0$, and writes a new value $x'_1$,\\ - without having a chance to first read $x_1$\\ - \vspace{1em} - $\to$ what should the final state be? - \end{enumerate} -\end{frame} - -\begin{frame} - \frametitle{Handling concurrent values} - \begin{itemize} - \item If we keep only $x_1$ or $x'_1$, we risk \textbf{loosing application data} - \vspace{1.5em} - \item<2-> Values are opaque binary blobs, \textbf{K2V cannot resolve conflicts} by itself\\ - (e.g. by implementing a CRDT) - \vspace{1.5em} - \item<3-> Solution: \textbf{we keep both!}\\ - $\to$ the value of the key is now $\{x_1, x'_1\}$\\ - $\to$ the client application can decide how to resolve conflicts on the next read - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Keeping track of causality} - How does K2V know that $x_1$ and $x'_1$ are concurrent? - \vspace{1em} - \begin{itemize} - \item $read()$ returns \textbf{a set of values} and an associated \textbf{causality token}\\ - \vspace{1.5em} - \item<2-> When calling $write()$, the client sends \textbf{the causality token from its last read} - \vspace{1.5em} - \item<3-> The causality token represents the set of values \textbf{already seen by the client}\\ - $\to$ those values are the \textbf{causal past} of the write operation\\ - $\to$ K2V can keep concurrent values and overwrite all ones in the causal past - \vspace{1.5em} - \item<4-> Internally, the causality token is \textbf{a vector clock} - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Application: an e-mail storage server} - \begin{center} - \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme.png}}% - \end{center} -\end{frame} - -\begin{frame} - \frametitle{Aerogramme data model} - \begin{center} - \only<1->{\includegraphics[width=.4\linewidth]{assets/aerogramme_datatype.drawio.pdf}}% - \end{center} - \visible<2->{Aerogramme encrypts all stored values for privacy\\ - (Garage server administrators can't read your mail)} -\end{frame} - -\begin{frame} - \frametitle{Different deployment scenarios} - \begin{center} - \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components1.drawio.pdf}}% - \only<2>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components2.drawio.pdf}}% - \end{center} -\end{frame} - -\begin{frame} - \frametitle{A new model for building resilient software} - How to build an application using only Garage as a data store: - \vspace{1em} - \begin{enumerate} - \item Design a data model suited to K2V\\ - {\footnotesize (see Cassandra docs on porting SQL data models to Cassandra)} - \vspace{1em} - \begin{itemize} - \item Use CRDTs or other eventually consistent data types (see e.g. Bayou) - \vspace{1em} - \item Store opaque binary blobs to provide End-to-End Encryption\\ - \end{itemize} - \vspace{1em} - \item<2-> Store big blobs (files) using the S3 API - \vspace{1em} - \item<3-> Let Garage manage sharding, replication, failover, etc. - \end{enumerate} -\end{frame} - -\section{Conclusion} -\begin{frame} - \frametitle{Perspectives} - \begin{itemize} - \item Fix the consistency issue when rebalancing - \vspace{1em} - \item Write about Garage's architecture and properties,\\ - and about our proposed architecture for (E2EE) apps over K2V+S3 - \vspace{1em} - \item Continue developing Garage; finish Aerogramme; build new applications... - \vspace{1em} - \item Anything else? - \end{itemize} -\end{frame} +% ======================================== END +% ======================================== END +% ======================================== END \begin{frame} \frametitle{Where to find us} @@ -1094,8 +704,8 @@ \texttt{\#garage:deuxfleurs.fr} on Matrix \vspace{1.5em} - \includegraphics[width=.06\linewidth]{assets/rust_logo.png} - \includegraphics[width=.13\linewidth]{assets/AGPLv3_Logo.png} + \includegraphics[width=.06\linewidth]{../assets/logos/rust_logo.png} + \includegraphics[width=.13\linewidth]{../assets/logos/AGPLv3_Logo.png} \end{center} \end{frame} |