aboutsummaryrefslogtreecommitdiff
path: root/doc/talks/2023-01-18-tocatta/talk.tex
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-01-12 13:25:09 +0100
committerAlex Auvolat <alex@adnab.me>2023-01-12 13:25:09 +0100
commit7416ba97ef8e5f9592c32dae6caaf46c1dbd7610 (patch)
tree4e3bef6c3faa938a7808cb351bfdf7aa8a5b4e67 /doc/talks/2023-01-18-tocatta/talk.tex
parent94d723f27cea7aa58d11af3e18a165283b40f19a (diff)
downloadgarage-7416ba97ef8e5f9592c32dae6caaf46c1dbd7610.tar.gz
garage-7416ba97ef8e5f9592c32dae6caaf46c1dbd7610.zip
Talk 2023-01-18 WIP
Diffstat (limited to 'doc/talks/2023-01-18-tocatta/talk.tex')
-rw-r--r--doc/talks/2023-01-18-tocatta/talk.tex623
1 files changed, 623 insertions, 0 deletions
diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex
new file mode 100644
index 00000000..566f56ec
--- /dev/null
+++ b/doc/talks/2023-01-18-tocatta/talk.tex
@@ -0,0 +1,623 @@
+%\nonstopmode
+\documentclass[aspectratio=169]{beamer}
+\usepackage[utf8]{inputenc}
+% \usepackage[frenchb]{babel}
+\usepackage{amsmath}
+\usepackage{mathtools}
+\usepackage{breqn}
+\usepackage{multirow}
+\usetheme{boxes}
+\usepackage{graphicx}
+\usepackage{adjustbox}
+%\useoutertheme[footline=authortitle,subsection=false]{miniframes}
+%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes}
+\useoutertheme{infolines}
+\setbeamertemplate{headline}{}
+
+\beamertemplatenavigationsymbolsempty
+
+\definecolor{TitleOrange}{RGB}{255,137,0}
+\setbeamercolor{title}{fg=TitleOrange}
+\setbeamercolor{frametitle}{fg=TitleOrange}
+
+\definecolor{ListOrange}{RGB}{255,145,5}
+\setbeamertemplate{itemize item}{\color{ListOrange}$\blacktriangleright$}
+
+\definecolor{verygrey}{RGB}{70,70,70}
+\setbeamercolor{normal text}{fg=verygrey}
+
+
+\usepackage{tabu}
+\usepackage{multicol}
+\usepackage{vwcol}
+\usepackage{stmaryrd}
+\usepackage{graphicx}
+
+\usepackage[normalem]{ulem}
+
+\AtBeginSection[]{
+ \begin{frame}
+ \vfill
+ \centering
+ \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
+ \usebeamerfont{title}\insertsectionhead\par%
+ \end{beamercolorbox}
+ \vfill
+ \end{frame}
+}
+
+\title{Garage}
+\subtitle{a lightweight and robust geo-distributed data storage system}
+\author{Deuxfleurs Association}
+\date{Inria, 2023-01-18}
+
+\begin{document}
+
+\begin{frame}
+ \centering
+ \includegraphics[width=.3\linewidth]{../../sticker/Garage.pdf}
+ \vspace{1em}
+
+ {\large\bf Deuxfleurs Association}
+ \vspace{1em}
+
+ \url{https://garagehq.deuxfleurs.fr/}
+
+ Matrix channel: \texttt{\#garage:deuxfleurs.fr}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Who we are}
+ \begin{columns}[t]
+ \begin{column}{.2\textwidth}
+ \centering
+ \adjincludegraphics[width=.4\linewidth, valign=t]{assets/alex.jpg}
+ \end{column}
+ \begin{column}{.6\textwidth}
+ \textbf{Alex Auvolat}\\
+ PhD; co-founder of Deuxfleurs
+ \end{column}
+ \begin{column}{.2\textwidth}
+ ~
+ \end{column}
+ \end{columns}
+ \vspace{1em}
+
+ \begin{columns}[t]
+ \begin{column}{.2\textwidth}
+ ~
+ \end{column}
+ \begin{column}{.6\textwidth}
+ \textbf{Quentin Dufour}\\
+ PhD; co-founder of Deuxfleurs
+ \end{column}
+ \begin{column}{.2\textwidth}
+ \centering
+ \adjincludegraphics[width=.5\linewidth, valign=t]{assets/quentin.jpg}
+ \end{column}
+ \end{columns}
+ \vspace{2em}
+
+ \begin{columns}[t]
+ \begin{column}{.2\textwidth}
+ \centering
+ \adjincludegraphics[width=.5\linewidth, valign=t]{assets/deuxfleurs.pdf}
+ \end{column}
+ \begin{column}{.6\textwidth}
+ \textbf{Deuxfleurs}\\
+ A non-profit self-hosting collective,\\
+ member of the CHATONS network
+ \end{column}
+ \begin{column}{.2\textwidth}
+ \centering
+ \adjincludegraphics[width=.7\linewidth, valign=t]{assets/logo_chatons.png}
+ \end{column}
+ \end{columns}
+
+\end{frame}
+
+\begin{frame}
+ \frametitle{Our objective at Deuxfleurs}
+
+ \begin{center}
+ \textbf{Promote self-hosting and small-scale hosting\\
+ as an alternative to large cloud providers}
+ \end{center}
+ \vspace{2em}
+ \visible<2->{
+ Why is it hard?
+ }
+ \visible<3->{
+ \vspace{2em}
+ \begin{center}
+ \textbf{\underline{Resilience}}\\
+ {\footnotesize (we want good uptime/availability with low supervision)}
+ \end{center}
+ }
+\end{frame}
+
+\begin{frame}
+ \frametitle{How to make a \underline{stable} system}
+
+ Enterprise-grade systems typically employ:
+ \vspace{1em}
+ \begin{itemize}
+ \item RAID
+ \item Redundant power grid + UPS
+ \item Redundant Internet connections
+ \item Low-latency links
+ \item ...
+ \end{itemize}
+ \vspace{1em}
+ $\to$ it's costly and only worth it at DC scale
+\end{frame}
+
+\begin{frame}
+ \frametitle{How to make a \underline{resilient} system}
+
+ \only<1,4-5>{
+ Instead, we use:
+ \vspace{1em}
+ \begin{itemize}
+ \item \textcolor<2->{gray}{Commodity hardware (e.g. old desktop PCs)}
+ \vspace{.5em}
+ \item<4-> \textcolor<5->{gray}{Commodity Internet (e.g. FTTB, FTTH) and power grid}
+ \vspace{.5em}
+ \item<5-> \textcolor<6->{gray}{\textbf{Geographical redundancy} (multi-site replication)}
+ \end{itemize}
+ }
+ \only<2>{
+ \begin{center}
+ \includegraphics[width=.8\linewidth]{assets/atuin.jpg}
+ \end{center}
+ }
+ \only<3>{
+ \begin{center}
+ \includegraphics[width=.8\linewidth]{assets/neptune.jpg}
+ \end{center}
+ }
+ \only<6>{
+ \begin{center}
+ \includegraphics[width=.5\linewidth]{assets/inframap.jpg}
+ \end{center}
+ }
+\end{frame}
+
+\begin{frame}
+ \frametitle{How to make this happen}
+ \begin{center}
+ \only<1>{\includegraphics[width=.8\linewidth]{assets/slide1.png}}%
+ \only<2>{\includegraphics[width=.8\linewidth]{assets/slide2.png}}%
+ \only<3>{\includegraphics[width=.8\linewidth]{assets/slide3.png}}%
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Distributed file systems are slow}
+ File systems are complex, for example:
+ \vspace{1em}
+ \begin{itemize}
+ \item Concurrent modification by several processes
+ \vspace{1em}
+ \item Folder hierarchies
+ \vspace{1em}
+ \item Other requirements of the POSIX spec
+ \end{itemize}
+ \vspace{1em}
+ Coordination in a distributed system is costly
+
+ \vspace{1em}
+ Costs explode with commodity hardware / Internet connections\\
+ {\small (we experienced this!)}
+\end{frame}
+
+\begin{frame}
+ \frametitle{A simpler solution: object storage}
+ Only two operations:
+ \vspace{1em}
+ \begin{itemize}
+ \item Put an object at a key
+ \vspace{1em}
+ \item Retrieve an object from its key
+ \end{itemize}
+ \vspace{1em}
+ {\footnotesize (and a few others)}
+
+ \vspace{1em}
+ Sufficient for many applications!
+\end{frame}
+
+\begin{frame}
+ \frametitle{A simpler solution: object storage}
+ \begin{center}
+ \includegraphics[height=6em]{../2020-12-02_wide-team/img/Amazon-S3.jpg}
+ \hspace{3em}
+ \includegraphics[height=5em]{assets/minio.png}
+ \hspace{3em}
+ \includegraphics[height=6em]{../../logo/garage_hires_crop.png}
+ \end{center}
+ \vspace{1em}
+ S3: a de-facto standard, many compatible applications
+
+ \vspace{1em}
+
+ MinIO is self-hostable but not suited for geo-distributed deployments
+
+ \vspace{1em}
+
+ \textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store}
+\end{frame}
+
+
+\begin{frame}
+ \frametitle{The data model of object storage}
+ Object storage is basically a key-value store:
+ \vspace{1em}
+
+ \begin{center}
+ \begin{tabular}{|l|p{8cm}|}
+ \hline
+ \textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\
+ \hline
+ \hline
+ \texttt{index.html} &
+ \texttt{Content-Type: text/html; charset=utf-8} \newline
+ \texttt{Content-Length: 24929} \newline
+ \texttt{<binary blob>} \\
+ \hline
+ \texttt{img/logo.svg} &
+ \texttt{Content-Type: text/svg+xml} \newline
+ \texttt{Content-Length: 13429} \newline
+ \texttt{<binary blob>} \\
+ \hline
+ \texttt{download/index.html} &
+ \texttt{Content-Type: text/html; charset=utf-8} \newline
+ \texttt{Content-Length: 26563} \newline
+ \texttt{<binary blob>} \\
+ \hline
+ \end{tabular}
+ \end{center}
+
+\end{frame}
+
+\begin{frame}
+ \frametitle{Two big problems}
+ \begin{enumerate}
+ \item \textbf{How to place data on different nodes?}\\
+ \vspace{1em}
+ \underline{Constraints:} heterogeneous hardware\\
+ \underline{Objective:} $n$ copies of everything, maximize usable capacity, maximize resilience\\
+ \vspace{1em}
+ $\to$ the Dynamo model + optimization algorithms
+ \vspace{2em}
+ \item<2-> \textbf{How to guarantee consistency?}\\
+ \vspace{1em}
+ \underline{Constraints:} slow network (geographical distance), node unavailability/crashes\\
+ \underline{Objective:} maximize availability, read-after-write guarantee\\
+ \vspace{1em}
+ $\to$ CRDTs, monotonicity, read and write quorums
+ \end{enumerate}
+\end{frame}
+
+\section{Problem 1: placing data}
+
+\begin{frame}
+ \frametitle{Key-value stores, upgraded: the Dynamo model}
+ \textbf{Two keys:}
+ \begin{itemize}
+ \item Partition key: used to divide data into partitions (shards)
+ \item Sort key: used to identify items inside a partition
+ \end{itemize}
+
+ \vspace{1em}
+
+ \begin{center}
+ \begin{tabular}{|l|l|p{3cm}|}
+ \hline
+ \textbf{Partition key: bucket} & \textbf{Sort key: filename} & \textbf{Value} \\
+ \hline
+ \hline
+ \texttt{website} & \texttt{index.html} & (file data) \\
+ \hline
+ \texttt{website} & \texttt{img/logo.svg} & (file data) \\
+ \hline
+ \texttt{website} & \texttt{download/index.html} & (file data) \\
+ \hline
+ \hline
+ \texttt{backup} & \texttt{borg/index.2822} & (file data) \\
+ \hline
+ \texttt{backup} & \texttt{borg/data/2/2329} & (file data) \\
+ \hline
+ \texttt{backup} & \texttt{borg/data/2/2680} & (file data) \\
+ \hline
+ \hline
+ \texttt{private} & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\
+ \hline
+ \end{tabular}
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Key-value stores, upgraded: the Dynamo model}
+ \begin{itemize}
+ \item Data with different partition keys is stored independantly,\\
+ on a different set of nodes\\
+ \vspace{.5em}
+ $\to$ no easy way to list all partition keys\\
+ $\to$ no cross-shard transactions\\
+ \vspace{2em}
+ \item Placing data: hash the partition key, select nodes accordingly\\
+ \vspace{.5em}
+ $\to$ distributed hash table (DHT)
+ \vspace{2em}
+ \item For a given value of the partition key, items can be listed using their sort keys
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{How to spread files over different cluster nodes?}
+ \textbf{Consistent hashing (Dynamo):}
+ \vspace{1em}
+
+ \begin{center}
+ \only<1>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_1.pdf}}%
+ \only<2>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_2.pdf}}%
+ \only<3>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_3.pdf}}%
+ \only<4>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_4.pdf}}%
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Constraint: location-awareness}
+ \begin{center}
+ \includegraphics[width=\linewidth]{assets/location-aware.png}
+ \end{center}
+ \vspace{2em}
+ Garage replicates data on different zones when possible
+\end{frame}
+
+\begin{frame}
+ \frametitle{Constraint: location-awareness}
+ \begin{center}
+ \includegraphics[width=.8\linewidth]{assets/map.png}
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Issues with consistent hashing}
+ \begin{itemize}
+ \item Consistent hashing doesn't dispatch data based on geographical location of nodes
+ \vspace{1em}
+ \item<2-> Geographically aware adaptation, try 1:\\
+ data quantities not well balanced between nodes
+ \vspace{1em}
+ \item<3-> Geographically aware adaptation, try 2:\\
+ too many reshuffles when adding/removing nodes
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{How to spread files over different cluster nodes?}
+ \textbf{Garage's method: build an index table}
+ \vspace{1em}
+
+ Realization: we can actually precompute an optimal solution
+ \vspace{1em}
+
+ \visible<2->{
+ \begin{center}
+ \begin{tabular}{|l|l|l|l|}
+ \hline
+ \textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\
+ \hline
+ \hline
+ Partition 0 & Io (jupiter) & Drosera (atuin) & Courgette (neptune) \\
+ \hline
+ Partition 1 & Datura (atuin) & Courgette (neptune) & Io (jupiter) \\
+ \hline
+ Partition 2 & Io(jupiter) & Celeri (neptune) & Drosera (atuin) \\
+ \hline
+ \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\
+ \hline
+ Partition 255 & Concombre (neptune) & Io (jupiter) & Drosera (atuin) \\
+ \hline
+ \end{tabular}
+ \end{center}
+ }
+ \vspace{1em}
+ \visible<3->{
+ The index table is built centrally using an optimal algorithm,\\
+ then propagated to all nodes
+ }
+\end{frame}
+
+\begin{frame}
+ \frametitle{The relationship between \emph{partition} and \emph{partition key}}
+ \begin{center}
+ \begin{tabular}{|l|l|l|l|}
+ \hline
+ \textbf{Partition key} & \textbf{Partition} & \textbf{Sort key} & \textbf{Value} \\
+ \hline
+ \hline
+ \texttt{website} & Partition 12 & \texttt{index.html} & (file data) \\
+ \hline
+ \texttt{website} & Partition 12 & \texttt{img/logo.svg} & (file data) \\
+ \hline
+ \texttt{website} & Partition 12 &\texttt{download/index.html} & (file data) \\
+ \hline
+ \hline
+ \texttt{backup} & Partition 42 & \texttt{borg/index.2822} & (file data) \\
+ \hline
+ \texttt{backup} & Partition 42 & \texttt{borg/data/2/2329} & (file data) \\
+ \hline
+ \texttt{backup} & Partition 42 & \texttt{borg/data/2/2680} & (file data) \\
+ \hline
+ \hline
+ \texttt{private} & Partition 42 & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\
+ \hline
+ \end{tabular}
+ \end{center}
+ \vspace{1em}
+ \textbf{To read or write an item:} hash partition key
+ \\ \hspace{5cm} $\to$ determine partition number (first 8 bits)
+ \\ \hspace{5cm} $\to$ find associated nodes
+\end{frame}
+
+\begin{frame}
+ \frametitle{Garage's internal data structures}
+ \centering
+ \includegraphics[width=.75\columnwidth]{assets/garage_tables.pdf}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Storing and retrieving files}
+ \begin{center}
+ \only<1>{\includegraphics[width=.45\linewidth]{assets/garage2a.drawio.pdf}}%
+ \only<2>{\includegraphics[width=.45\linewidth]{assets/garage2b.drawio.pdf}}%
+ \end{center}
+\end{frame}
+
+\section{Problem 2: ensuring consistency}
+
+%\begin{frame}
+% \frametitle{Garage's architecture}
+% \begin{center}
+% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf}
+% \end{center}
+%\end{frame}
+
+\begin{frame}
+ \frametitle{Garage is \emph{coordination-free}:}
+ \begin{itemize}
+ \item No Raft or Paxos
+ \vspace{1em}
+ \item Internal data types are CRDTs
+ \vspace{1em}
+ \item All nodes are equivalent (no master/leader/index node)
+ \end{itemize}
+ \vspace{2em}
+ $\to$ less sensitive to higher latencies between nodes
+\end{frame}
+
+\begin{frame}
+ \frametitle{Consistency model}
+ \begin{itemize}
+ \item Not ACID (not required by S3 spec) / not linearizable
+ \vspace{1em}
+ \item \textbf{Read-after-write consistency}\\
+ {\footnotesize (stronger than eventual consistency)}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Impact on performances}
+ \begin{center}
+ \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png}
+ \end{center}
+\end{frame}
+
+
+\begin{frame}
+ \frametitle{An ever-increasing compatibility list}
+ \begin{center}
+ \includegraphics[width=.7\linewidth]{assets/compatibility.png}
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Further plans for Garage}
+ \begin{center}
+ \only<1>{\includegraphics[width=.8\linewidth]{assets/slideB1.png}}%
+ \only<2>{\includegraphics[width=.8\linewidth]{assets/slideB2.png}}%
+ \only<3>{\includegraphics[width=.8\linewidth]{assets/slideB3.png}}%
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{K2V Design}
+ \begin{itemize}
+ \item A new, custom, minimal API
+ \vspace{1em}
+ \item<2-> Exposes the partitoning mechanism of Garage\\
+ K2V = partition key / sort key / value (like Dynamo)
+ \vspace{1em}
+ \item<3-> Coordination-free, CRDT-friendly (inspired by Riak)\\
+ \vspace{1em}
+ \item<4-> Cryptography-friendly: values are binary blobs
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Application: an e-mail storage server}
+ \begin{center}
+ \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme.png}}%
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Aerogramme data model}
+ \begin{center}
+ \only<1>{\includegraphics[width=.4\linewidth]{assets/aerogramme_datatype.drawio.pdf}}%
+ \only<2->{\includegraphics[width=.9\linewidth]{assets/aerogramme_keys.drawio.pdf}\vspace{1em}}%
+ \end{center}
+ \visible<3->{Aerogramme encrypts all stored values for privacy\\
+ (Garage server administrators can't read your mail)}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Different deployment scenarios}
+ \begin{center}
+ \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components1.drawio.pdf}}%
+ \only<2>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components2.drawio.pdf}}%
+ \end{center}
+\end{frame}
+
+\begin{frame}
+ \frametitle{A new model for building resilient software}
+ \begin{itemize}
+ \item Design a data model suited to K2V\\
+ {\footnotesize (see Cassandra docs on porting SQL data models to Cassandra)}
+ \vspace{1em}
+ \begin{itemize}
+ \item Use CRDTs or other eventually consistent data types (see e.g. Bayou)
+ \vspace{1em}
+ \item Store opaque binary blobs to provide End-to-End Encryption\\
+ \end{itemize}
+ \vspace{1em}
+ \item Store big blobs (files) in S3
+ \vspace{1em}
+ \item Let Garage manage sharding, replication, failover, etc.
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Research perspectives}
+ \begin{itemize}
+ \item Write about Garage's global architecture \emph{(paper in progress)}
+ \vspace{1em}
+ \item Measure and improve Garage's performances
+ \vspace{1em}
+ \item Discuss the optimal layout algorithm, provide proofs
+ \vspace{1em}
+ \item Write about our proposed architecture for (E2EE) apps over K2V+S3
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Where to find us}
+ \begin{center}
+ \includegraphics[width=.25\linewidth]{../../logo/garage_hires.png}\\
+ \vspace{-1em}
+ \url{https://garagehq.deuxfleurs.fr/}\\
+ \url{mailto:garagehq@deuxfleurs.fr}\\
+ \texttt{\#garage:deuxfleurs.fr} on Matrix
+
+ \vspace{1.5em}
+ \includegraphics[width=.06\linewidth]{assets/rust_logo.png}
+ \includegraphics[width=.13\linewidth]{assets/AGPLv3_Logo.png}
+ \end{center}
+\end{frame}
+
+\end{document}
+
+%% vim: set ts=4 sw=4 tw=0 noet spelllang=en :