% doc/talks/2021-04-28_spirals-team/sota.tex
\section{State of the art}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{The CAP theorem}{Consistency vs. Availability}

\begin{block}{Eric Brewer's theorem}
``A shared-state system can have \textbf{at most two} of the following properties at any given time:

\begin{itemize}
	\item \textbf{C}onsistency
	\item \textbf{A}vailability
	\item \textbf{P}artition tolerance''
\end{itemize}
\end{block}


\begin{center}
\Large 
Under network partitions, a distributed data store has to sacrifice either availability or consistency.
\end{center}
\vfill

\begin{itemize}
	\item \textbf{Consistency-first}: Abort incoming queries;
	\item \textbf{Availability-first}: Return possibly stale data.
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Consistency-first: the ACID model}{Consistency vs. Availability}

\textbf{Transaction}: unit of work within an ACID data store. 
%Comprises multiple operations.
%E.g. bank transfer.
%E.g. a bank transfer from A to B is a transaction involving two operations: withdraw money from A & credit B with the same money amount. 
\vfill

\begin{itemize}
	\item \textbf{\underline{A}tomicity}: Transactions either complete entirely or fail.

	No transaction ever seen as in-progress.

	\item \textbf{\underline{C}onsistency}: Transactions always generate a valid state.

	The database maintains its invariants across transactions.

	\item \textbf{\underline{I}solation}: Concurrent transactions are seen as sequential.

	Transactions are serializable, or sequentially consistent.

	\item \textbf{\underline{D}urability}: Committed transactions are never forgotten.
\end{itemize}
\vfill\centering

Reads are fast, writes are slow.

\vfill\raggedright

Example: relational databases.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Concurrent writes in ACID}{Consistency vs. Availability}


\begin{columns}
\column{.5\columnwidth}
	\begin{block}{}
		\begin{lstlisting}
transaction AcqDoses(y):
  x <- SELECT #vaccines;
  UPDATE #vaccines = (x + y);
		\end{lstlisting}
	\end{block}
	\vspace{5ex}

	Supports compound operations.
\column{.5\columnwidth}
\centering
\includegraphics[width=\columnwidth]{figures/conflict_acid.pdf}
\end{columns}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Availability-first: the BASE model}{Consistency vs. Availability}


Some apps prefer availability, e.g.\ Amazon's product reviews.
\vfill 

The BASE model trades Consistency \& Isolation for Availability.


%Some applications do not care about strong consistency and prefer being highly available (e.g. Amazon's product reviews).

%In order to achieve higher availability, the BASE model relaxes consistency constraints of the ACID model: "eventual consistency".
\vfill

\begin{itemize}
	\item \textbf{\underline{B}asic \underline{A}vailability}: 
	The data store strives to be available.

	\item \textbf{\underline{S}oft-state}: 
	Replicas can disagree on the valid state.

	\item \textbf{\underline{E}ventual consistency}: 
	In the absence of write queries, 
	the data store will eventually converge to a single valid state.
\end{itemize}
\vfill\centering

Writes are fast, reads are slow.

\vfill\raggedright

Examples: key-value \& object stores.

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Concurrent writes in BASE}{Consistency vs. Availability}

\begin{columns}
\column{.5\columnwidth}
	\begin{block}{Object}
		\begin{itemize}
			\item Unique key
			\item Arbitrary value 
			\item Metadata
		\end{itemize}
	\end{block}
	\vspace{5ex} 

	Conflict resolution = client's job!
	\vspace{5ex}

	No compound operations.
\column{.5\columnwidth}
	\centering
	\includegraphics[width=\columnwidth]{figures/conflict_base.pdf}
\end{columns}

% KV storage is another example, distinction is minor here

% Object = unique key, arbitrary value, metadata.

% Object storage only provides semantics to investigate causal order of queries *for individual objects*. No compound operations, no transactions.

% Much easier to distribute, and "scale-out".

% Write is fast, read is slow (gotta collect all object versions).

% \todo{vaccines example with BASE model}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Strong Eventual Consistency w/ CRDTs}{Consistency vs. Availability}

\centering\small

\fullcite{defago_conflict-free_2011}

\vfill\raggedright\normalsize

\begin{block}{Strong Eventual Consistency (SEC)}
	\begin{itemize}
		\item CRDTs specify distributed operations
		\item Conflicts will be solved according to specification
		\item Proven \& bounded eventual convergence
	\end{itemize}
\end{block}

\vfill\centering
\includegraphics[width=.5\columnwidth]{figures/crdt.pdf}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Concurrent writes with CRDTs}{Consistency vs. Availability}

\begin{columns}
\column{.5\columnwidth}
	\begin{block}{}
		\begin{lstlisting}
CRDT Counter(x0):
  history = {}
  op. incr(y):
    history U= {(UUID(), y)}
  op. decr(y):
    history U= {(UUID(), -y)}
  op. read():
    x = x0
    for (_, y) in history:
      x += y
    return x
		\end{lstlisting}
	\end{block}
	\vspace{2ex}

	Operations commute?

	$\implies$ screw total order!
\column{.5\columnwidth}
	\centering
	\includegraphics[width=\columnwidth]{figures/conflict_crdt.pdf}
\end{columns}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{A complex CRDT: the DAG}{Consistency vs. Availability}

\centering
\only<1>{\includegraphics[height=\textheight]{figures/dag_crdt.png}}%
\only<2>{
	Just to say I swept a lot under the rug.
	\vfill

	For details, go read:

	\fullcite{defago_conflict-free_2011}
	\vfill

	For an implementation, check \textbf{AntidoteDB}.
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{State of the practice}{Path dependency to the ``cloud''}

\begin{block}{The BASE model is fashionable because}
\centering 

``\emph{High-performance} object storage for \emph{AI analytics} with PBs of \emph{IoT data streams} at the \emph{edge}, using \emph{5G}.''
	% \begin{itemize}
	% 	\item Highest performance 
	% 	\item IoT data streams are inherently distributed
	% \end{itemize}
\end{block}

\vfill\centering

\includegraphics[width=.9\columnwidth]{figures/minio_edge.png}

\vfill\raggedright


%\begin{block}{}
\begin{itemize}
	\item Always backed by the cloud: high-performance network links.
	\item Edge nodes always seen as clients or data sources, not peers.
\end{itemize}
%\end{block}

% There is \textbf{always a central cloud cluster} in these use-cases.

% Hidden constraint: \textbf{high performance inter-node connectivity}.



\end{frame}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{frame}{A brief history of storage}

% We keep it short because we'll follow chronological order in the next section too.

% \end{frame}


% \begin{frame}{In the beginning, there were \emph{monoliths}}

% \includegraphics[width=.5\columnwidth]{figures/stonehenge.jpg}

% Web applications used to be monolithic:

% \begin{itemize}
% 	\item One or two servers;
% 	\item Availability was not an obsession;
% 	\item Latency was acceptable.
% \end{itemize}

% Relational databases were queens.

% \end{frame}


% \begin{frame}{Then came \emph{expectations}}
% Then, the whole world went online, and suddenly: expectations!

% \begin{itemize}
% 	\item ``Milliseconds matter.'' (Algolia slogan)
% 	\item Critical networked services (healthcare, logistics) need 100\% availability 
% \end{itemize}

% $\implies$ Microservices \& horizontal scalability.

% \todo{Develop on the `herd not sheep' paradigm a bit.}

% \end{frame}


% \begin{frame}{Distributing state/storage: the remaining unknown}

% The microservices orchestration game works well for \emph{stateless} services.

% However, any application requires \emph{state}, persistent data. 

% And this is tough. As we will now see.

% (Not that it's not well studied: distributed storage has always been fashionable.)
	
% \end{frame}